Update ingest_claude.py — handle .json format, ingest memories, handle both array and jsonl
ingest_claude.py | +73 -35
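This change replaces the .jsonl-only ingest_file() with ingest_conversations(), which detects whether a Claude export is a single JSON array or JSON Lines, adds ingest_memories() to index the memory export as one consolidated chunk, and reworks the driver to glob for conversations.* and memories.* instead of *.jsonl files.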
@@ -1,7 +1,6 @@
 import json
 import sys
 from pathlib import Path
-from datetime import datetime
 from sentence_transformers import SentenceTransformer
 import chromadb
 
@@ -18,13 +17,11 @@ collection = client.get_or_create_collection(
 )
 
 def extract_messages(convo):
-    """Extract messages from a Claude conversation object."""
     messages = []
     for msg in convo.get("chat_messages", []):
         role = msg.get("sender", "")
         if role not in ["human", "assistant"]:
             continue
-        # Claude export stores content as a list of content blocks
         content = msg.get("content", [])
         text = ""
         if isinstance(content, str):
@@ -38,12 +35,10 @@ def extract_messages(convo):
         text = text.strip()
         if not text:
             continue
-        created_at = msg.get("created_at", "")
-        messages.append((created_at, role, text))
+        messages.append((msg.get("created_at", ""), role, text))
     return messages
 
 def chunk_conversation(convo):
-    """Turn a conversation into indexable chunks."""
     chunks = []
     title = convo.get("name", "Untitled conversation")
     uuid = convo.get("uuid", "")
@@ -52,7 +47,6 @@ def chunk_conversation(convo):
     if not messages:
         return chunks
 
-    # Chunk into sliding windows of 3 messages
     window = []
     for i, (ts, role, text) in enumerate(messages):
         label = "You" if role == "human" else "Claude"
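A note on the windowing above: the loop fills a 3-message window and, via the `window = window[-1:]` line in the next hunk, carries the last message over into the following chunk. A minimal standalone sketch of that behavior (the helper name and the trailing-flush rule are assumptions, not part of the commit):

```python
def windows(messages, size=3, overlap=1):
    out, window = [], []
    for msg in messages:
        window.append(msg)
        if len(window) == size:
            out.append(list(window))
            window = window[-overlap:]  # keep the last message as overlap
    if len(window) > overlap:  # flush a trailing partial window (assumed)
        out.append(list(window))
    return out

print(windows(["m1", "m2", "m3", "m4", "m5"]))
# [['m1', 'm2', 'm3'], ['m3', 'm4', 'm5']]
```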
@@ -65,25 +59,28 @@ def chunk_conversation(convo):
                 "type": "claude_conversation",
                 "created_at": created_at,
             }))
-            window = window[-1:]  # overlap by 1
+            window = window[-1:]
 
     return chunks
 
-def ingest_file(jsonl_path):
-    print(f"Processing {jsonl_path.name}...")
+def ingest_conversations(path):
+    print(f"\nIngesting conversations from {path.name}...")
     conversations = []
-    with open(jsonl_path, encoding="utf-8") as f:
-        for line in f:
+    # Handle both .json (array) and .jsonl (one per line)
+    raw = path.read_text(encoding="utf-8").strip()
+    if raw.startswith("["):
+        conversations = json.loads(raw)
+    else:
+        for line in raw.splitlines():
             line = line.strip()
-            if not line:
-                continue
-            try:
-                conversations.append(json.loads(line))
-            except json.JSONDecodeError:
-                continue
+            if line:
+                try:
+                    conversations.append(json.loads(line))
+                except json.JSONDecodeError:
+                    continue
 
     print(f"Found {len(conversations)} conversations")
-    total_chunks = 0
+    total = 0
     skipped = 0
 
     for convo in conversations:
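For reference, a self-contained sketch of the two export shapes the new ingest_conversations() accepts; the sample data is invented, but the uuid/name/chat_messages fields match the code above:

```python
import json

array_style = '[{"uuid": "a1", "name": "First chat", "chat_messages": []}]'
jsonl_style = (
    '{"uuid": "a1", "name": "First chat", "chat_messages": []}\n'
    '{"uuid": "b2", "name": "Second chat", "chat_messages": []}'
)

for raw in (array_style, jsonl_style):
    raw = raw.strip()
    if raw.startswith("["):  # .json: a single array of conversation objects
        convos = json.loads(raw)
    else:                    # .jsonl: one conversation object per line
        convos = [json.loads(l) for l in raw.splitlines() if l.strip()]
    print(len(convos))  # prints 1, then 2
```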
@@ -96,10 +93,10 @@ def ingest_file(jsonl_path):
         texts = [c[1] for c in chunks]
         metas = [c[2] for c in chunks]
 
-        # Check existing
         existing = collection.get(ids=ids)
         existing_ids = set(existing["ids"])
-        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas) if id not in existing_ids]
+        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas)
+               if id not in existing_ids]
 
         if not new:
             continue
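The skip-existing logic in this hunk leans on Chroma returning, from get(ids=...), only the ids that are actually stored. A minimal in-memory demo of the pattern (the collection name and dummy embeddings are illustrative):

```python
import chromadb

client = chromadb.Client()  # ephemeral in-memory instance
col = client.get_or_create_collection("demo")
col.add(ids=["a"], documents=["already ingested"], embeddings=[[0.0, 1.0]])

ids = ["a", "b"]
docs = ["already ingested", "fresh chunk"]
existing_ids = set(col.get(ids=ids)["ids"])  # only "a" comes back
new = [(i, d) for i, d in zip(ids, docs) if i not in existing_ids]
print(new)  # [('b', 'fresh chunk')]
```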
@@ -111,25 +108,66 @@ def ingest_file(jsonl_path):
             metadatas=[n[2] for n in new],
             embeddings=embeddings,
         )
-        total_chunks += len(new)
+        total += len(new)
 
-    print(f"Done. {total_chunks} chunks added, {skipped} conversations skipped.")
-    return total_chunks
+    print(f"Conversations: {total} chunks added, {skipped} skipped")
+    return total
 
-# Find the export file
+def ingest_memories(path):
+    print(f"\nIngesting memories from {path.name}...")
+    raw = json.loads(path.read_text(encoding="utf-8"))
+
+    # Memories are a list of memory objects
+    memories = raw if isinstance(raw, list) else raw.get("memories", [])
+    if not memories:
+        print("No memories found")
+        return 0
+
+    # Combine all memories into one chunk — they're already distilled
+    memory_text = "\n".join([
+        f"- {m.get('content', m) if isinstance(m, dict) else m}"
+        for m in memories
+    ])
+
+    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
+    chunk_id = "claude_memories_consolidated"
+
+    existing = collection.get(ids=[chunk_id])
+    if existing["ids"]:
+        # Update by deleting and re-adding
+        collection.delete(ids=[chunk_id])
+
+    embedding = embedder.encode([chunk_text]).tolist()
+    collection.add(
+        ids=[chunk_id],
+        documents=[chunk_text],
+        metadatas=[{
+            "source": "Claude: Memory",
+            "type": "claude_memory",
+        }],
+        embeddings=embedding,
+    )
+
+    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
+    return 1
+
+# Run ingestion
 export_dir = Path(EXPORT_DIR)
-export_dir.mkdir(parents=True, exist_ok=True)
-
-jsonl_files = list(export_dir.glob("*.jsonl")) + list(export_dir.glob("**/*.jsonl"))
-
-if not jsonl_files:
-    print(f"No .jsonl files found in {EXPORT_DIR}")
-    print("Place your Claude export conversations.jsonl file there and run again.")
-    sys.exit(0)
 
 total = 0
-for f in jsonl_files:
-    total += ingest_file(f)
 
-print(f"\nTotal chunks added to corpus: {total}")
-print(f"Database at: {db_path}")
+conv_files = list(export_dir.glob("conversations.*"))
+for f in conv_files:
+    total += ingest_conversations(f)
+
+mem_files = list(export_dir.glob("memories.*"))
+for f in mem_files:
+    total += ingest_memories(f)
+
+if total == 0:
+    print("\nNo files found or no new chunks to add.")
+else:
+    print(f"\nTotal chunks added to corpus: {total}")
+# Show updated corpus size
+count = collection.count()
+print(f"Corpus now contains {count} total chunks")
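Finally, a sketch of the memory-file shapes ingest_memories() tolerates: either a bare list or an object with a "memories" key, with items that are plain strings or dicts carrying a "content" field. The sample items here are invented:

```python
bare_list = ["Prefers Python", {"content": "Building a personal RAG pipeline"}]
wrapped = {"memories": bare_list}

for raw in (bare_list, wrapped):
    memories = raw if isinstance(raw, list) else raw.get("memories", [])
    memory_text = "\n".join([
        f"- {m.get('content', m) if isinstance(m, dict) else m}"
        for m in memories
    ])
    print(memory_text)
# - Prefers Python
# - Building a personal RAG pipeline   (printed once per shape)
```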