import json
from pathlib import Path

import chromadb
import psycopg2
from sentence_transformers import SentenceTransformer
# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True},
)
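
# NOTE: get_pg() is called by ingest_conversations() below but was not defined
# anywhere in this file. The helper here is a minimal sketch, not the original:
# it assumes a local Postgres database with the pgvector extension, and the
# dbname/user values are placeholders.
def get_pg():
    return psycopg2.connect(
        dbname="aaronai",   # assumed database name
        user="aaron",       # assumed role
        host="localhost",
    )


# The INSERT in ingest_conversations() assumes an `embeddings` table roughly
# like the sketch below (not created in this file; column types are guesses,
# except the 384-dim vector, which matches all-MiniLM-L6-v2 output):
#
#   CREATE TABLE embeddings (
#       id TEXT PRIMARY KEY,
#       document TEXT,
#       embedding vector(384),
#       source TEXT,
#       type TEXT,
#       created_at TEXT,
#       metadata JSONB
#   );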

def extract_messages(convo):
    """Pull (created_at, role, text) tuples out of one exported conversation,
    keeping only non-empty human/assistant messages."""
    messages = []
    for msg in convo.get("chat_messages", []):
        role = msg.get("sender", "")
        if role not in ["human", "assistant"]:
            continue
        content = msg.get("content", [])
        text = ""
        if isinstance(content, str):
            text = content
        elif isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    text += block.get("text", "")
                elif isinstance(block, str):
                    text += block
        text = text.strip()
        if not text:
            continue
        messages.append((msg.get("created_at", ""), role, text))
    return messages


def chunk_conversation(convo):
    """Split a conversation into overlapping chunks of up to 3 messages,
    each prefixed with the conversation title for retrieval context."""
    chunks = []
    title = convo.get("name", "Untitled conversation")
    uuid = convo.get("uuid", "")
    created_at = convo.get("created_at", "")
    messages = extract_messages(convo)
    if not messages:
        return chunks

    window = []
    for i, (ts, role, text) in enumerate(messages):
        label = "You" if role == "human" else "Claude"
        window.append(f"{label}: {text}")
        if len(window) >= 3 or i == len(messages) - 1:
            chunk_text = f"[Claude conversation: {title}]\n\n" + "\n\n".join(window)
            chunk_id = f"claude_{uuid}_{i}"
            chunks.append((chunk_id, chunk_text, {
                "source": f"Claude: {title}",
                "type": "claude_conversation",
                "created_at": created_at,
            }))
            # Keep the last message as overlap for the next chunk
            window = window[-1:]
    return chunks

def ingest_conversations(path):
    """Chunk every conversation in an export file and upsert the new chunks
    (with embeddings) into the Postgres `embeddings` table."""
    print(f"\nIngesting conversations from {path.name}...")
    conversations = []

    # Handle both .json (array) and .jsonl (one object per line)
    raw = path.read_text(encoding="utf-8").strip()
    if raw.startswith("["):
        conversations = json.loads(raw)
    else:
        for line in raw.splitlines():
            line = line.strip()
            if line:
                try:
                    conversations.append(json.loads(line))
                except json.JSONDecodeError:
                    continue

    print(f"Found {len(conversations)} conversations")
    total = 0
    skipped = 0

    for convo in conversations:
        chunks = chunk_conversation(convo)
        if not chunks:
            skipped += 1
            continue

        ids = [c[0] for c in chunks]
        texts = [c[1] for c in chunks]
        metas = [c[2] for c in chunks]

        # Skip chunk IDs that are already present in the ChromaDB collection
        existing = collection.get(ids=ids)
        existing_ids = set(existing["ids"])
        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas)
               if id not in existing_ids]

        if not new:
            continue

        embeddings = embedder.encode([n[1] for n in new]).tolist()
        pg = get_pg()
        cur = pg.cursor()
        for (chunk_id, chunk_text, meta), embedding in zip(new, embeddings):
            cur.execute("""
                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
                VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
                ON CONFLICT (id) DO UPDATE SET
                    document = EXCLUDED.document,
                    embedding = EXCLUDED.embedding,
                    source = EXCLUDED.source,
                    type = EXCLUDED.type,
                    created_at = EXCLUDED.created_at,
                    metadata = EXCLUDED.metadata
            """, (
                chunk_id, chunk_text, embedding,
                meta.get('source'), meta.get('type'), meta.get('created_at'),
                json.dumps(meta)
            ))
        pg.commit()
        pg.close()
        total += len(new)

    print(f"Conversations: {total} chunks added, {skipped} skipped")
    return total

def ingest_memories(path):
    """Collapse a memories export into a single consolidated chunk and upsert
    it into the ChromaDB collection."""
    print(f"\nIngesting memories from {path.name}...")
    raw = json.loads(path.read_text(encoding="utf-8"))

    # Memories are a list of memory objects
    memories = raw if isinstance(raw, list) else raw.get("memories", [])
    if not memories:
        print("No memories found")
        return 0

    # Combine all memories into one chunk — they're already distilled
    memory_text = "\n".join([
        f"- {m.get('content', m) if isinstance(m, dict) else m}"
        for m in memories
    ])

    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
    chunk_id = "claude_memories_consolidated"

    existing = collection.get(ids=[chunk_id])
    if existing["ids"]:
        # Update by deleting and re-adding
        collection.delete(ids=[chunk_id])

    embedding = embedder.encode([chunk_text]).tolist()
    collection.upsert(
        ids=[chunk_id],
        documents=[chunk_text],
        metadatas=[{
            "source": "Claude: Memory",
            "type": "claude_memory",
        }],
        embeddings=embedding,
    )

    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
    return 1

# Run ingestion
export_dir = Path(EXPORT_DIR)
total = 0

conv_files = list(export_dir.glob("conversations.*"))
for f in conv_files:
    total += ingest_conversations(f)

mem_files = list(export_dir.glob("memories.*"))
for f in mem_files:
    total += ingest_memories(f)

if total == 0:
    print("\nNo files found or no new chunks to add.")
else:
    print(f"\nTotal chunks added to corpus: {total}")

# Show updated corpus size
count = collection.count()
print(f"Corpus now contains {count} total chunks")