chore: archive deprecated chromadb and migration scripts
This commit is contained in:
@@ -565,6 +565,7 @@ async def get_status(auth: str = Depends(require_auth)):
|
||||
|
||||
# Watcher status
|
||||
watcher_running = False
|
||||
watcher_ingestion = {"status": "idle", "message": "", "file_count": 0}
|
||||
last_indexed = "Unknown"
|
||||
try:
|
||||
import time as _time, json as _json
|
||||
@@ -573,6 +574,7 @@ async def get_status(auth: str = Depends(require_auth)):
|
||||
_s = _json.loads(_sp.read_text())
|
||||
_age = _time.time() - _s.get("timestamp", 0)
|
||||
watcher_running = _s.get("running", False) and _age < 30
|
||||
watcher_ingestion = _s.get("ingestion", watcher_ingestion)
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -613,6 +615,7 @@ async def get_status(auth: str = Depends(require_auth)):
|
||||
return JSONResponse({
|
||||
"aaron_ai": "running",
|
||||
"watcher": "running" if watcher_running else "stopped",
|
||||
"watcher_ingestion": watcher_ingestion,
|
||||
"chunk_count": chunk_count,
|
||||
"file_count": file_count,
|
||||
"last_indexed": last_indexed,
|
||||
|
||||
-250
@@ -1,250 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
import chromadb
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import anthropic
|
||||
from datetime import datetime
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
memory_path = Path.home() / "aaronai" / "memory.md"
|
||||
db_path = str(Path.home() / "aaronai" / "db")
|
||||
|
||||
print("Loading Aaron AI...")
|
||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
chroma_client = chromadb.PersistentClient(path=db_path)
|
||||
collection = chroma_client.get_or_create_collection(
|
||||
name="aaronai",
|
||||
metadata={"hnsw:space": "cosine"}
|
||||
)
|
||||
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
|
||||
SYSTEM_PROMPT = """You are Aaron Nelson's personal AI assistant. Aaron is an Associate Professor
|
||||
of Digital Design & Fabrication and Program Director of the Hudson Valley Additive Manufacturing
|
||||
Center (HVAMC) at SUNY New Paltz. He is an expert in computational design, additive manufacturing,
|
||||
and digital fabrication with deep fluency in Rhino, Grasshopper, Stratasys FDM, PolyJet, and metal
|
||||
3D printing workflows. He runs a commercial venture called Mossygear and a consulting operation
|
||||
called FWN3D. He has a background in graffiti lettering and vector illustration.
|
||||
|
||||
You have been provided with relevant excerpts from Aaron's own documents and his persistent memory.
|
||||
Use this context to give answers grounded in his actual work and history. When helping him write
|
||||
or create, match his voice and draw on his existing materials. Be direct and specific -
|
||||
Aaron values precision over padding. Always cite which documents you drew from when relevant.
|
||||
|
||||
You have access to web search. Use it automatically when:
|
||||
- Questions require current data (salaries, job postings, prices, news)
|
||||
- Questions reference specific institutions, people, or organizations you need to verify
|
||||
- Aaron's documents and memory don't contain sufficient information to answer well
|
||||
Do not announce that you are searching. Just search and incorporate results naturally."""
|
||||
|
||||
CV_SOURCES = ["Aaron Nelson CV 2024.pdf"]
|
||||
conversation_history = []
|
||||
|
||||
TOOLS = [
|
||||
{
|
||||
"type": "web_search_20250305",
|
||||
"name": "web_search"
|
||||
}
|
||||
]
|
||||
|
||||
def load_memory():
    """Return the persistent memory file's text, or "" if it doesn't exist."""
    return memory_path.read_text(encoding="utf-8") if memory_path.exists() else ""
|
||||
|
||||
def save_memory(content):
    """Overwrite the persistent memory file with *content* (UTF-8)."""
    memory_path.write_text(content, encoding="utf-8")
|
||||
|
||||
def add_to_memory(new_item):
    """Append a dated bullet under the "## Notes" heading, creating it if absent."""
    current = load_memory()
    if "## Notes" not in current:
        current += "\n\n## Notes"
    today = datetime.now().strftime("%Y-%m-%d")
    current += f"\n- [{today}] {new_item}"
    save_memory(current)
|
||||
|
||||
def remove_from_memory(item):
    """Delete every memory line containing *item* (case-insensitive).

    Returns the number of lines removed.
    """
    needle = item.lower()
    original_lines = load_memory().split("\n")
    kept = [line for line in original_lines if needle not in line.lower()]
    save_memory("\n".join(kept))
    return len(original_lines) - len(kept)
|
||||
|
||||
def get_pinned_cv_context():
    """Fetch all chunks of the pinned CV document from the vector store.

    Returns the (documents, metadatas) pair straight from the collection.
    """
    cv_chunks = collection.get(
        where={"source": "Aaron Nelson CV 2024.pdf"},
        include=["documents", "metadatas"],
    )
    return cv_chunks["documents"], cv_chunks["metadatas"]
|
||||
|
||||
def is_professional_query(query):
    """Heuristic: does *query* mention career/CV-related vocabulary?"""
    PROFESSIONAL_TERMS = (
        "grant", "publication", "exhibition", "award", "fellowship",
        "experience", "position", "job", "career", "cv", "resume",
        "research", "work history", "accomplishment", "teaching",
        "course", "client", "consultation", "presentation", "workshop",
        "education", "degree", "institution", "service", "committee",
    )
    lowered = query.lower()
    for term in PROFESSIONAL_TERMS:
        if term in lowered:
            return True
    return False
|
||||
|
||||
def retrieve_context(query, n_results=8):
    """Gather context chunks for *query* from the vector store.

    CV chunks are pinned in first when the query sounds professional; then
    semantic-search hits above a relevance floor are appended, skipping CV
    chunks so they are never duplicated. Returns (context_pieces, sources).
    """
    query_vec = embedder.encode([query]).tolist()
    hits = collection.query(
        query_embeddings=query_vec,
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )

    pieces = []
    origins = []

    # Pin the CV whenever the query sounds career-related.
    if is_professional_query(query):
        cv_docs, cv_metas = get_pinned_cv_context()
        for cv_doc, cv_meta in zip(cv_docs, cv_metas):
            pieces.append(f"[CV] {cv_doc}")
            origins.append(cv_meta["source"])

    docs = hits["documents"][0]
    metas = hits["metadatas"][0]
    dists = hits["distances"][0]
    for doc, meta, dist in zip(docs, metas, dists):
        # Cosine distance -> similarity; drop weak matches and CV repeats.
        if (1 - dist) > 0.3 and meta["source"] not in CV_SOURCES:
            pieces.append(doc)
            origins.append(meta["source"])

    return pieces, origins
|
||||
|
||||
def handle_command(user_input):
    """Handle REPL meta-commands; return True if *user_input* was one.

    Recognized (case-insensitive): "show memory", "remember: <fact>",
    "forget: <text>", "clear". Anything else returns False so the caller
    treats it as a chat message.
    """
    text = user_input.strip()  # original case, so stored items keep their casing
    lowered = text.lower()

    if lowered == "show memory":
        memory = load_memory()
        print(f"\nAaron AI: Current memory:\n\n{memory}")
        return True

    if lowered.startswith("remember:"):
        # Slice the stripped text, not the raw input: the original sliced
        # user_input by a fixed offset, so any leading whitespace before the
        # command shifted the payload and corrupted the stored item.
        item = text[len("remember:"):].strip()
        add_to_memory(item)
        print(f"\nAaron AI: Saved to memory: '{item}'")
        return True

    if lowered.startswith("forget:"):
        item = text[len("forget:"):].strip()
        removed = remove_from_memory(item)
        if removed:
            print(f"\nAaron AI: Removed {removed} line(s) containing '{item}' from memory.")
        else:
            print(f"\nAaron AI: Nothing found in memory containing '{item}'.")
        return True

    if lowered == "clear":
        conversation_history.clear()
        print("\nAaron AI: Conversation history cleared.")
        return True

    return False
|
||||
|
||||
def chat(user_message):
    """Run one chat turn: assemble context, call Claude (looping through
    any web-search tool use), update conversation history, and return
    (assistant_message, sources).

    *sources* is the list of document names retrieve_context() drew on.
    """
    memory = load_memory()
    context_pieces, sources = retrieve_context(user_message)

    # Context preamble: persistent memory first, then retrieved excerpts
    # with their deduplicated source list.
    context_parts = []
    if memory:
        context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
    if context_pieces:
        context_str = "\n\n---\n\n".join(context_pieces)
        unique_sources = list(set(sources))
        context_parts.append(
            f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
        )

    # Context is prepended to the user text inside a single user message.
    context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
    full_message = context_block + user_message

    # Build messages for this turn (history + the new context-laden message)
    messages = conversation_history + [{"role": "user", "content": full_message}]

    # Agentic loop to handle tool use: keep calling the API until the model
    # stops asking for tools, echoing tool_result blocks back each round.
    while True:
        response = anthropic_client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            system=SYSTEM_PROMPT,
            tools=TOOLS,
            messages=messages
        )

        # Check if we need to handle tool calls
        if response.stop_reason == "tool_use":
            # Add assistant response to messages
            messages.append({"role": "assistant", "content": response.content})

            # Process each tool use block
            tool_results = []
            for block in response.content:
                if block.type == "tool_use":
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        # NOTE(review): placeholder result text — presumably the
                        # server-side web_search tool returns results itself;
                        # confirm this stub is intentional.
                        "content": "Search completed"
                    })

            # Add tool results and continue
            messages.append({"role": "user", "content": tool_results})

        else:
            # Final response - extract text
            assistant_message = ""
            for block in response.content:
                if hasattr(block, "text"):
                    assistant_message += block.text

            # Update conversation history with clean versions
            conversation_history.append({"role": "user", "content": full_message})
            conversation_history.append({"role": "assistant", "content": assistant_message})

            # Cap history at 20 entries (10 turns): drop the oldest pair.
            if len(conversation_history) > 20:
                conversation_history.pop(0)
                conversation_history.pop(0)

            return assistant_message, sources
|
||||
|
||||
def main():
    """Interactive REPL: meta-commands first, otherwise a retrieval-augmented chat turn."""
    print("Aaron AI ready. Corpus, memory, and web search loaded.")
    print("Commands: 'remember: [fact]' | 'forget: [text]' | 'show memory' | 'clear' | 'quit'")
    print("=" * 60)

    while True:
        try:
            user_input = input("\nYou: ").strip()
            if not user_input:
                continue
            if user_input.strip().lower() == "quit":
                print("Goodbye.")
                break
            if handle_command(user_input):
                continue

            reply, cited = chat(user_input)
            print(f"\nAaron AI: {reply}")
            if cited:
                unique = list(set(cited))
                print(f"\n[Sources: {', '.join(unique)}]")
        except KeyboardInterrupt:
            # Ctrl-C exits cleanly rather than falling into the generic handler.
            print("\nGoodbye.")
            break
        except Exception as e:
            print(f"Error: {e}")
|
||||
@@ -145,7 +145,6 @@ def ingest_folder(folder_path):
|
||||
total_chunks += ingest_file(f)
|
||||
|
||||
print(f"\nDone. Total chunks indexed: {total_chunks}")
|
||||
print(f"Database stored at: {db_path}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs")
|
||||
|
||||
@@ -1,152 +0,0 @@
|
||||
import json
import sys
from pathlib import Path
from datetime import datetime
from sentence_transformers import SentenceTransformer
import chromadb  # FIX: was missing — chromadb.PersistentClient below raised NameError at import
import psycopg2
import psycopg2.extras
import json as json_module

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"

print("Loading embedding model...")
# Same embedding model the chat app uses, so vectors are comparable.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
|
||||
|
||||
def extract_messages(convo):
    """Extract ordered (create_time, role, text) tuples from a ChatGPT
    export conversation, keeping only non-empty user/assistant messages."""
    collected = []
    for node in convo.get("mapping", {}).values():
        msg = node.get("message")
        if not msg:
            continue

        role = msg.get("author", {}).get("role")
        if role not in ("user", "assistant"):
            continue

        # Concatenate text parts only (plain strings or typed text blocks).
        fragments = []
        for part in msg.get("content", {}).get("parts", []):
            if isinstance(part, str):
                fragments.append(part)
            elif isinstance(part, dict) and part.get("content_type") == "text":
                fragments.append(part.get("text", ""))

        text = "".join(fragments).strip()
        if not text:
            continue

        collected.append((msg.get("create_time") or 0, role, text))

    # Chronological order by timestamp.
    collected.sort(key=lambda entry: entry[0])
    return collected
|
||||
|
||||
def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping word-level text chunks.

    Args:
        title: Conversation title, embedded in a header line of every chunk.
        messages: Iterable of (timestamp, role, text) tuples; role "user"
            is labelled "Aaron", everything else "ChatGPT".
        chunk_size: Words per chunk.
        overlap: Words shared between consecutive chunks.

    Returns:
        List of chunk strings (possibly empty).

    Raises:
        ValueError: if overlap >= chunk_size — the original looped forever
            in that case because the window could never advance.
    """
    if overlap >= chunk_size:
        raise ValueError(f"overlap ({overlap}) must be < chunk_size ({chunk_size})")

    # Flatten the dialogue into one labelled transcript.
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")
    full_text = "\n".join(lines)

    # Slide a chunk_size-word window forward by (chunk_size - overlap) words.
    words = full_text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
    return chunks
|
||||
|
||||
def ingest_file(json_path):
    """Ingest one ChatGPT conversations-*.json export into the collection.

    Chunks each conversation, embeds the chunks, and upserts them with
    metadata. Conversations with fewer than two usable messages (or that
    produce no chunks) are skipped. Returns the number of chunks indexed.
    """
    print(f"\nLoading {json_path.name}...")
    # Context manager so the handle is always closed — the original used
    # json.load(open(...)) and leaked the file object.
    with open(json_path, encoding="utf-8") as fh:
        data = json.load(fh)
    print(f"Found {len(data)} conversations")

    total_chunks = 0
    skipped = 0

    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)

        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError, OSError):
            # fromtimestamp rejects None / out-of-range values; the original
            # bare `except:` also swallowed KeyboardInterrupt and real bugs.
            date_str = "unknown"

        messages = extract_messages(convo)
        if len(messages) < 2:
            skipped += 1
            continue

        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue

        # Embed and store (upsert keys on deterministic per-chunk ids).
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation"
        } for _ in chunks]

        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas
        )

        total_chunks += len(chunks)
        print(f" [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")

    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks
|
||||
|
||||
def main():
    """Ingest both numbered export files, reporting any that are missing."""
    base = Path(EXPORT_DIR)
    grand_total = 0

    for filename in ("conversations-000.json", "conversations-001.json"):
        candidate = base / filename
        if not candidate.exists():
            print(f"Not found: {candidate}")
            continue
        grand_total += ingest_file(candidate)

    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")
|
||||
@@ -1,189 +0,0 @@
|
||||
import json
import sys
from pathlib import Path
from sentence_transformers import SentenceTransformer
import chromadb  # FIX: was missing — chromadb.PersistentClient below raised NameError at import
import psycopg2
import psycopg2.extras
import json as json_module

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"

print("Loading embedding model...")
# Same embedding model the chat app uses, so vectors are comparable.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
|
||||
|
||||
def extract_messages(convo):
    """Extract (created_at, role, text) tuples from a Claude export
    conversation, keeping only non-empty human/assistant messages."""
    extracted = []
    for msg in convo.get("chat_messages", []):
        role = msg.get("sender", "")
        if role not in ("human", "assistant"):
            continue

        # Content is either a plain string or a list of blocks.
        content = msg.get("content", [])
        fragments = []
        if isinstance(content, str):
            fragments.append(content)
        elif isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    fragments.append(block.get("text", ""))
                elif isinstance(block, str):
                    fragments.append(block)

        text = "".join(fragments).strip()
        if not text:
            continue

        extracted.append((msg.get("created_at", ""), role, text))
    return extracted
|
||||
|
||||
def chunk_conversation(convo):
    """Slice a Claude conversation into rolling chunks of up to 3 messages.

    Each chunk carries the last message of the previous chunk as overlap.
    Returns a list of (chunk_id, chunk_text, metadata) tuples.
    """
    title = convo.get("name", "Untitled conversation")
    convo_uuid = convo.get("uuid", "")
    created_at = convo.get("created_at", "")

    messages = extract_messages(convo)
    if not messages:
        return []

    results = []
    buffer = []
    last_index = len(messages) - 1
    for i, (_ts, role, text) in enumerate(messages):
        speaker = "You" if role == "human" else "Claude"
        buffer.append(f"{speaker}: {text}")
        if len(buffer) >= 3 or i == last_index:
            body = f"[Claude conversation: {title}]\n\n" + "\n\n".join(buffer)
            results.append((f"claude_{convo_uuid}_{i}", body, {
                "source": f"Claude: {title}",
                "type": "claude_conversation",
                "created_at": created_at,
            }))
            buffer = buffer[-1:]  # keep one message for overlap
    return results
|
||||
|
||||
def ingest_conversations(path):
    """Ingest a Claude conversations export (.json array or .jsonl).

    Chunks each conversation, skips chunk ids already present in the
    collection, embeds the remainder, and upserts them into PostgreSQL.
    Returns the number of chunks added.

    NOTE(review): duplicates are checked against the Chroma `collection`
    but writes go through `get_pg()` to PostgreSQL — a half-migrated
    dual-store path; confirm which store is authoritative.
    """
    print(f"\nIngesting conversations from {path.name}...")
    conversations = []

    # Handle both .json (array) and .jsonl (one per line)
    raw = path.read_text(encoding="utf-8").strip()
    if raw.startswith("["):
        conversations = json.loads(raw)
    else:
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                conversations.append(json.loads(line))
            except json.JSONDecodeError:
                # Skip malformed lines only; the original bare `except:`
                # also swallowed KeyboardInterrupt and real bugs.
                continue

    print(f"Found {len(conversations)} conversations")
    total = 0
    skipped = 0

    for convo in conversations:
        chunks = chunk_conversation(convo)
        if not chunks:
            skipped += 1
            continue

        ids = [c[0] for c in chunks]
        texts = [c[1] for c in chunks]
        metas = [c[2] for c in chunks]

        # Only insert ids not already in the collection.
        existing_ids = set(collection.get(ids=ids)["ids"])
        new = [(cid, txt, meta) for cid, txt, meta in zip(ids, texts, metas)
               if cid not in existing_ids]
        if not new:
            continue

        embeddings = embedder.encode([n[1] for n in new]).tolist()
        pg = get_pg()
        cur = pg.cursor()
        for (chunk_id, chunk_text, meta), embedding in zip(new, embeddings):
            cur.execute("""
                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
                VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
                ON CONFLICT (id) DO UPDATE SET
                    document = EXCLUDED.document,
                    embedding = EXCLUDED.embedding,
                    source = EXCLUDED.source,
                    type = EXCLUDED.type,
                    created_at = EXCLUDED.created_at,
                    metadata = EXCLUDED.metadata
            """, (
                chunk_id, chunk_text, embedding,
                meta.get('source'), meta.get('type'), meta.get('created_at'),
                json_module.dumps(meta)
            ))
        pg.commit()
        pg.close()
        total += len(new)

    print(f"Conversations: {total} chunks added, {skipped} skipped")
    return total
|
||||
|
||||
def ingest_memories(path):
    """Consolidate a Claude memories export into a single corpus chunk.

    Returns 1 if the chunk was (re)written, 0 if the file held no memories.
    """
    print(f"\nIngesting memories from {path.name}...")
    parsed = json.loads(path.read_text(encoding="utf-8"))

    # The export is either a bare list or an object with a "memories" key.
    items = parsed if isinstance(parsed, list) else parsed.get("memories", [])
    if not items:
        print("No memories found")
        return 0

    # One consolidated chunk — the memories are already distilled facts.
    bullets = []
    for item in items:
        value = item.get('content', item) if isinstance(item, dict) else item
        bullets.append(f"- {value}")
    memory_text = "\n".join(bullets)

    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
    chunk_id = "claude_memories_consolidated"

    # Replace any previous consolidated chunk before re-adding.
    if collection.get(ids=[chunk_id])["ids"]:
        collection.delete(ids=[chunk_id])

    collection.upsert(
        ids=[chunk_id],
        documents=[chunk_text],
        metadatas=[{
            "source": "Claude: Memory",
            "type": "claude_memory",
        }],
        embeddings=embedder.encode([chunk_text]).tolist(),
    )

    print(f"Memories: 1 chunk added ({len(items)} memory items)")
    return 1
|
||||
|
||||
# Run ingestion: process every conversations.* and memories.* file found
# in the export directory, then report the updated corpus size.
export_dir = Path(EXPORT_DIR)
total = 0

conv_files = list(export_dir.glob("conversations.*"))
for f in conv_files:
    total += ingest_conversations(f)

mem_files = list(export_dir.glob("memories.*"))
for f in mem_files:
    total += ingest_memories(f)

if total == 0:
    print("\nNo files found or no new chunks to add.")
else:
    print(f"\nTotal chunks added to corpus: {total}")

# Show updated corpus size
count = collection.count()
print(f"Corpus now contains {count} total chunks")
|
||||
@@ -1,125 +0,0 @@
|
||||
"""
|
||||
Migration: ChromaDB → pgvector
|
||||
Re-embeds all documents from ChromaDB SQLite into PostgreSQL with pgvector.
|
||||
Keeps ChromaDB intact as backup until migration is verified.
|
||||
"""
|
||||
import sqlite3
|
||||
import psycopg2
|
||||
import json
|
||||
from pathlib import Path
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
CHROMA_SQLITE = str(Path.home() / "aaronai" / "db" / "chroma.sqlite3")
|
||||
import os
|
||||
PG_DSN = os.getenv("PG_DSN")
|
||||
if not PG_DSN:
|
||||
raise RuntimeError("PG_DSN environment variable not set")
|
||||
|
||||
print("Loading embedding model...")
|
||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
|
||||
print("Connecting to databases...")
|
||||
chroma = sqlite3.connect(CHROMA_SQLITE)
|
||||
chroma.row_factory = sqlite3.Row
|
||||
c = chroma.cursor()
|
||||
|
||||
pg = psycopg2.connect(PG_DSN)
|
||||
pg_cur = pg.cursor()
|
||||
|
||||
# Get all documents with their metadata from ChromaDB
|
||||
print("Reading documents from ChromaDB...")
|
||||
c.execute("""
|
||||
SELECT
|
||||
e.id as row_id,
|
||||
e.embedding_id,
|
||||
MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document,
|
||||
MAX(CASE WHEN em.key = 'source' THEN em.string_value END) as source,
|
||||
MAX(CASE WHEN em.key = 'type' THEN em.string_value END) as type,
|
||||
MAX(CASE WHEN em.key = 'created_at' THEN em.string_value END) as created_at
|
||||
FROM embeddings e
|
||||
LEFT JOIN embedding_metadata em ON e.id = em.id
|
||||
GROUP BY e.id, e.embedding_id
|
||||
HAVING document IS NOT NULL
|
||||
ORDER BY e.id
|
||||
""")
|
||||
|
||||
rows = c.fetchall()
|
||||
print(f"Found {len(rows)} documents to migrate")
|
||||
|
||||
# Check existing in PostgreSQL
|
||||
pg_cur.execute("SELECT id FROM embeddings")
|
||||
existing_ids = set(r[0] for r in pg_cur.fetchall())
|
||||
print(f"Already in PostgreSQL: {len(existing_ids)}")
|
||||
|
||||
# Filter to only new ones
|
||||
to_migrate = [r for r in rows if r['embedding_id'] not in existing_ids]
|
||||
print(f"Need to migrate: {len(to_migrate)}")
|
||||
|
||||
if not to_migrate:
|
||||
print("Nothing to migrate — already complete")
|
||||
chroma.close()
|
||||
pg.close()
|
||||
exit(0)
|
||||
|
||||
# Migrate in batches
|
||||
batch_size = 200
|
||||
migrated = 0
|
||||
errors = 0
|
||||
|
||||
for i in range(0, len(to_migrate), batch_size):
|
||||
batch = to_migrate[i:i+batch_size]
|
||||
|
||||
# Generate embeddings
|
||||
texts = [r['document'] for r in batch]
|
||||
try:
|
||||
embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
|
||||
except Exception as e:
|
||||
print(f"Embedding error at batch {i}: {e}")
|
||||
errors += len(batch)
|
||||
continue
|
||||
|
||||
# Insert into PostgreSQL
|
||||
for row, embedding in zip(batch, embeddings):
|
||||
try:
|
||||
pg_cur.execute("""
|
||||
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
|
||||
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
|
||||
ON CONFLICT (id) DO UPDATE SET
|
||||
document = EXCLUDED.document,
|
||||
embedding = EXCLUDED.embedding,
|
||||
source = EXCLUDED.source,
|
||||
type = EXCLUDED.type,
|
||||
created_at = EXCLUDED.created_at,
|
||||
metadata = EXCLUDED.metadata
|
||||
""", (
|
||||
row['embedding_id'],
|
||||
row['document'],
|
||||
embedding,
|
||||
row['source'],
|
||||
row['type'],
|
||||
row['created_at'],
|
||||
json.dumps({
|
||||
'source': row['source'],
|
||||
'type': row['type'],
|
||||
'created_at': row['created_at'],
|
||||
})
|
||||
))
|
||||
migrated += 1
|
||||
except Exception as e:
|
||||
print(f"Insert error for {row['embedding_id']}: {e}")
|
||||
errors += 1
|
||||
|
||||
pg.commit()
|
||||
print(f"Progress: {min(i+batch_size, len(to_migrate))}/{len(to_migrate)} ({errors} errors)")
|
||||
|
||||
# Final count
|
||||
pg_cur.execute("SELECT COUNT(*) FROM embeddings")
|
||||
final_count = pg_cur.fetchone()[0]
|
||||
|
||||
chroma.close()
|
||||
pg.close()
|
||||
|
||||
print(f"\nMigration complete:")
|
||||
print(f" Migrated: {migrated}")
|
||||
print(f" Errors: {errors}")
|
||||
print(f" PostgreSQL total: {final_count}")
|
||||
+86
-10
@@ -2,6 +2,7 @@ import time
|
||||
import subprocess
|
||||
import logging
|
||||
import json
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
@@ -25,16 +26,35 @@ logging.basicConfig(
|
||||
]
|
||||
)
|
||||
|
||||
# Shared snapshot of the current ingestion run; mutated only under
# ingestion_lock (see set_ingestion_state) and serialized into the status
# file by write_status for external readers.
ingestion_state = {
    "status": "idle",
    "message": "",
    "file_count": 0,
    "started_at": None,
    "finished_at": None,
    "last_error": "",
}
ingestion_lock = threading.Lock()
# Background worker thread handle; managed by start_ingestion_thread().
ingestion_thread = None
||||
|
||||
|
||||
def set_ingestion_state(**kwargs):
    """Merge *kwargs* into the shared ingestion_state dict under the lock."""
    with ingestion_lock:
        ingestion_state.update(kwargs)
|
||||
|
||||
|
||||
def load_state():
    """Read the persisted file→mtime map; {} when no state file exists yet."""
    if not Path(STATE_FILE).exists():
        return {}
    with open(STATE_FILE) as fh:
        return json.load(fh)
|
||||
|
||||
|
||||
def save_state(state):
    """Persist the file→mtime map as JSON."""
    with open(STATE_FILE, 'w') as fh:
        json.dump(state, fh)
|
||||
|
||||
|
||||
def get_changed_files():
|
||||
state = load_state()
|
||||
changed = []
|
||||
@@ -52,13 +72,25 @@ def get_changed_files():
|
||||
changed.append(path)
|
||||
return changed, state
|
||||
|
||||
|
||||
def run_ingestion():
|
||||
changed, state = get_changed_files()
|
||||
if not changed:
|
||||
logging.info("No new or changed files detected — skipping ingestion.")
|
||||
set_ingestion_state(status="idle", message="No changes detected", file_count=0)
|
||||
return
|
||||
|
||||
logging.info(f"Found {len(changed)} new or changed files — starting ingestion...")
|
||||
count = len(changed)
|
||||
logging.info(f"Found {count} new or changed files — starting ingestion...")
|
||||
set_ingestion_state(
|
||||
status="ingesting",
|
||||
message=f"Ingesting {count} file(s)...",
|
||||
file_count=count,
|
||||
started_at=time.time(),
|
||||
finished_at=None,
|
||||
last_error="",
|
||||
)
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH],
|
||||
@@ -67,19 +99,51 @@ def run_ingestion():
|
||||
timeout=1800
|
||||
)
|
||||
if result.returncode == 0:
|
||||
# Update state with new mtimes
|
||||
root = Path(NEXTCLOUD_PATH)
|
||||
for path in root.rglob("*"):
|
||||
if path.is_file() and path.suffix.lower() in SUPPORTED:
|
||||
state[str(path)] = str(path.stat().st_mtime)
|
||||
save_state(state)
|
||||
logging.info("Ingestion complete. State updated.")
|
||||
set_ingestion_state(
|
||||
status="idle",
|
||||
message=f"Last run: ingested {count} file(s) successfully",
|
||||
finished_at=time.time(),
|
||||
)
|
||||
else:
|
||||
logging.error(f"Ingestion error: {result.stderr}")
|
||||
set_ingestion_state(
|
||||
status="error",
|
||||
message="Ingestion failed — see log",
|
||||
last_error=result.stderr[-300:],
|
||||
finished_at=time.time(),
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
logging.error("Ingestion timed out.")
|
||||
set_ingestion_state(
|
||||
status="error",
|
||||
message="Ingestion timed out (>30 min)",
|
||||
last_error="TimeoutExpired",
|
||||
finished_at=time.time(),
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error(f"Ingestion failed: {e}")
|
||||
set_ingestion_state(
|
||||
status="error",
|
||||
message=f"Ingestion exception: {e}",
|
||||
last_error=str(e),
|
||||
finished_at=time.time(),
|
||||
)
|
||||
|
||||
|
||||
def start_ingestion_thread():
    """Launch run_ingestion on a daemon thread unless one is still active."""
    global ingestion_thread
    already_running = ingestion_thread and ingestion_thread.is_alive()
    if already_running:
        logging.info("Ingestion already running — skipping.")
        return
    worker = threading.Thread(target=run_ingestion, daemon=True)
    ingestion_thread = worker
    worker.start()
|
||||
|
||||
|
||||
class IngestHandler(FileSystemEventHandler):
|
||||
def __init__(self):
|
||||
@@ -98,9 +162,26 @@ class IngestHandler(FileSystemEventHandler):
|
||||
return
|
||||
if 'Journal/Media' in str(path):
|
||||
return
|
||||
if event.event_type not in ('modified', 'created', 'moved'):
|
||||
return
|
||||
logging.info(f"Event: {event.event_type} {event.src_path}")
|
||||
self.pending = True
|
||||
self.last_event = time.time()
|
||||
|
||||
|
||||
def write_status(handler):
    """Write a heartbeat snapshot (watcher + ingestion state) to STATUS_FILE."""
    # Build the snapshot under the lock so ingestion_state is copied atomically.
    with ingestion_lock:
        snapshot = {
            "running": True,
            "timestamp": time.time(),
            "pending": handler.pending,
            "last_event": handler.last_event,
            "ingestion": dict(ingestion_state),
        }
    with open(STATUS_FILE, 'w') as fh:
        json.dump(snapshot, fh)
|
||||
|
||||
|
||||
def main():
|
||||
logging.info("Aaron AI Watcher starting...")
|
||||
logging.info(f"Watching: {NEXTCLOUD_PATH}")
|
||||
@@ -112,23 +193,18 @@ def main():
|
||||
|
||||
try:
|
||||
while True:
|
||||
import json as _json
|
||||
_json.dump({
|
||||
"running": True,
|
||||
"timestamp": time.time(),
|
||||
"pending": handler.pending,
|
||||
"last_event": handler.last_event
|
||||
}, open(STATUS_FILE, 'w'))
|
||||
write_status(handler)
|
||||
if handler.pending:
|
||||
elapsed = time.time() - handler.last_event
|
||||
if elapsed >= DEBOUNCE_SECONDS:
|
||||
handler.pending = False
|
||||
run_ingestion()
|
||||
start_ingestion_thread()
|
||||
time.sleep(5)
|
||||
except KeyboardInterrupt:
|
||||
observer.stop()
|
||||
observer.join()
|
||||
logging.info("Watcher stopped.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user