Code review fixes: CV pinning, F1 (excluded_sources), F14 (50KB truncation), F37

- api.py: strip the CV pinning workaround (parity violation; see the architecture doc).
- dream.py: F1 — retrieve_graphiti() now accepts excluded_sources, over-fetches 3x, and filters in-process. It was silently dropping the parameter, which would have confounded E3 with broken cross-stage exclusion in the Graphiti arm.
- watcher.py + ingest.py: F14 — drop the full_text[:50000] truncation, which was propagating through the entire cascade. Postgres TEXT can hold up to 1GB.
- corpus_integrity.py: F37 — same truncation; third path now clean.

Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*, corpus_integrity.py.bak.* (timestamped, pre-fix).

Re-cascaded Shop Class as Soulcraft (the only already-cascaded source affected by F14, 414KB).
2026-05-01 02:26:37 +00:00
parent 25e42c0231
commit 465f2f725b
17 changed files with 4432 additions and 58 deletions
@@ -0,0 +1,182 @@
+import os
+import sys
+import hashlib
+from pathlib import Path
+from dotenv import load_dotenv
+import psycopg2
+import psycopg2.extras
+import json
+from sentence_transformers import SentenceTransformer
+from docx import Document
+from pypdf import PdfReader
+from pptx import Presentation
+
# Pull settings from the project .env file; override=True lets values from the
# file replace variables already present in the process environment.
load_dotenv(Path.home() / "aaronai" / ".env", override=True)

# Connection string handed to psycopg2 by get_pg(); None when PG_DSN is unset.
PG_DSN = os.environ.get("PG_DSN")

# The sentence-embedding model is loaded once, at import time, and reused by
# every ingest call in this module.
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
+
def get_pg():
    """Open and return a new psycopg2 connection using the PG_DSN setting.

    Every call creates a fresh connection; committing and closing it is the
    caller's job.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection
+
def extract_text_from_docx(path):
    """Return the text of a .docx file, one non-blank paragraph per line.

    Only ``doc.paragraphs`` is read. Paragraphs whose text is empty or
    whitespace-only are left out; the rest are joined with newlines.
    """
    kept = []
    for paragraph in Document(path).paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)
+
def extract_text_from_pdf(path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Pages for which ``extract_text()`` returns nothing (``None`` or ``""``)
    contribute nothing to the result.
    """
    page_texts = (page.extract_text() for page in PdfReader(path).pages)
    return "".join(page_text + "\n" for page_text in page_texts if page_text)
+
def extract_text_from_pptx(path):
    """Return the text of all shapes on all slides, one shape per line.

    Shapes without a ``text`` attribute, and shapes whose text is empty or
    whitespace-only, are skipped. Each kept shape's text is followed by a
    newline.
    """
    pieces = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            shape_text = shape.text
            if shape_text.strip():
                pieces.append(shape_text + "\n")
    return "".join(pieces)
+
def extract_text_from_txt(path):
    """Return the full contents of a plain-text file decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``) rather
    than raising a decode error.
    """
    return Path(path).read_text(encoding="utf-8", errors="ignore")
+
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    The text is split on whitespace; each chunk holds up to ``chunk_size``
    words re-joined with single spaces. Every chunk after the first starts
    ``chunk_size - overlap`` words after the previous one, so neighbouring
    chunks share ``overlap`` words. The final chunk may be shorter than
    ``chunk_size`` and may consist only of words already present at the end
    of the previous chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, empty when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap of ``chunk_size`` or more would make the window stop
            advancing (an endless loop), and a negative overlap would
            silently skip words between chunks.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    # Every window starting before len(words) contains at least one word, so
    # no chunk can be empty and no emptiness filter is needed.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
+
def make_id(filepath, chunk_index):
    """Build the row id for one chunk of a file.

    The id is the first 8 hex characters of the MD5 digest of ``str(filepath)``
    followed by an underscore and ``chunk_index``, so the same path and index
    always produce the same id.
    """
    encoded_path = str(filepath).encode()
    digest_prefix = hashlib.md5(encoded_path).hexdigest()[:8]
    return f"{digest_prefix}_{chunk_index}"
+
def enqueue_stage2(source, full_text):
    """Enqueue document for Stage 2 (Mistral orientation) → Stage 3 (Graphiti ingest).

    Upserts one row into ``stage_2_queue`` keyed by ``source``. Re-enqueuing a
    source that is already present replaces its text and resets its queue
    state (``enqueued_at``, ``completed_at``, ``failed_at``, ``attempts``).

    The complete ``full_text`` is stored. An earlier version stored only the
    first 50,000 characters while still recording the full length in
    ``char_length``, so later stages silently worked on a truncated document.

    TEMPORARY: this queue feed will be removed when pgvector is decommissioned
    and the watcher calls Stage 2 directly.

    Args:
        source: Identifier of the document (the queue's conflict key).
        full_text: The document's entire extracted text.

    Returns:
        None. Failures are deliberately non-fatal: any exception is printed
        and swallowed so a queue problem never aborts the calling ingest.
    """
    try:
        pg = get_pg()
        try:
            with pg.cursor() as cur:
                cur.execute("""
                    INSERT INTO stage_2_queue (source, full_text, char_length)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (source) DO UPDATE SET
                        full_text = EXCLUDED.full_text,
                        char_length = EXCLUDED.char_length,
                        enqueued_at = NOW(),
                        completed_at = NULL,
                        failed_at = NULL,
                        attempts = 0
                """, (source, full_text, len(full_text)))
            pg.commit()
        finally:
            # Release the connection on every path; before, a failed execute
            # or commit skipped close() and leaked the connection.
            pg.close()
    except Exception as e:
        print(f"  Stage 2 queue insert failed (non-fatal): {e}")
+
def ingest_file(filepath):
    """Extract, chunk, embed and store one document.

    Steps: extract text according to the file suffix, split it with
    ``chunk_text``, embed the chunks, upsert them into the ``embeddings``
    table, and (unless the ``SKIP_STAGE2_ENQUEUE`` environment variable is
    set) enqueue the full text for Stage 2.

    Args:
        filepath: Path (str or ``Path``) of the file to ingest.

    Returns:
        The number of chunks indexed. 0 is returned when the file is skipped
        (name starts with "~$" or ".", unsupported suffix, no text) and also
        when any step fails: errors are printed, not raised, so one bad file
        does not stop a batch run.
    """
    path = Path(filepath)

    # Names starting with "~$" (the prefix Office uses for lock files) or "."
    # are never ingested.
    if path.name.startswith(("~$", ".")):
        return 0

    try:
        text = _extract_text(path)
        if text is None or not text.strip():
            return 0

        chunks = chunk_text(text)
        if not chunks:
            return 0

        embeddings = embedder.encode(chunks).tolist()
        ids = [make_id(path, i) for i in range(len(chunks))]
        # Every chunk of a file carries the same metadata, so build it once.
        metadata = {
            "source": path.name,
            "filepath": str(path),
            "folder": _relative_folder(path),
        }

        # STAGE 1: Write to pgvector (TEMPORARY — remove when chat agent migrates to Graphiti)
        _store_chunks(ids, chunks, embeddings, metadata)
        print(f"  Indexed {len(chunks)} chunks: {path.name}")

        # Enqueue for Stage 2 → Stage 3 (Graphiti pipeline)
        # SKIP_STAGE2_ENQUEUE env var set by migration scripts to prevent bulk enqueue
        if not os.getenv("SKIP_STAGE2_ENQUEUE"):
            enqueue_stage2(path.name, text)

        return len(chunks)

    except Exception as e:
        print(f"  Error: {path.name}: {e}")
        return 0


def _extract_text(path):
    """Return the text of *path* based on its suffix, or None if unsupported."""
    suffix = path.suffix.lower()
    if suffix == ".docx":
        return extract_text_from_docx(path)
    if suffix == ".pdf":
        return extract_text_from_pdf(path)
    if suffix == ".pptx":
        return extract_text_from_pptx(path)
    if suffix in (".txt", ".md"):
        return extract_text_from_txt(path)
    return None


def _relative_folder(path):
    """Return the file's folder relative to the CLI root, or "." if unknown.

    The root is ``sys.argv[1]`` when present. If there is no such argument, or
    the file does not live under it (e.g. this function is called from a
    process started with unrelated arguments), "." is returned instead of
    letting ``relative_to`` raise ValueError and abort the ingest.
    """
    if len(sys.argv) > 1:
        try:
            return str(path.parent.relative_to(Path(sys.argv[1])))
        except ValueError:
            pass
    return "."


def _store_chunks(ids, chunks, embeddings, metadata):
    """Upsert one ``embeddings`` row per chunk in a single transaction.

    NOTE(review): rows are upserted by id only; if a file yields fewer chunks
    on re-ingest, rows with higher chunk indexes from the earlier run are not
    removed here.
    """
    metadata_json = json.dumps(metadata)
    pg = get_pg()
    try:
        with pg.cursor() as cur:
            for chunk_id, chunk, embedding in zip(ids, chunks, embeddings):
                # created_at is sent as NULL, as before.
                # NOTE(review): confirm the column is meant to stay NULL
                # rather than take a database default.
                cur.execute("""
                    INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
                    VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
                    ON CONFLICT (id) DO UPDATE SET
                        document = EXCLUDED.document,
                        embedding = EXCLUDED.embedding,
                        source = EXCLUDED.source,
                        metadata = EXCLUDED.metadata
                """, (
                    chunk_id, chunk, embedding,
                    metadata["source"], "document", None,
                    metadata_json,
                ))
        pg.commit()
    finally:
        # Close on success and on failure; a failed insert used to leave the
        # connection open.
        pg.close()
+
def ingest_folder(folder_path):
    """Recursively ingest every supported document under *folder_path*.

    Supported suffixes (case-insensitive) are .docx, .pdf, .pptx, .txt and
    .md. Entries whose name starts with "~$" or "." are skipped, as are
    directories that merely have a supported suffix in their name. Each
    remaining file is passed to ``ingest_file`` and the total chunk count is
    printed at the end.

    Args:
        folder_path: Folder (str or ``Path``) to scan.

    Raises:
        SystemExit: With code 1 when the folder does not exist or contains no
            supported files.
    """
    folder = Path(folder_path)
    if not folder.exists():
        print(f"Folder not found: {folder_path}")
        sys.exit(1)

    supported = {".docx", ".pdf", ".pptx", ".txt", ".md"}
    files = [
        f for f in folder.rglob("*")
        if f.suffix.lower() in supported
        and not f.name.startswith(("~$", "."))
        # rglob also yields directories; one named e.g. "notes.md" must not be
        # counted or handed to the extractors.
        and f.is_file()
    ]

    if not files:
        print("No supported files found.")
        sys.exit(1)

    print(f"Found {len(files)} files to process\n")
    total_chunks = sum(ingest_file(f) for f in files)

    print(f"\nDone. Total chunks indexed: {total_chunks}")
+
if __name__ == "__main__":
    # Ingest the folder named by the first command-line argument, falling back
    # to the default docs folder under the user's home directory.
    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else str(Path.home() / "aaronai" / "docs")
    print(f"Ingesting from: {target}\n")
    ingest_folder(target)