scripts/encoding.py: Stage 1 dual-implementation consolidation (Track 1 Finding 11)

Consolidates four extract paths and two extract-chunk-embed-write pipelines into a single shared encoding module. Fixes the embedder lifecycle divergence between watcher and /api/reindex (no more 200MB reload per reindex click) and unifies failure tracking so /api/reindex failures now surface in SettingsPanel "Ingest Health". New files: - scripts/encoding.py — extract_text, chunk_text, chunk_and_embed, write_embeddings_batch - scripts/failures.py — record_ingest_failure, resolve_ingest_failure (shared by watcher.py and ingest.py) Refactored: - scripts/watcher.py — drops local extract/chunk/embed implementations and CHUNK_SIZE/CHUNK_OVERLAP/SUPPORTED constants; imports from encoding and failures. Now writes ingest_failures row on empty-text-extract (was silent return 0). - scripts/ingest.py — substantial rewrite. Exposes ingest_directory(folder, embedder=None) for in-process invocation; CLI back-compat preserved via ingest_folder wrapper. Module-level SentenceTransformer load removed. - scripts/corpus_integrity.py — imports extract_text from encoding; extract_text_for_retry function removed. - scripts/api.py — /api/reindex rewritten with BackgroundTasks (uses module-level embedder; no subprocess); new /api/reindex/status endpoint reading ~/aaronai/reindex_status.json; /api/corpus/retry imports extract_text from encoding; INGEST_SCRIPT constant removed (dead after this refactor); 409 reentrance guard prevents double-click stomping. Behavior changes: - /api/reindex no longer subprocess.Popens; runs in FastAPI BackgroundTasks threadpool, doesn't block API thread. - /api/reindex no longer reloads SentenceTransformer on each click. - /api/reindex failures newly write to ingest_failures (visible in SettingsPanel "Ingest Health" — badge will jump on first reindex). - New embeddings rows always have created_at = NOW() (canonical, server-side). - New embeddings rows always include metadata.folder field (None when not derivable). - /api/reindex returns 409 on second click while a job is running. - New /api/reindex/status endpoint for polling. Existing 9,815 NULL created_at rows remain unchanged; backfill is a separate decision if desired. 199 insertions, 256 deletions across 6 files (codebase shrinks net). Found by Track 1 inventory 2026-05-02 (Finding 11 / cross-cutting F11). Pre-commit verification: BackgroundTasks already imported, sys.path resolves correctly via script-path semantics, static import clean.
2026-05-03 01:40:47 +00:00
parent a317df66f8
commit 1101bef226
6 changed files with 357 additions and 264 deletions
@@ -1,70 +1,37 @@
+"""
+Aaron AI bulk ingester. Two entry points:
+  - ingest_directory(folder, embedder=None) — programmatic; called from
+    api.py /api/reindex with the api process's shared embedder
+  - python3 scripts/ingest.py <folder> — CLI back-compat; loads its own embedder
+
+Stage 1 helpers (extract / chunk / embed / write) live in scripts/encoding.py.
+Failure tracking SQL lives in scripts/failures.py.
+"""
+
 import os
 import sys
-import hashlib
 from pathlib import Path
 from dotenv import load_dotenv
 import psycopg2
-import psycopg2.extras
-import json
 from sentence_transformers import SentenceTransformer
-from docx import Document
-from pypdf import PdfReader
-from pptx import Presentation
+
+from encoding import extract_text, chunk_and_embed, write_embeddings_batch, SUPPORTED
+from failures import (
+    record_ingest_failure as _record_failure_sql,
+    resolve_ingest_failure as _resolve_failure_sql,
+)

 load_dotenv(Path.home() / "aaronai" / ".env", override=True)

-print("Loading embedding model...")
-embedder = SentenceTransformer("all-MiniLM-L6-v2")
-
 PG_DSN = os.getenv("PG_DSN")

+
 def get_pg():
    return psycopg2.connect(PG_DSN)

-def extract_text_from_docx(path):
-    doc = Document(path)
-    return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-
-def extract_text_from_pdf(path):
-    reader = PdfReader(path)
-    text = ""
-    for page in reader.pages:
-        extracted = page.extract_text()
-        if extracted:
-            text += extracted + "\n"
-    return text
-
-def extract_text_from_pptx(path):
-    prs = Presentation(path)
-    text = ""
-    for slide in prs.slides:
-        for shape in slide.shapes:
-            if hasattr(shape, "text") and shape.text.strip():
-                text += shape.text + "\n"
-    return text
-
-def extract_text_from_txt(path):
-    with open(path, "r", encoding="utf-8", errors="ignore") as f:
-        return f.read()
-
-def chunk_text(text, chunk_size=500, overlap=50):
-    words = text.split()
-    chunks = []
-    start = 0
-    while start < len(words):
-        end = start + chunk_size
-        chunk = " ".join(words[start:end])
-        if chunk.strip():
-            chunks.append(chunk)
-        start += chunk_size - overlap
-    return chunks
-
-def make_id(filepath, chunk_index):
-    path_hash = hashlib.md5(str(filepath).encode()).hexdigest()[:8]
-    return f"{path_hash}_{chunk_index}"

 def enqueue_stage2(source, full_text):
-    """Enqueue document for Stage 2 (Mistral orientation) → Stage 3 (Graphiti ingest).
+    """Enqueue document for Stage 2 (Mistral orientation) -> Stage 3 (Graphiti ingest).
    TEMPORARY: this queue feed will be removed when pgvector is decommissioned
    and the watcher calls Stage 2 directly.
    """
@@ -87,94 +54,108 @@ def enqueue_stage2(source, full_text):
    except Exception as e:
        print(f"  Stage 2 queue insert failed (non-fatal): {e}")

-def ingest_file(filepath):
-    path = Path(filepath)
-    suffix = path.suffix.lower()
-
-    if path.name.startswith("~$") or path.name.startswith("."):
-        return 0

+def _record_failure(filepath: Path, error: str) -> None:
    try:
-        if suffix == ".docx":
-            text = extract_text_from_docx(path)
-        elif suffix == ".pdf":
-            text = extract_text_from_pdf(path)
-        elif suffix == ".pptx":
-            text = extract_text_from_pptx(path)
-        elif suffix in [".txt", ".md"]:
-            text = extract_text_from_txt(path)
-        else:
-            return 0
-
-        if not text.strip():
-            return 0
-
-        chunks = chunk_text(text)
-        if not chunks:
-            return 0
-
-        embeddings = embedder.encode(chunks).tolist()
-        ids = [make_id(path, i) for i in range(len(chunks))]
-        metadatas = [{
-            "source": path.name,
-            "filepath": str(path),
-            "folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent))
-        } for _ in chunks]
-
-        # STAGE 1: Write to pgvector (TEMPORARY — remove when chat agent migrates to Graphiti)
        pg = get_pg()
-        cur = pg.cursor()
-        for chunk_id, chunk, embedding, meta in zip(ids, chunks, embeddings, metadatas):
-            cur.execute("""
-                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
-                VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
-                ON CONFLICT (id) DO UPDATE SET
-                    document = EXCLUDED.document,
-                    embedding = EXCLUDED.embedding,
-                    source = EXCLUDED.source,
-                    metadata = EXCLUDED.metadata
-            """, (
-                chunk_id, chunk, embedding,
-                meta.get("source"), "document", None,
-                json.dumps(meta)
-            ))
-        pg.commit()
-        pg.close()
-        print(f"  Indexed {len(chunks)} chunks: {path.name}")
-
-        # Enqueue for Stage 2 → Stage 3 (Graphiti pipeline)
-        # SKIP_STAGE2_ENQUEUE env var set by migration scripts to prevent bulk enqueue
-        if not os.getenv("SKIP_STAGE2_ENQUEUE"):
-            enqueue_stage2(path.name, text)
-
-        return len(chunks)
-
+        try:
+            _record_failure_sql(pg, filepath.name, filepath, error)
+        finally:
+            pg.close()
    except Exception as e:
-        print(f"  Error: {path.name}: {e}")
+        print(f"  Could not record ingest failure (non-fatal): {e}")
+
+
+def _resolve_failure(source: str) -> None:
+    try:
+        pg = get_pg()
+        try:
+            _resolve_failure_sql(pg, source)
+        finally:
+            pg.close()
+    except Exception as e:
+        print(f"  Could not resolve ingest failure record (non-fatal): {e}")
+
+
+def _ingest_one(filepath: Path, embedder, root: Path = None) -> int:
+    """Ingest a single file. Returns chunk count, 0 on skip/failure."""
+    if filepath.name.startswith(("~$", ".")):
        return 0
+    if filepath.suffix.lower() not in SUPPORTED:
+        return 0
+    text = extract_text(filepath)
+    if not text.strip():
+        _record_failure(filepath, "Text extraction failed or empty")
+        return 0
+    folder_rel = None
+    if root is not None:
+        try:
+            folder_rel = str(filepath.parent.relative_to(root))
+        except ValueError:
+            pass
+    try:
+        rows = chunk_and_embed(text, filepath.name, embedder,
+                               filepath=filepath, folder=folder_rel)
+    except Exception as e:
+        _record_failure(filepath, f"Embedding failed: {e}")
+        return 0
+    if not rows:
+        return 0
+    try:
+        pg = get_pg()
+        try:
+            write_embeddings_batch(pg, rows)
+        finally:
+            pg.close()
+    except Exception as e:
+        _record_failure(filepath, f"pgvector write failed: {e}")
+        return 0
+    print(f"  Indexed {len(rows)} chunks: {filepath.name}")
+    _resolve_failure(filepath.name)
+    if not os.getenv("SKIP_STAGE2_ENQUEUE"):
+        enqueue_stage2(filepath.name, text)
+    return len(rows)
+
+
+def ingest_directory(folder, embedder=None) -> dict:
+    """Programmatic entry point. Returns {scanned, ingested, failed, total_chunks}.
+
+    If embedder is None, loads its own SentenceTransformer (CLI back-compat path).
+    Caller (e.g. api.py /api/reindex) should pass its module-level embedder so
+    the ~200MB model isn't reloaded per call.
+    """
+    folder = Path(folder)
+    if not folder.exists():
+        return {"scanned": 0, "ingested": 0, "failed": 0, "total_chunks": 0,
+                "error": f"folder not found: {folder}"}
+
+    if embedder is None:
+        print("Loading embedding model...")
+        embedder = SentenceTransformer("all-MiniLM-L6-v2")
+
+    files = [f for f in folder.rglob("*")
+             if f.suffix.lower() in SUPPORTED
+             and not f.name.startswith(("~$", "."))]
+    print(f"Found {len(files)} files to process")
+
+    ingested = failed = total_chunks = 0
+    for f in files:
+        n = _ingest_one(f, embedder, root=folder)
+        if n > 0:
+            ingested += 1
+            total_chunks += n
+        else:
+            failed += 1
+    return {"scanned": len(files), "ingested": ingested, "failed": failed,
+            "total_chunks": total_chunks}
+

 def ingest_folder(folder_path):
-    folder = Path(folder_path)
-    if not folder.exists():
-        print(f"Folder not found: {folder_path}")
-        sys.exit(1)
+    """CLI back-compat wrapper. Loads its own embedder."""
+    result = ingest_directory(Path(folder_path))
+    print(f"\nDone. {result['ingested']} files / {result['total_chunks']} chunks indexed; "
+          f"{result['failed']} failed.")

-    supported = [".docx", ".pdf", ".pptx", ".txt", ".md"]
-    files = [f for f in folder.rglob("*")
-             if f.suffix.lower() in supported
-             and not f.name.startswith("~$")
-             and not f.name.startswith(".")]
-
-    if not files:
-        print("No supported files found.")
-        sys.exit(1)
-
-    print(f"Found {len(files)} files to process\n")
-    total_chunks = 0
-    for f in files:
-        total_chunks += ingest_file(f)
-
-    print(f"\nDone. Total chunks indexed: {total_chunks}")

 if __name__ == "__main__":
    target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs")