scripts/encoding.py: Stage 1 dual-implementation consolidation (Track 1 Finding 11)

Consolidates four extract paths and two extract-chunk-embed-write pipelines into a single shared encoding module. Fixes the embedder lifecycle divergence between watcher and /api/reindex (no more 200MB reload per reindex click) and unifies failure tracking so /api/reindex failures now surface in SettingsPanel "Ingest Health".

New files:
- scripts/encoding.py — extract_text, chunk_text, chunk_and_embed, write_embeddings_batch
- scripts/failures.py — record_ingest_failure, resolve_ingest_failure (shared by watcher.py and ingest.py)

Refactored:
- scripts/watcher.py — drops local extract/chunk/embed implementations and CHUNK_SIZE/CHUNK_OVERLAP/SUPPORTED constants; imports from encoding and failures. Now writes an ingest_failures row on empty-text-extract (was silent return 0).
- scripts/ingest.py — substantial rewrite. Exposes ingest_directory(folder, embedder=None) for in-process invocation; CLI back-compat preserved via ingest_folder wrapper. Module-level SentenceTransformer load removed.
- scripts/corpus_integrity.py — imports extract_text from encoding; extract_text_for_retry function removed.
- scripts/api.py — /api/reindex rewritten with BackgroundTasks (uses module-level embedder; no subprocess); new /api/reindex/status endpoint reading ~/aaronai/reindex_status.json; /api/corpus/retry imports extract_text from encoding; INGEST_SCRIPT constant removed (dead after this refactor); 409 reentrancy guard prevents double-click stomping.

Behavior changes:
- /api/reindex no longer spawns a subprocess via subprocess.Popen; it runs in the FastAPI BackgroundTasks threadpool and doesn't block the API thread.
- /api/reindex no longer reloads SentenceTransformer on each click.
- /api/reindex failures newly write to ingest_failures (visible in SettingsPanel "Ingest Health" — badge will jump on first reindex).
- New embeddings rows always have created_at = NOW() (canonical, server-side).
- New embeddings rows always include metadata.folder field (None when not derivable).
- /api/reindex returns 409 on second click while a job is running.
- New /api/reindex/status endpoint for polling.

Existing 9,815 NULL created_at rows remain unchanged; backfill is a separate decision if desired.

199 insertions, 256 deletions across 6 files (codebase shrinks net).

Found by Track 1 inventory 2026-05-02 (Finding 11 / cross-cutting F11).

Pre-commit verification: BackgroundTasks already imported, sys.path resolves correctly via script-path semantics, static import clean.
2026-05-03 01:40:47 +00:00
parent a317df66f8
commit 1101bef226
6 changed files with 357 additions and 264 deletions
@@ -19,7 +19,6 @@ Architecture: Stage 1 (watcher) -> stage_2_queue -> Stage 2 (Mistral) -> stage_3
 import os
 import time
 import json
-import hashlib
 import logging
 import threading
 from pathlib import Path
@@ -30,9 +29,11 @@ from sentence_transformers import SentenceTransformer
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler

-from docx import Document as DocxDocument
-from pypdf import PdfReader
-from pptx import Presentation
+from encoding import extract_text, chunk_and_embed, write_embeddings_batch, SUPPORTED
+from failures import (
+    record_ingest_failure as _record_failure_sql,
+    resolve_ingest_failure as _resolve_failure_sql,
+)

 load_dotenv(Path.home() / "aaronai" / ".env", override=True)

@@ -42,10 +43,7 @@ STATE_FILE     = "/home/aaron/aaronai/watcher_state.json"
 STATUS_FILE    = "/home/aaron/aaronai/watcher_status.json"
 HEARTBEAT_FILE = "/home/aaron/aaronai/watcher_heartbeat"

-SUPPORTED        = {".pdf", ".docx", ".pptx", ".txt", ".md"}
 DEBOUNCE_SECONDS = 120
-CHUNK_SIZE       = 500
-CHUNK_OVERLAP    = 50
 EMBED_MODEL      = "all-MiniLM-L6-v2"

 PG_DSN = os.getenv("PG_DSN")
@@ -76,49 +74,6 @@ def get_pg():
    return psycopg2.connect(PG_DSN)


-def extract_text(path: Path) -> str:
-    suffix = path.suffix.lower()
-    try:
-        if suffix == ".docx":
-            doc = DocxDocument(path)
-            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
-        elif suffix == ".pdf":
-            reader = PdfReader(path)
-            return "".join(
-                page.extract_text() + "\n"
-                for page in reader.pages if page.extract_text()
-            )
-        elif suffix == ".pptx":
-            prs = Presentation(path)
-            return "\n".join(
-                shape.text for slide in prs.slides
-                for shape in slide.shapes
-                if hasattr(shape, "text") and shape.text.strip()
-            )
-        elif suffix in {".txt", ".md"}:
-            return path.read_text(encoding="utf-8", errors="ignore")
-    except Exception as e:
-        log.warning(f"Text extraction failed for {path.name}: {e}")
-        record_ingest_failure(path, f"Text extraction failed: {e}")
-    return ""
-
-
-def chunk_text(text: str) -> list:
-    words = text.split()
-    chunks = []
-    start = 0
-    while start < len(words):
-        chunk = " ".join(words[start:start + CHUNK_SIZE])
-        if chunk.strip():
-            chunks.append(chunk)
-        start += CHUNK_SIZE - CHUNK_OVERLAP
-    return chunks
-
-
-def make_chunk_id(filepath: Path, chunk_index: int) -> str:
-    return hashlib.md5(str(filepath).encode()).hexdigest()[:8] + f"_{chunk_index}"
-
-
 def enqueue_stage2(source: str, full_text: str):
    if os.getenv("SKIP_STAGE2_ENQUEUE"):
        return
@@ -143,21 +98,15 @@ def enqueue_stage2(source: str, full_text: str):


 def record_ingest_failure(filepath: Path, error: str):
-    """Write extraction or ingest failure to ingest_failures table for UI visibility."""
+    """Write extraction or ingest failure to ingest_failures table for UI visibility.
+    Local wrapper around failures.record_ingest_failure — opens conn, delegates,
+    logs non-fatal errors so the caller never has to handle them."""
    try:
        pg = get_pg()
-        cur = pg.cursor()
-        cur.execute("""
-            INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at)
-            VALUES (%s, %s, %s, 0, NOW(), NOW())
-            ON CONFLICT (source) DO UPDATE SET
-                error          = EXCLUDED.error,
-                retry_count    = ingest_failures.retry_count + 1,
-                last_failed_at = NOW(),
-                resolved       = FALSE
-        """, (filepath.name, str(filepath), error[:1000]))
-        pg.commit()
-        pg.close()
+        try:
+            _record_failure_sql(pg, filepath.name, filepath, error)
+        finally:
+            pg.close()
    except Exception as e:
        log.warning(f"Could not record ingest failure (non-fatal): {e}")

@@ -166,10 +115,10 @@ def resolve_ingest_failure(source: str):
    """Mark a previously failed file as resolved after successful ingest."""
    try:
        pg = get_pg()
-        cur = pg.cursor()
-        cur.execute("UPDATE ingest_failures SET resolved = TRUE WHERE source = %s", (source,))
-        pg.commit()
-        pg.close()
+        try:
+            _resolve_failure_sql(pg, source)
+        finally:
+            pg.close()
    except Exception as e:
        log.warning(f"Could not resolve ingest failure record (non-fatal): {e}")

@@ -181,42 +130,37 @@ def ingest_file(filepath: Path, embedder) -> int:
        return 0
    text = extract_text(filepath)
    if not text.strip():
+        record_ingest_failure(filepath, "Text extraction failed or empty")
        return 0
-    chunks = chunk_text(text)
-    if not chunks:
-        return 0
+    folder_rel = None
    try:
-        embeddings = embedder.encode(chunks).tolist()
+        folder_rel = str(filepath.parent.relative_to(NEXTCLOUD_PATH))
+    except ValueError:
+        pass
+    try:
+        rows = chunk_and_embed(text, filepath.name, embedder,
+                               filepath=filepath, folder=folder_rel)
    except Exception as e:
        log.error(f"Embedding failed for {filepath.name}: {e}")
        record_ingest_failure(filepath, f"Embedding failed: {e}")
        return 0
+    if not rows:
+        return 0
    source = filepath.name
    try:
-        pg  = get_pg()
-        cur = pg.cursor()
-        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-            chunk_id = make_chunk_id(filepath, i)
-            cur.execute("""
-                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
-                VALUES (%s, %s, %s::vector, %s, %s, NOW(), %s)
-                ON CONFLICT (id) DO UPDATE SET
-                    document  = EXCLUDED.document,
-                    embedding = EXCLUDED.embedding,
-                    source    = EXCLUDED.source,
-                    metadata  = EXCLUDED.metadata
-            """, (chunk_id, chunk, embedding, source, "document",
-                  json.dumps({"source": source, "filepath": str(filepath)})))
-        pg.commit()
-        pg.close()
+        pg = get_pg()
+        try:
+            write_embeddings_batch(pg, rows)
+        finally:
+            pg.close()
    except Exception as e:
        log.error(f"pgvector write failed for {filepath.name}: {e}")
        record_ingest_failure(filepath, f"pgvector write failed: {e}")
        return 0
-    log.info(f"Indexed {len(chunks)} chunks: {filepath.name}")
+    log.info(f"Indexed {len(rows)} chunks: {filepath.name}")
    resolve_ingest_failure(source)
    enqueue_stage2(source, text)
-    return len(chunks)
+    return len(rows)


 def ingest_files(paths: list, embedder, state: dict) -> dict: