scripts/encoding.py: Stage 1 dual-implementation consolidation (Track 1 Finding 11)

Consolidates four extract paths and two extract-chunk-embed-write pipelines into a single shared encoding module. Fixes the embedder lifecycle divergence between watcher and /api/reindex (no more 200MB reload per reindex click) and unifies failure tracking so /api/reindex failures now surface in SettingsPanel "Ingest Health". New files: - scripts/encoding.py — extract_text, chunk_text, chunk_and_embed, write_embeddings_batch - scripts/failures.py — record_ingest_failure, resolve_ingest_failure (shared by watcher.py and ingest.py) Refactored: - scripts/watcher.py — drops local extract/chunk/embed implementations and CHUNK_SIZE/CHUNK_OVERLAP/SUPPORTED constants; imports from encoding and failures. Now writes ingest_failures row on empty-text-extract (was silent return 0). - scripts/ingest.py — substantial rewrite. Exposes ingest_directory(folder, embedder=None) for in-process invocation; CLI back-compat preserved via ingest_folder wrapper. Module-level SentenceTransformer load removed. - scripts/corpus_integrity.py — imports extract_text from encoding; extract_text_for_retry function removed. - scripts/api.py — /api/reindex rewritten with BackgroundTasks (uses module-level embedder; no subprocess); new /api/reindex/status endpoint reading ~/aaronai/reindex_status.json; /api/corpus/retry imports extract_text from encoding; INGEST_SCRIPT constant removed (dead after this refactor); 409 reentrance guard prevents double-click stomping. Behavior changes: - /api/reindex no longer subprocess.Popens; runs in FastAPI BackgroundTasks threadpool, doesn't block API thread. - /api/reindex no longer reloads SentenceTransformer on each click. - /api/reindex failures newly write to ingest_failures (visible in SettingsPanel "Ingest Health" — badge will jump on first reindex). - New embeddings rows always have created_at = NOW() (canonical, server-side). - New embeddings rows always include metadata.folder field (None when not derivable). 
- /api/reindex returns 409 on second click while a job is running. - New /api/reindex/status endpoint for polling. Existing 9,815 NULL created_at rows remain unchanged; backfill is a separate decision if desired. 199 insertions, 256 deletions across 6 files (codebase shrinks net). Found by Track 1 inventory 2026-05-02 (Finding 11 / cross-cutting F11). Pre-commit verification: BackgroundTasks already imported, sys.path resolves correctly via script-path semantics, static import clean.
2026-05-03 01:40:47 +00:00
parent a317df66f8
commit 1101bef226
6 changed files with 357 additions and 264 deletions
@@ -31,6 +31,9 @@ from fastapi.responses import StreamingResponse
 from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.cron import CronTrigger

+from encoding import extract_text as encoding_extract_text
+from ingest import ingest_directory
+
 load_dotenv(Path.home() / "aaronai" / ".env")

 MEMORY_PATH = Path.home() / "aaronai" / "memory.md"
@@ -39,7 +42,6 @@ SETTINGS_PATH = Path.home() / "aaronai" / "settings.json"
 WATCHER_LOG = str(Path.home() / "aaronai" / "watcher.log")
 WATCHER_STATE = str(Path.home() / "aaronai" / "watcher_state.json")
 NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
-INGEST_SCRIPT = str(Path.home() / "aaronai" / "scripts" / "ingest.py")
 PYTHON = str(Path.home() / "aaronai" / "venv" / "bin" / "python3")

 DEFAULT_SETTINGS = {
@@ -908,13 +910,62 @@ async def list_captures():
    except Exception as e:
        return JSONResponse({"captures": []})

-@app.post("/api/reindex")
-async def trigger_reindex(auth: str = Depends(require_auth)):
+REINDEX_STATUS_PATH = Path.home() / "aaronai" / "reindex_status.json"
+
+
+def _read_reindex_status() -> dict:
+    if REINDEX_STATUS_PATH.exists():
+        try:
+            return json.loads(REINDEX_STATUS_PATH.read_text())
+        except Exception:
+            return {}
+    return {}
+
+
+def _write_reindex_status(state: dict):
+    REINDEX_STATUS_PATH.write_text(json.dumps(state, indent=2))
+
+
+def _reindex_running() -> bool:
+    return _read_reindex_status().get("status") == "running"
+
+
+def _run_reindex_background():
+    """Background-thread entry: shares api.py's module-level embedder."""
+    started = datetime.now().isoformat()  # captured once and reused as started_at in every status written below
+    _write_reindex_status({"status": "running", "started_at": started})  # flag the job as running before any ingest work
    try:
-        subprocess.Popen([PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH])
-        return JSONResponse({"started": True, "message": "Re-indexing started in background"})
+        result = ingest_directory(Path(NEXTCLOUD_PATH), embedder=embedder)  # result is **-unpacked below, so it must be a mapping
+        _write_reindex_status({
+            "status": "complete",
+            "started_at": started,
+            "finished_at": datetime.now().isoformat(),
+            **result,  # spread last: keys in result override the fixed keys above on collision
+        })
    except Exception as e:
-        return JSONResponse({"started": False, "error": str(e)})
+        _write_reindex_status({  # any failure is recorded in the status file instead of propagating
+            "status": "error",
+            "started_at": started,
+            "finished_at": datetime.now().isoformat(),
+            "error": str(e),
+        })
+
+
+@app.post("/api/reindex")
+async def trigger_reindex(background_tasks: BackgroundTasks,
+                          auth: str = Depends(require_auth)):
+    if _reindex_running():
+        return JSONResponse(
+            {"started": False, "message": "reindex already running"},
+            status_code=409,
+        )
+    background_tasks.add_task(_run_reindex_background)
+    return JSONResponse({"started": True, "message": "Re-indexing started in background"})
+
+
+@app.get("/api/reindex/status")
+async def reindex_status(auth: str = Depends(require_auth)):
+    return JSONResponse(_read_reindex_status())

@app.delete("/api/conversations")
 async def clear_all_conversations(auth: str = Depends(require_auth)):
@@ -1042,22 +1093,8 @@ async def corpus_retry(request: Request, auth: str = Depends(require_auth)):
        filepath = Path(row[0])
        if not filepath.exists():
            return JSONResponse({"error": f"file not found: {filepath}"}, status_code=404)
-        suffix = filepath.suffix.lower()
-        text = ""
        try:
-            if suffix in {".txt", ".md"}:
-                text = filepath.read_text(encoding="utf-8", errors="ignore")
-            elif suffix == ".pdf":
-                from pypdf import PdfReader
-                text = "".join(p.extract_text() + "\n" for p in PdfReader(filepath).pages if p.extract_text())
-            elif suffix == ".docx":
-                from docx import Document as DocxDocument
-                text = "\n".join(p.text for p in DocxDocument(filepath).paragraphs if p.text.strip())
-            elif suffix == ".pptx":
-                from pptx import Presentation
-                prs = Presentation(filepath)
-                text = "\n".join(shape.text for slide in prs.slides for shape in slide.shapes
-                                if hasattr(shape, "text") and shape.text.strip())
+            text = encoding_extract_text(filepath)
        except Exception as e:
            return JSONResponse({"error": f"extraction failed: {e}"}, status_code=500)
        if not text.strip():