scripts/encoding.py: Stage 1 dual-implementation consolidation (Track 1 Finding 11)

Consolidates four extract paths and two extract-chunk-embed-write pipelines
into a single shared encoding module. Fixes the embedder lifecycle
divergence between watcher and /api/reindex (no more 200MB reload per
reindex click) and unifies failure tracking so /api/reindex failures now
surface in SettingsPanel "Ingest Health".

New files:
- scripts/encoding.py — extract_text, chunk_text, chunk_and_embed,
  write_embeddings_batch
- scripts/failures.py — record_ingest_failure, resolve_ingest_failure
  (shared by watcher.py and ingest.py)

Refactored:
- scripts/watcher.py — drops local extract/chunk/embed implementations
  and CHUNK_SIZE/CHUNK_OVERLAP/SUPPORTED constants; imports from encoding
  and failures. Now writes an ingest_failures row when text extraction
  yields nothing (previously a silent return 0).
- scripts/ingest.py — substantial rewrite. Exposes ingest_directory(folder,
  embedder=None) for in-process invocation; CLI back-compat preserved via
  ingest_folder wrapper. Module-level SentenceTransformer load removed.
- scripts/corpus_integrity.py — imports extract_text from encoding;
  extract_text_for_retry function removed.
- scripts/api.py — /api/reindex rewritten with BackgroundTasks (uses
  module-level embedder; no subprocess); new /api/reindex/status endpoint
  reading ~/aaronai/reindex_status.json; /api/corpus/retry imports
  extract_text from encoding; INGEST_SCRIPT constant removed (dead after
  this refactor); 409 reentrance guard prevents double-click stomping.

Behavior changes:
- /api/reindex no longer spawns a subprocess via subprocess.Popen; it runs in
  the FastAPI BackgroundTasks threadpool and does not block the API thread.
- /api/reindex no longer reloads SentenceTransformer on each click.
- /api/reindex failures newly write to ingest_failures (visible in
  SettingsPanel "Ingest Health" — badge will jump on first reindex).
- New embeddings rows always have created_at = NOW() (canonical, server-side).
- New embeddings rows always include metadata.folder field (None when not
  derivable).
- /api/reindex returns 409 on second click while a job is running.
- New /api/reindex/status endpoint for polling.

Existing 9,815 NULL created_at rows remain unchanged; backfill is a
separate decision if desired.

199 insertions, 256 deletions across 6 files (codebase shrinks net).

Found by Track 1 inventory 2026-05-02 (Finding 11 / cross-cutting F11).
Pre-commit verification: BackgroundTasks already imported, sys.path
resolves correctly via script-path semantics, static import clean.
This commit is contained in:
2026-05-03 01:40:47 +00:00
parent a317df66f8
commit 1101bef226
6 changed files with 357 additions and 264 deletions
+58 -21
View File
@@ -31,6 +31,9 @@ from fastapi.responses import StreamingResponse
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from encoding import extract_text as encoding_extract_text
from ingest import ingest_directory
load_dotenv(Path.home() / "aaronai" / ".env")
MEMORY_PATH = Path.home() / "aaronai" / "memory.md"
@@ -39,7 +42,6 @@ SETTINGS_PATH = Path.home() / "aaronai" / "settings.json"
WATCHER_LOG = str(Path.home() / "aaronai" / "watcher.log")
WATCHER_STATE = str(Path.home() / "aaronai" / "watcher_state.json")
NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
INGEST_SCRIPT = str(Path.home() / "aaronai" / "scripts" / "ingest.py")
PYTHON = str(Path.home() / "aaronai" / "venv" / "bin" / "python3")
DEFAULT_SETTINGS = {
@@ -908,13 +910,62 @@ async def list_captures():
except Exception as e:
return JSONResponse({"captures": []})
@app.post("/api/reindex")
async def trigger_reindex(auth: str = Depends(require_auth)):
# JSON file holding the state of the most recent /api/reindex job; written by
# _write_reindex_status() and read back by _read_reindex_status().
REINDEX_STATUS_PATH = Path.home() / "aaronai" / "reindex_status.json"
def _read_reindex_status() -> dict:
if REINDEX_STATUS_PATH.exists():
try:
return json.loads(REINDEX_STATUS_PATH.read_text())
except Exception:
return {}
return {}
def _write_reindex_status(state: dict):
REINDEX_STATUS_PATH.write_text(json.dumps(state, indent=2))
def _reindex_running() -> bool:
    """Report whether the persisted reindex status is ``"running"``."""
    # NOTE(review): this reflects only what the status file says; a job that
    # died without recording "complete"/"error" would presumably keep
    # reporting as running — confirm how that case is meant to be cleared.
    current = _read_reindex_status()
    return current.get("status") == "running"
def _run_reindex_background():
    """Background-thread entry: shares api.py's module-level embedder.

    Ingests ``NEXTCLOUD_PATH`` in-process through ``ingest_directory`` and
    records progress via ``_write_reindex_status``:

    * ``running`` (with ``started_at``) before ingestion begins;
    * ``complete`` (with ``finished_at`` plus every key of the mapping
      returned by ``ingest_directory``) on success;
    * ``error`` (with ``finished_at`` and the exception text) on failure.

    Nothing is returned; the status file is the only channel back to the API.
    """
    started = datetime.now().isoformat()
    _write_reindex_status({"status": "running", "started_at": started})
    try:
        # The pre-refactor subprocess.Popen([PYTHON, INGEST_SCRIPT, ...]) call
        # and its early JSONResponse return are dropped here: they referenced
        # the removed INGEST_SCRIPT constant and returned before ingestion ran,
        # leaving the status stuck at "running".
        result = ingest_directory(Path(NEXTCLOUD_PATH), embedder=embedder)
        _write_reindex_status({
            "status": "complete",
            "started_at": started,
            "finished_at": datetime.now().isoformat(),
            **result,
        })
    except Exception as e:
        # Top-level boundary of the background job: there is no caller to
        # propagate to, so the failure is recorded in the status file.
        _write_reindex_status({
            "status": "error",
            "started_at": started,
            "finished_at": datetime.now().isoformat(),
            "error": str(e),
        })
@app.post("/api/reindex")
async def trigger_reindex(background_tasks: BackgroundTasks,
                          auth: str = Depends(require_auth)):
    """Start a background reindex unless one is already running.

    Args:
        background_tasks: FastAPI task queue; the reindex job is scheduled on
            it and runs after the response has been sent.
        auth: result of the ``require_auth`` dependency (unused here beyond
            enforcing authentication).

    Returns:
        409 with ``{"started": False, ...}`` when the status file already
        says a job is running; otherwise ``{"started": True, ...}``.
    """
    if _reindex_running():
        return JSONResponse(
            {"started": False, "message": "reindex already running"},
            status_code=409,
        )
    # Claim the job before responding. The background task only starts after
    # the response is sent, so if "running" were first written there, a quick
    # second click could pass the guard above and start a second job. There is
    # no await between the check and this write, so within one event loop the
    # check-and-set cannot be interleaved with another request.
    _write_reindex_status({
        "status": "running",
        "started_at": datetime.now().isoformat(),
    })
    background_tasks.add_task(_run_reindex_background)
    return JSONResponse({"started": True, "message": "Re-indexing started in background"})
@app.get("/api/reindex/status")
async def reindex_status(auth: str = Depends(require_auth)):
    """Polling endpoint: return the persisted reindex status as JSON."""
    current = _read_reindex_status()
    return JSONResponse(current)
@app.delete("/api/conversations")
async def clear_all_conversations(auth: str = Depends(require_auth)):
@@ -1042,22 +1093,8 @@ async def corpus_retry(request: Request, auth: str = Depends(require_auth)):
filepath = Path(row[0])
if not filepath.exists():
return JSONResponse({"error": f"file not found: {filepath}"}, status_code=404)
suffix = filepath.suffix.lower()
text = ""
try:
if suffix in {".txt", ".md"}:
text = filepath.read_text(encoding="utf-8", errors="ignore")
elif suffix == ".pdf":
from pypdf import PdfReader
text = "".join(p.extract_text() + "\n" for p in PdfReader(filepath).pages if p.extract_text())
elif suffix == ".docx":
from docx import Document as DocxDocument
text = "\n".join(p.text for p in DocxDocument(filepath).paragraphs if p.text.strip())
elif suffix == ".pptx":
from pptx import Presentation
prs = Presentation(filepath)
text = "\n".join(shape.text for slide in prs.slides for shape in slide.shapes
if hasattr(shape, "text") and shape.text.strip())
text = encoding_extract_text(filepath)
except Exception as e:
return JSONResponse({"error": f"extraction failed: {e}"}, status_code=500)
if not text.strip():