scripts/encoding.py: Stage 1 dual-implementation consolidation (Track 1 Finding 11)
Consolidates four extract paths and two extract-chunk-embed-write pipelines into a single shared encoding module. Fixes the embedder lifecycle divergence between watcher and /api/reindex (no more 200MB reload per reindex click) and unifies failure tracking so /api/reindex failures now surface in SettingsPanel "Ingest Health".

New files:
- scripts/encoding.py — extract_text, chunk_text, chunk_and_embed, write_embeddings_batch
- scripts/failures.py — record_ingest_failure, resolve_ingest_failure (shared by watcher.py and ingest.py)

Refactored:
- scripts/watcher.py — drops local extract/chunk/embed implementations and CHUNK_SIZE/CHUNK_OVERLAP/SUPPORTED constants; imports from encoding and failures. Now writes an ingest_failures row on empty-text-extract (was silent return 0).
- scripts/ingest.py — substantial rewrite. Exposes ingest_directory(folder, embedder=None) for in-process invocation; CLI back-compat preserved via ingest_folder wrapper. Module-level SentenceTransformer load removed.
- scripts/corpus_integrity.py — imports extract_text from encoding; extract_text_for_retry function removed.
- scripts/api.py — /api/reindex rewritten with BackgroundTasks (uses module-level embedder; no subprocess); new /api/reindex/status endpoint reading ~/aaronai/reindex_status.json; /api/corpus/retry imports extract_text from encoding; INGEST_SCRIPT constant removed (dead after this refactor); 409 reentrance guard prevents double-click stomping.

Behavior changes:
- /api/reindex no longer calls subprocess.Popen; it runs in the FastAPI BackgroundTasks threadpool and doesn't block the API thread.
- /api/reindex no longer reloads SentenceTransformer on each click.
- /api/reindex failures newly write to ingest_failures (visible in SettingsPanel "Ingest Health" — badge will jump on first reindex).
- New embeddings rows always have created_at = NOW() (canonical, server-side).
- New embeddings rows always include the metadata.folder field (None when not derivable).
- /api/reindex returns 409 on second click while a job is running.
- New /api/reindex/status endpoint for polling.

Existing 9,815 NULL created_at rows remain unchanged; backfill is a separate decision if desired.

199 insertions, 256 deletions across 6 files (codebase shrinks net).

Found by Track 1 inventory 2026-05-02 (Finding 11 / cross-cutting F11).

Pre-commit verification: BackgroundTasks already imported, sys.path resolves correctly via script-path semantics, static import clean.
This commit is contained in:
+58
-21
@@ -31,6 +31,9 @@ from fastapi.responses import StreamingResponse
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from apscheduler.triggers.cron import CronTrigger
|
||||
|
||||
from encoding import extract_text as encoding_extract_text
|
||||
from ingest import ingest_directory
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
MEMORY_PATH = Path.home() / "aaronai" / "memory.md"
|
||||
@@ -39,7 +42,6 @@ SETTINGS_PATH = Path.home() / "aaronai" / "settings.json"
|
||||
WATCHER_LOG = str(Path.home() / "aaronai" / "watcher.log")
|
||||
WATCHER_STATE = str(Path.home() / "aaronai" / "watcher_state.json")
|
||||
NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
|
||||
INGEST_SCRIPT = str(Path.home() / "aaronai" / "scripts" / "ingest.py")
|
||||
PYTHON = str(Path.home() / "aaronai" / "venv" / "bin" / "python3")
|
||||
|
||||
DEFAULT_SETTINGS = {
|
||||
@@ -908,13 +910,62 @@ async def list_captures():
|
||||
except Exception as e:
|
||||
return JSONResponse({"captures": []})
|
||||
|
||||
# JSON status file for the /api/reindex job: the reindex helpers below write
# the current job state here and read it back for the reentrance check and
# the status endpoint.
REINDEX_STATUS_PATH = Path.home() / "aaronai" / "reindex_status.json"
|
||||
|
||||
|
||||
def _read_reindex_status() -> dict:
    """Return the persisted reindex status, or ``{}`` when none is usable.

    The status lives in the JSON file at ``REINDEX_STATUS_PATH``. A missing,
    unreadable, or malformed file is treated as "nothing recorded" and yields
    an empty dict rather than an exception, so callers can always use
    ``.get(...)`` on the result.
    """
    try:
        # Read directly instead of checking exists() first: the file can
        # disappear between the check and the read.
        state = json.loads(REINDEX_STATUS_PATH.read_text())
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: invalid JSON
        # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
        return {}
    # Valid JSON that is not an object (e.g. a list) would break callers that
    # rely on the declared dict return type.
    return state if isinstance(state, dict) else {}
|
||||
|
||||
|
||||
def _write_reindex_status(state: dict) -> None:
    """Persist ``state`` as pretty-printed JSON at ``REINDEX_STATUS_PATH``.

    The payload is written to a sibling temp file and then renamed over the
    real path, so a concurrent reader never sees a half-written file. A
    truncated file would fail to parse and be read as "no status".

    Raises:
        TypeError / ValueError: if ``state`` is not JSON-serializable.
        OSError: if the file cannot be written or renamed.
    """
    # Serialize first so a bad payload never touches the existing file.
    payload = json.dumps(state, indent=2)
    tmp_path = REINDEX_STATUS_PATH.with_name(REINDEX_STATUS_PATH.name + ".tmp")
    tmp_path.write_text(payload)
    # Same-directory rename: atomic replacement of the destination.
    tmp_path.replace(REINDEX_STATUS_PATH)
|
||||
|
||||
|
||||
def _reindex_running() -> bool:
    """Report whether the persisted reindex status is currently "running"."""
    current = _read_reindex_status()
    # A status dict without a "status" key compares as not running.
    return current.get("status") == "running"
|
||||
|
||||
|
||||
def _run_reindex_background():
    """Background-thread entry: shares api.py's module-level embedder.

    Runs ``ingest_directory`` over ``NEXTCLOUD_PATH`` in-process and records
    progress in the reindex status file:

    * ``running`` with ``started_at`` before the ingest begins;
    * ``complete`` with ``started_at`` / ``finished_at`` plus whatever keys
      ``ingest_directory`` returns, on success;
    * ``error`` with ``started_at`` / ``finished_at`` and the exception text,
      on failure.

    Nothing is returned; callers observe the outcome through the status file.
    """
    started = datetime.now().isoformat()
    _write_reindex_status({"status": "running", "started_at": started})
    try:
        # In-process ingest with the already-loaded embedder. The old
        # subprocess launch and its early return are gone, so this call and
        # the status writes below are actually reached.
        result = ingest_directory(Path(NEXTCLOUD_PATH), embedder=embedder)
        _write_reindex_status({
            "status": "complete",
            "started_at": started,
            "finished_at": datetime.now().isoformat(),
            # NOTE(review): result keys are merged last, so a "status" key in
            # the ingest result would override the one above. Confirm the
            # shape of ingest_directory's return value.
            **result,
        })
    except Exception as e:
        # Top-level boundary for the background job: record the failure so
        # the status does not stay at "running".
        _write_reindex_status({
            "status": "error",
            "started_at": started,
            "finished_at": datetime.now().isoformat(),
            "error": str(e),
        })
|
||||
|
||||
|
||||
@app.post("/api/reindex")
async def trigger_reindex(background_tasks: BackgroundTasks,
                          auth: str = Depends(require_auth)):
    """Schedule a background reindex unless one is already recorded as running.

    Responds with ``{"started": True, ...}`` after queueing
    ``_run_reindex_background``, or with HTTP 409 and ``{"started": False, ...}``
    when the status file already says a reindex is running.
    """
    # NOTE(review): the status only becomes "running" once the background
    # task starts, so two near-simultaneous requests may both pass this check.
    if not _reindex_running():
        background_tasks.add_task(_run_reindex_background)
        return JSONResponse({"started": True, "message": "Re-indexing started in background"})

    busy_payload = {"started": False, "message": "reindex already running"}
    return JSONResponse(busy_payload, status_code=409)
|
||||
|
||||
|
||||
@app.get("/api/reindex/status")
async def reindex_status(auth: str = Depends(require_auth)):
    """Return the current contents of the reindex status file as JSON."""
    snapshot = _read_reindex_status()
    return JSONResponse(snapshot)
|
||||
|
||||
@app.delete("/api/conversations")
|
||||
async def clear_all_conversations(auth: str = Depends(require_auth)):
|
||||
@@ -1042,22 +1093,8 @@ async def corpus_retry(request: Request, auth: str = Depends(require_auth)):
|
||||
filepath = Path(row[0])
|
||||
if not filepath.exists():
|
||||
return JSONResponse({"error": f"file not found: {filepath}"}, status_code=404)
|
||||
suffix = filepath.suffix.lower()
|
||||
text = ""
|
||||
try:
|
||||
if suffix in {".txt", ".md"}:
|
||||
text = filepath.read_text(encoding="utf-8", errors="ignore")
|
||||
elif suffix == ".pdf":
|
||||
from pypdf import PdfReader
|
||||
text = "".join(p.extract_text() + "\n" for p in PdfReader(filepath).pages if p.extract_text())
|
||||
elif suffix == ".docx":
|
||||
from docx import Document as DocxDocument
|
||||
text = "\n".join(p.text for p in DocxDocument(filepath).paragraphs if p.text.strip())
|
||||
elif suffix == ".pptx":
|
||||
from pptx import Presentation
|
||||
prs = Presentation(filepath)
|
||||
text = "\n".join(shape.text for slide in prs.slides for shape in slide.shapes
|
||||
if hasattr(shape, "text") and shape.text.strip())
|
||||
text = encoding_extract_text(filepath)
|
||||
except Exception as e:
|
||||
return JSONResponse({"error": f"extraction failed: {e}"}, status_code=500)
|
||||
if not text.strip():
|
||||
|
||||
Reference in New Issue
Block a user