1101bef226
Consolidates four extract paths and two extract-chunk-embed-write pipelines into a single shared encoding module. Fixes the embedder lifecycle divergence between watcher and /api/reindex (no more 200MB reload per reindex click) and unifies failure tracking so /api/reindex failures now surface in SettingsPanel "Ingest Health". New files: - scripts/encoding.py — extract_text, chunk_text, chunk_and_embed, write_embeddings_batch - scripts/failures.py — record_ingest_failure, resolve_ingest_failure (shared by watcher.py and ingest.py) Refactored: - scripts/watcher.py — drops local extract/chunk/embed implementations and CHUNK_SIZE/CHUNK_OVERLAP/SUPPORTED constants; imports from encoding and failures. Now writes ingest_failures row on empty-text-extract (was silent return 0). - scripts/ingest.py — substantial rewrite. Exposes ingest_directory(folder, embedder=None) for in-process invocation; CLI back-compat preserved via ingest_folder wrapper. Module-level SentenceTransformer load removed. - scripts/corpus_integrity.py — imports extract_text from encoding; extract_text_for_retry function removed. - scripts/api.py — /api/reindex rewritten with BackgroundTasks (uses module-level embedder; no subprocess); new /api/reindex/status endpoint reading ~/aaronai/reindex_status.json; /api/corpus/retry imports extract_text from encoding; INGEST_SCRIPT constant removed (dead after this refactor); 409 reentrance guard prevents double-click stomping. Behavior changes: - /api/reindex no longer subprocess.Popens; runs in FastAPI BackgroundTasks threadpool, doesn't block API thread. - /api/reindex no longer reloads SentenceTransformer on each click. - /api/reindex failures newly write to ingest_failures (visible in SettingsPanel "Ingest Health" — badge will jump on first reindex). - New embeddings rows always have created_at = NOW() (canonical, server-side). - New embeddings rows always include metadata.folder field (None when not derivable). - /api/reindex returns 409 on second click while a job is running. - New /api/reindex/status endpoint for polling. Existing 9,815 NULL created_at rows remain unchanged; backfill is a separate decision if desired. 199 insertions, 256 deletions across 6 files (codebase shrinks net). Found by Track 1 inventory 2026-05-02 (Finding 11 / cross-cutting F11). Pre-commit verification: BackgroundTasks already imported, sys.path resolves correctly via script-path semantics, static import clean.
164 lines
5.2 KiB
Python
164 lines
5.2 KiB
Python
"""
|
|
Aaron AI bulk ingester. Two entry points:
|
|
- ingest_directory(folder, embedder=None) — programmatic; called from
|
|
api.py /api/reindex with the api process's shared embedder
|
|
- python3 scripts/ingest.py <folder> — CLI back-compat; loads its own embedder
|
|
|
|
Stage 1 helpers (extract / chunk / embed / write) live in scripts/encoding.py.
|
|
Failure tracking SQL lives in scripts/failures.py.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from dotenv import load_dotenv
|
|
import psycopg2
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
from encoding import extract_text, chunk_and_embed, write_embeddings_batch, SUPPORTED
|
|
from failures import (
|
|
record_ingest_failure as _record_failure_sql,
|
|
resolve_ingest_failure as _resolve_failure_sql,
|
|
)
|
|
|
|
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
|
|
|
|
PG_DSN = os.getenv("PG_DSN")
|
|
|
|
|
|
def get_pg():
|
|
return psycopg2.connect(PG_DSN)
|
|
|
|
|
|
def enqueue_stage2(source, full_text):
|
|
"""Enqueue document for Stage 2 (Mistral orientation) -> Stage 3 (Graphiti ingest).
|
|
TEMPORARY: this queue feed will be removed when pgvector is decommissioned
|
|
and the watcher calls Stage 2 directly.
|
|
"""
|
|
try:
|
|
pg = get_pg()
|
|
cur = pg.cursor()
|
|
cur.execute("""
|
|
INSERT INTO stage_2_queue (source, full_text, char_length)
|
|
VALUES (%s, %s, %s)
|
|
ON CONFLICT (source) DO UPDATE SET
|
|
full_text = EXCLUDED.full_text,
|
|
char_length = EXCLUDED.char_length,
|
|
enqueued_at = NOW(),
|
|
completed_at = NULL,
|
|
failed_at = NULL,
|
|
attempts = 0
|
|
""", (source, full_text, len(full_text)))
|
|
pg.commit()
|
|
pg.close()
|
|
except Exception as e:
|
|
print(f" Stage 2 queue insert failed (non-fatal): {e}")
|
|
|
|
|
|
def _record_failure(filepath: Path, error: str) -> None:
|
|
try:
|
|
pg = get_pg()
|
|
try:
|
|
_record_failure_sql(pg, filepath.name, filepath, error)
|
|
finally:
|
|
pg.close()
|
|
except Exception as e:
|
|
print(f" Could not record ingest failure (non-fatal): {e}")
|
|
|
|
|
|
def _resolve_failure(source: str) -> None:
|
|
try:
|
|
pg = get_pg()
|
|
try:
|
|
_resolve_failure_sql(pg, source)
|
|
finally:
|
|
pg.close()
|
|
except Exception as e:
|
|
print(f" Could not resolve ingest failure record (non-fatal): {e}")
|
|
|
|
|
|
def _ingest_one(filepath: Path, embedder, root: Path = None) -> int:
|
|
"""Ingest a single file. Returns chunk count, 0 on skip/failure."""
|
|
if filepath.name.startswith(("~$", ".")):
|
|
return 0
|
|
if filepath.suffix.lower() not in SUPPORTED:
|
|
return 0
|
|
text = extract_text(filepath)
|
|
if not text.strip():
|
|
_record_failure(filepath, "Text extraction failed or empty")
|
|
return 0
|
|
folder_rel = None
|
|
if root is not None:
|
|
try:
|
|
folder_rel = str(filepath.parent.relative_to(root))
|
|
except ValueError:
|
|
pass
|
|
try:
|
|
rows = chunk_and_embed(text, filepath.name, embedder,
|
|
filepath=filepath, folder=folder_rel)
|
|
except Exception as e:
|
|
_record_failure(filepath, f"Embedding failed: {e}")
|
|
return 0
|
|
if not rows:
|
|
return 0
|
|
try:
|
|
pg = get_pg()
|
|
try:
|
|
write_embeddings_batch(pg, rows)
|
|
finally:
|
|
pg.close()
|
|
except Exception as e:
|
|
_record_failure(filepath, f"pgvector write failed: {e}")
|
|
return 0
|
|
print(f" Indexed {len(rows)} chunks: {filepath.name}")
|
|
_resolve_failure(filepath.name)
|
|
if not os.getenv("SKIP_STAGE2_ENQUEUE"):
|
|
enqueue_stage2(filepath.name, text)
|
|
return len(rows)
|
|
|
|
|
|
def ingest_directory(folder, embedder=None) -> dict:
|
|
"""Programmatic entry point. Returns {scanned, ingested, failed, total_chunks}.
|
|
|
|
If embedder is None, loads its own SentenceTransformer (CLI back-compat path).
|
|
Caller (e.g. api.py /api/reindex) should pass its module-level embedder so
|
|
the ~200MB model isn't reloaded per call.
|
|
"""
|
|
folder = Path(folder)
|
|
if not folder.exists():
|
|
return {"scanned": 0, "ingested": 0, "failed": 0, "total_chunks": 0,
|
|
"error": f"folder not found: {folder}"}
|
|
|
|
if embedder is None:
|
|
print("Loading embedding model...")
|
|
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
|
|
|
files = [f for f in folder.rglob("*")
|
|
if f.suffix.lower() in SUPPORTED
|
|
and not f.name.startswith(("~$", "."))]
|
|
print(f"Found {len(files)} files to process")
|
|
|
|
ingested = failed = total_chunks = 0
|
|
for f in files:
|
|
n = _ingest_one(f, embedder, root=folder)
|
|
if n > 0:
|
|
ingested += 1
|
|
total_chunks += n
|
|
else:
|
|
failed += 1
|
|
return {"scanned": len(files), "ingested": ingested, "failed": failed,
|
|
"total_chunks": total_chunks}
|
|
|
|
|
|
def ingest_folder(folder_path):
|
|
"""CLI back-compat wrapper. Loads its own embedder."""
|
|
result = ingest_directory(Path(folder_path))
|
|
print(f"\nDone. {result['ingested']} files / {result['total_chunks']} chunks indexed; "
|
|
f"{result['failed']} failed.")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs")
|
|
print(f"Ingesting from: {target}\n")
|
|
ingest_folder(target)
|