scripts/encoding.py: Stage 1 dual-implementation consolidation (Track 1 Finding 11)
Consolidates four extract paths and two extract-chunk-embed-write pipelines into a single shared encoding module. Fixes the embedder lifecycle divergence between the watcher and /api/reindex (no more 200MB model reload per reindex click) and unifies failure tracking so that /api/reindex failures now surface in SettingsPanel "Ingest Health".

New files:
- scripts/encoding.py — extract_text, chunk_text, chunk_and_embed, write_embeddings_batch
- scripts/failures.py — record_ingest_failure, resolve_ingest_failure (shared by watcher.py and ingest.py)

Refactored:
- scripts/watcher.py — drops its local extract/chunk/embed implementations and the CHUNK_SIZE/CHUNK_OVERLAP/SUPPORTED constants; imports from encoding and failures. Now writes an ingest_failures row on empty-text-extract (previously a silent return 0).
- scripts/ingest.py — substantial rewrite. Exposes ingest_directory(folder, embedder=None) for in-process invocation; CLI back-compat is preserved via the ingest_folder wrapper. Module-level SentenceTransformer load removed.
- scripts/corpus_integrity.py — imports extract_text from encoding; the extract_text_for_retry function is removed.
- scripts/api.py — /api/reindex rewritten with BackgroundTasks (uses the module-level embedder; no subprocess); new /api/reindex/status endpoint reading ~/aaronai/reindex_status.json; /api/corpus/retry imports extract_text from encoding; INGEST_SCRIPT constant removed (dead after this refactor); a 409 reentrancy guard prevents double-click stomping.

Behavior changes:
- /api/reindex no longer calls subprocess.Popen; it runs in the FastAPI BackgroundTasks threadpool and does not block the API thread.
- /api/reindex no longer reloads SentenceTransformer on each click.
- /api/reindex failures are now written to ingest_failures (visible in SettingsPanel "Ingest Health" — the badge will jump on the first reindex).
- New embeddings rows always have created_at = NOW() (canonical, server-side).
- New embeddings rows always include a metadata.folder field (None when not derivable).
- /api/reindex returns 409 on a second click while a job is running.
- New /api/reindex/status endpoint for polling.

The existing 9,815 rows with NULL created_at remain unchanged; a backfill is a separate decision if desired.

199 insertions, 256 deletions across 6 files (the codebase shrinks net).

Found by the Track 1 inventory on 2026-05-02 (Finding 11 / cross-cutting F11).

Pre-commit verification: BackgroundTasks already imported, sys.path resolves correctly via script-path semantics, static import clean.
This commit is contained in:
+112
-131
@@ -1,70 +1,37 @@
|
||||
"""
|
||||
Aaron AI bulk ingester. Two entry points:
|
||||
- ingest_directory(folder, embedder=None) — programmatic; called from
|
||||
api.py /api/reindex with the api process's shared embedder
|
||||
- python3 scripts/ingest.py <folder> — CLI back-compat; loads its own embedder
|
||||
|
||||
Stage 1 helpers (extract / chunk / embed / write) live in scripts/encoding.py.
|
||||
Failure tracking SQL lives in scripts/failures.py.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import json
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from docx import Document
|
||||
from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from encoding import extract_text, chunk_and_embed, write_embeddings_batch, SUPPORTED
|
||||
from failures import (
|
||||
record_ingest_failure as _record_failure_sql,
|
||||
resolve_ingest_failure as _resolve_failure_sql,
|
||||
)
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
|
||||
|
||||
print("Loading embedding model...")
|
||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
|
||||
PG_DSN = os.getenv("PG_DSN")
|
||||
|
||||
|
||||
def get_pg():
    """Open and return a new psycopg2 connection built from the module-level PG_DSN."""
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||
|
||||
def extract_text_from_docx(path):
    """Return the text of every non-blank paragraph in a .docx file, newline-joined."""
    paragraphs = Document(path).paragraphs
    # Paragraphs whose text is empty or whitespace-only are dropped.
    kept = (paragraph.text for paragraph in paragraphs if paragraph.text.strip())
    return "\n".join(kept)
|
||||
|
||||
def extract_text_from_pdf(path):
    """Return the extracted text of every page in a PDF, one newline after each page.

    Pages for which ``extract_text()`` yields nothing (falsy) contribute no output.
    """
    pieces = []
    for page in PdfReader(path).pages:
        page_text = page.extract_text()
        if page_text:
            pieces.append(page_text + "\n")
    return "".join(pieces)
|
||||
|
||||
def extract_text_from_pptx(path):
    """Return the text of every text-bearing shape in a .pptx file.

    Shapes are visited slide by slide; each non-blank shape text is followed
    by a newline. Shapes without a ``text`` attribute are skipped.
    """
    lines = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            if shape.text.strip():
                lines.append(shape.text + "\n")
    return "".join(lines)
|
||||
|
||||
def extract_text_from_txt(path):
    """Read a plain-text file as UTF-8, silently dropping undecodable bytes."""
    with open(path, mode="r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents
|
||||
|
||||
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping windows of whitespace-separated words.

    Each chunk holds up to ``chunk_size`` words joined by single spaces, and
    consecutive chunks start ``chunk_size - overlap`` words apart.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings; empty when *text* contains no words.

    Raises:
        ValueError: If ``overlap >= chunk_size``. The window would never
            advance, which previously caused an infinite loop.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    windows = (
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    )
    # Keep the original filter so empty windows never reach the caller.
    return [window for window in windows if window.strip()]
|
||||
|
||||
def make_id(filepath, chunk_index):
    """Build a chunk id: first 8 hex chars of the MD5 of the path string, then ``_<chunk_index>``."""
    digest = hashlib.md5(str(filepath).encode()).hexdigest()
    return "{}_{}".format(digest[:8], chunk_index)
|
||||
|
||||
def enqueue_stage2(source, full_text):
|
||||
"""Enqueue document for Stage 2 (Mistral orientation) → Stage 3 (Graphiti ingest).
|
||||
"""Enqueue document for Stage 2 (Mistral orientation) -> Stage 3 (Graphiti ingest).
|
||||
TEMPORARY: this queue feed will be removed when pgvector is decommissioned
|
||||
and the watcher calls Stage 2 directly.
|
||||
"""
|
||||
@@ -87,94 +54,108 @@ def enqueue_stage2(source, full_text):
|
||||
except Exception as e:
|
||||
print(f" Stage 2 queue insert failed (non-fatal): {e}")
|
||||
|
||||
def ingest_file(filepath):
|
||||
path = Path(filepath)
|
||||
suffix = path.suffix.lower()
|
||||
|
||||
if path.name.startswith("~$") or path.name.startswith("."):
|
||||
return 0
|
||||
|
||||
def _record_failure(filepath: Path, error: str) -> None:
|
||||
try:
|
||||
if suffix == ".docx":
|
||||
text = extract_text_from_docx(path)
|
||||
elif suffix == ".pdf":
|
||||
text = extract_text_from_pdf(path)
|
||||
elif suffix == ".pptx":
|
||||
text = extract_text_from_pptx(path)
|
||||
elif suffix in [".txt", ".md"]:
|
||||
text = extract_text_from_txt(path)
|
||||
else:
|
||||
return 0
|
||||
|
||||
if not text.strip():
|
||||
return 0
|
||||
|
||||
chunks = chunk_text(text)
|
||||
if not chunks:
|
||||
return 0
|
||||
|
||||
embeddings = embedder.encode(chunks).tolist()
|
||||
ids = [make_id(path, i) for i in range(len(chunks))]
|
||||
metadatas = [{
|
||||
"source": path.name,
|
||||
"filepath": str(path),
|
||||
"folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent))
|
||||
} for _ in chunks]
|
||||
|
||||
# STAGE 1: Write to pgvector (TEMPORARY — remove when chat agent migrates to Graphiti)
|
||||
pg = get_pg()
|
||||
cur = pg.cursor()
|
||||
for chunk_id, chunk, embedding, meta in zip(ids, chunks, embeddings, metadatas):
|
||||
cur.execute("""
|
||||
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
|
||||
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
|
||||
ON CONFLICT (id) DO UPDATE SET
|
||||
document = EXCLUDED.document,
|
||||
embedding = EXCLUDED.embedding,
|
||||
source = EXCLUDED.source,
|
||||
metadata = EXCLUDED.metadata
|
||||
""", (
|
||||
chunk_id, chunk, embedding,
|
||||
meta.get("source"), "document", None,
|
||||
json.dumps(meta)
|
||||
))
|
||||
pg.commit()
|
||||
pg.close()
|
||||
print(f" Indexed {len(chunks)} chunks: {path.name}")
|
||||
|
||||
# Enqueue for Stage 2 → Stage 3 (Graphiti pipeline)
|
||||
# SKIP_STAGE2_ENQUEUE env var set by migration scripts to prevent bulk enqueue
|
||||
if not os.getenv("SKIP_STAGE2_ENQUEUE"):
|
||||
enqueue_stage2(path.name, text)
|
||||
|
||||
return len(chunks)
|
||||
|
||||
try:
|
||||
_record_failure_sql(pg, filepath.name, filepath, error)
|
||||
finally:
|
||||
pg.close()
|
||||
except Exception as e:
|
||||
print(f" Error: {path.name}: {e}")
|
||||
print(f" Could not record ingest failure (non-fatal): {e}")
|
||||
|
||||
|
||||
def _resolve_failure(source: str) -> None:
    """Run the shared resolve-failure SQL helper for *source* on a fresh connection.

    Best-effort: any exception (connecting or executing) is printed and
    swallowed so that the caller is never interrupted.
    """
    try:
        conn = get_pg()
        try:
            _resolve_failure_sql(conn, source)
        finally:
            # Close the connection whether or not the helper succeeded.
            conn.close()
    except Exception as exc:
        print(f" Could not resolve ingest failure record (non-fatal): {exc}")
|
||||
|
||||
|
||||
def _ingest_one(filepath: Path, embedder, root: Path = None) -> int:
|
||||
"""Ingest a single file. Returns chunk count, 0 on skip/failure."""
|
||||
if filepath.name.startswith(("~$", ".")):
|
||||
return 0
|
||||
if filepath.suffix.lower() not in SUPPORTED:
|
||||
return 0
|
||||
text = extract_text(filepath)
|
||||
if not text.strip():
|
||||
_record_failure(filepath, "Text extraction failed or empty")
|
||||
return 0
|
||||
folder_rel = None
|
||||
if root is not None:
|
||||
try:
|
||||
folder_rel = str(filepath.parent.relative_to(root))
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
rows = chunk_and_embed(text, filepath.name, embedder,
|
||||
filepath=filepath, folder=folder_rel)
|
||||
except Exception as e:
|
||||
_record_failure(filepath, f"Embedding failed: {e}")
|
||||
return 0
|
||||
if not rows:
|
||||
return 0
|
||||
try:
|
||||
pg = get_pg()
|
||||
try:
|
||||
write_embeddings_batch(pg, rows)
|
||||
finally:
|
||||
pg.close()
|
||||
except Exception as e:
|
||||
_record_failure(filepath, f"pgvector write failed: {e}")
|
||||
return 0
|
||||
print(f" Indexed {len(rows)} chunks: {filepath.name}")
|
||||
_resolve_failure(filepath.name)
|
||||
if not os.getenv("SKIP_STAGE2_ENQUEUE"):
|
||||
enqueue_stage2(filepath.name, text)
|
||||
return len(rows)
|
||||
|
||||
|
||||
def ingest_directory(folder, embedder=None) -> dict:
    """Programmatic entry point. Returns {scanned, ingested, failed, total_chunks}.

    When *folder* does not exist the counts are all zero and an "error" key
    describes the problem.

    If embedder is None, a SentenceTransformer is loaded here (CLI back-compat
    path). Callers such as api.py /api/reindex should pass their module-level
    embedder so the ~200MB model isn't reloaded per call.
    """
    folder = Path(folder)
    if not folder.exists():
        return {
            "scanned": 0,
            "ingested": 0,
            "failed": 0,
            "total_chunks": 0,
            "error": f"folder not found: {folder}",
        }

    if embedder is None:
        print("Loading embedding model...")
        embedder = SentenceTransformer("all-MiniLM-L6-v2")

    # Recursively collect supported files, skipping "~$" and "." prefixed names.
    files = []
    for candidate in folder.rglob("*"):
        if candidate.suffix.lower() not in SUPPORTED:
            continue
        if candidate.name.startswith(("~$", ".")):
            continue
        files.append(candidate)
    print(f"Found {len(files)} files to process")

    # A zero chunk count from _ingest_one is tallied as a failure.
    chunk_counts = [_ingest_one(path, embedder, root=folder) for path in files]
    successes = [count for count in chunk_counts if count > 0]
    return {
        "scanned": len(files),
        "ingested": len(successes),
        "failed": len(chunk_counts) - len(successes),
        "total_chunks": sum(successes),
    }
|
||||
|
||||
|
||||
def ingest_folder(folder_path):
|
||||
folder = Path(folder_path)
|
||||
if not folder.exists():
|
||||
print(f"Folder not found: {folder_path}")
|
||||
sys.exit(1)
|
||||
"""CLI back-compat wrapper. Loads its own embedder."""
|
||||
result = ingest_directory(Path(folder_path))
|
||||
print(f"\nDone. {result['ingested']} files / {result['total_chunks']} chunks indexed; "
|
||||
f"{result['failed']} failed.")
|
||||
|
||||
supported = [".docx", ".pdf", ".pptx", ".txt", ".md"]
|
||||
files = [f for f in folder.rglob("*")
|
||||
if f.suffix.lower() in supported
|
||||
and not f.name.startswith("~$")
|
||||
and not f.name.startswith(".")]
|
||||
|
||||
if not files:
|
||||
print("No supported files found.")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Found {len(files)} files to process\n")
|
||||
total_chunks = 0
|
||||
for f in files:
|
||||
total_chunks += ingest_file(f)
|
||||
|
||||
print(f"\nDone. Total chunks indexed: {total_chunks}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs")
|
||||
|
||||
Reference in New Issue
Block a user