corpus integrity: ingest_failures tracking in watcher, reconciliation script, corpus status/retry/reconcile endpoints
This commit is contained in:
+173
@@ -957,6 +957,179 @@ async def clear_all_conversations(auth: str = Depends(require_auth)):
|
|||||||
return JSONResponse({"cleared": True})
|
return JSONResponse({"cleared": True})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Corpus Integrity Endpoints ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Paths used by the corpus-integrity endpoints; all of them live under ~/aaronai.
_AARONAI_HOME = Path.home() / "aaronai"

# Reconciliation script launched by the reconcile endpoint.
CORPUS_INTEGRITY_SCRIPT = str(_AARONAI_HOME / "scripts" / "corpus_integrity.py")
# JSON report read back by the status endpoint.
CORPUS_REPORT_PATH = _AARONAI_HOME / "corpus_integrity_report.json"
# Lower-cased file suffixes that count as corpus documents.
SUPPORTED_EXTS = {".pdf", ".docx", ".pptx", ".txt", ".md"}
# State file whose "ingested" list is read by the status endpoint.
MIGRATION_STATE_PATH = _AARONAI_HOME / "experiments" / "tier1_migration_state.json"
|
||||||
|
|
||||||
|
|
||||||
|
def get_corpus_status_data():
    """Collect a best-effort snapshot of corpus coverage.

    Each section (filesystem walk, pgvector count, Graphiti sources,
    unresolved failures, last reconciliation report) is gathered
    independently; a failure in one section is logged and leaves that
    section at its default value instead of failing the whole call.

    Returns:
        dict with keys ``filesystem`` (int), ``pgvector`` (int),
        ``graphiti`` (int), ``failures`` (list of dicts, newest first, at
        most 50), ``failure_count`` (int, total unresolved failures) and
        ``last_reconciliation`` (dict or None).
    """
    import logging

    logger = logging.getLogger(__name__)
    failure_limit = 50

    def fetch_rows(sql):
        """Run one read-only query on its own connection and always close it."""
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute(sql)
            return cur.fetchall()
        finally:
            pg.close()

    # 1. Supported files on disk (same exclusions as the reconciliation script).
    fs_count = 0
    try:
        for path in Path(NEXTCLOUD_PATH).rglob("*"):
            if not path.is_file() or path.suffix.lower() not in SUPPORTED_EXTS:
                continue
            if path.name.startswith((".", "~$")):
                continue
            if "Admin/Backups" in str(path) or "Backups" in path.parts:
                continue
            if "Journal/Media" in str(path):
                continue
            fs_count += 1
    except Exception as e:
        # Keep whatever was counted before the walk failed.
        logger.warning("corpus status: filesystem scan failed: %s", e)

    # 2. Distinct sources present in the embeddings table.
    pv_count = 0
    try:
        pv_count = fetch_rows(
            "SELECT COUNT(DISTINCT source) FROM embeddings WHERE source IS NOT NULL"
        )[0][0]
    except Exception as e:
        logger.warning("corpus status: pgvector count failed: %s", e)

    # 3. Graphiti sources: migration state file plus completed stage-3 rows.
    gr_sources = set()
    try:
        if MIGRATION_STATE_PATH.exists():
            state = json.loads(MIGRATION_STATE_PATH.read_text())
            for fp in state.get("ingested", []):
                gr_sources.add(Path(fp).name)
    except Exception as e:
        logger.warning("corpus status: migration state unreadable: %s", e)
    try:
        rows = fetch_rows(
            "SELECT DISTINCT source FROM stage_3_queue WHERE completed_at IS NOT NULL"
        )
        # A NULL source is not a document; do not let it inflate the count.
        gr_sources.update(row[0] for row in rows if row[0] is not None)
    except Exception as e:
        logger.warning("corpus status: stage_3_queue query failed: %s", e)

    # 4. Unresolved ingest failures, newest first.
    failures = []
    try:
        rows = fetch_rows("""
            SELECT source, filepath, error, retry_count, first_failed_at, last_failed_at
            FROM ingest_failures WHERE resolved = FALSE
            ORDER BY last_failed_at DESC LIMIT 50
        """)
        for row in rows:
            failures.append({
                "source": row[0], "filepath": row[1], "error": row[2],
                "retry_count": row[3], "first_failed_at": str(row[4]),
                "last_failed_at": str(row[5]),
            })
    except Exception as e:
        logger.warning("corpus status: ingest_failures query failed: %s", e)

    # The list above is capped, so only when the cap was hit ask the database
    # for the real total; otherwise the list length already is the total.
    failure_count = len(failures)
    if failure_count >= failure_limit:
        try:
            failure_count = fetch_rows(
                "SELECT COUNT(*) FROM ingest_failures WHERE resolved = FALSE"
            )[0][0]
        except Exception as e:
            logger.warning("corpus status: ingest_failures count failed: %s", e)

    # 5. Summary of the last reconciliation report, if one exists.
    last_report = None
    try:
        if CORPUS_REPORT_PATH.exists():
            report = json.loads(CORPUS_REPORT_PATH.read_text())
            last_report = {
                "timestamp": report.get("timestamp"),
                "gaps": report.get("summary", {}).get("neither", 0),
                "auto_queued": len(report.get("auto_queued", [])),
            }
    except Exception as e:
        logger.warning("corpus status: reconciliation report unreadable: %s", e)

    return {
        "filesystem": fs_count,
        "pgvector": pv_count,
        "graphiti": len(gr_sources),
        "failures": failures,
        "failure_count": failure_count,
        "last_reconciliation": last_report,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/corpus/status")
async def corpus_status(auth: str = Depends(require_auth)):
    """Return the corpus status snapshot as JSON.

    get_corpus_status_data() walks the filesystem and runs synchronous
    database queries, so it is executed in the default thread pool instead
    of on the event loop. Any unexpected error is reported as a 500 with
    the error text.
    """
    import asyncio

    try:
        loop = asyncio.get_running_loop()
        data = await loop.run_in_executor(None, get_corpus_status_data)
        return JSONResponse(data)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_retry_text(filepath):
    """Return the plain text of a supported document, or "" for other suffixes.

    Extraction errors propagate to the caller.
    """
    suffix = filepath.suffix.lower()
    if suffix in {".txt", ".md"}:
        return filepath.read_text(encoding="utf-8", errors="ignore")
    if suffix == ".pdf":
        from pypdf import PdfReader
        # Extract each page once; empty/None pages are skipped.
        page_texts = (page.extract_text() for page in PdfReader(filepath).pages)
        return "".join(page_text + "\n" for page_text in page_texts if page_text)
    if suffix == ".docx":
        from docx import Document as DocxDocument
        return "\n".join(p.text for p in DocxDocument(filepath).paragraphs if p.text.strip())
    if suffix == ".pptx":
        from pptx import Presentation
        prs = Presentation(filepath)
        return "\n".join(shape.text for slide in prs.slides for shape in slide.shapes
                         if hasattr(shape, "text") and shape.text.strip())
    return ""


@app.post("/api/corpus/retry")
async def corpus_retry(request: Request, auth: str = Depends(require_auth)):
    """Re-queue a previously failed file for stage 2.

    Expects a JSON object with a non-empty ``source``. Looks up the file
    path in ingest_failures, re-extracts the text, upserts it into
    stage_2_queue and bumps the failure row's retry counter.

    Responses: 400 for a bad body or missing source, 404 when the source or
    the file is unknown, 422 when the file yields no text, 500 for
    extraction or database errors, otherwise ``{"queued": True, ...}``.
    """
    import asyncio

    try:
        try:
            body = await request.json()
        except ValueError:
            # Malformed JSON is a client error, not a server error.
            return JSONResponse({"error": "invalid JSON body"}, status_code=400)
        source = body.get("source", "") if isinstance(body, dict) else ""
        if not source:
            return JSONResponse({"error": "source required"}, status_code=400)

        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute("SELECT filepath FROM ingest_failures WHERE source = %s", (source,))
            row = cur.fetchone()
        finally:
            pg.close()
        if not row:
            return JSONResponse({"error": "source not found in failures"}, status_code=404)

        filepath = Path(row[0])
        if not filepath.exists():
            return JSONResponse({"error": f"file not found: {filepath}"}, status_code=404)

        try:
            # Document parsing is blocking work; keep it off the event loop.
            loop = asyncio.get_running_loop()
            text = await loop.run_in_executor(None, _extract_retry_text, filepath)
        except Exception as e:
            return JSONResponse({"error": f"extraction failed: {e}"}, status_code=500)
        if not text.strip():
            return JSONResponse({"error": "file produces empty text — may be corrupt"}, status_code=422)

        pg = get_pg()
        try:
            cur = pg.cursor()
            # Stored text is truncated to 50000 characters; char_length keeps
            # the length of the full extracted text.
            cur.execute("""
                INSERT INTO stage_2_queue (source, full_text, char_length)
                VALUES (%s, %s, %s)
                ON CONFLICT (source) DO UPDATE SET
                    full_text = EXCLUDED.full_text, char_length = EXCLUDED.char_length,
                    enqueued_at = NOW(), completed_at = NULL, failed_at = NULL, attempts = 0
            """, (source, text[:50000], len(text)))
            cur.execute("""
                UPDATE ingest_failures SET retry_count = retry_count + 1, last_failed_at = NOW()
                WHERE source = %s
            """, (source,))
            pg.commit()
        finally:
            pg.close()
        return JSONResponse({"queued": True, "source": source})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/corpus/reconcile")
async def corpus_reconcile(request: Request, background_tasks: BackgroundTasks, auth: str = Depends(require_auth)):
    """Start the corpus integrity script as a background task.

    The optional JSON body may carry ``fix`` (default True); when truthy the
    script is run with ``--fix``. The response is returned immediately with
    ``{"started": True, "fix": fix}``; the script outcome is only printed.
    """
    try:
        body = await request.json()
        fix = body.get("fix", True)
    except Exception:
        # Missing or unparsable body: fall back to the default.
        fix = True

    def run_reconcile():
        """Run the script (5 minute timeout) and report any failure."""
        try:
            cmd = [PYTHON, CORPUS_INTEGRITY_SCRIPT]
            if fix:
                cmd.append("--fix")
            result = subprocess.run(cmd, cwd=str(Path.home() / "aaronai"), timeout=300)
            # subprocess.run does not raise on a non-zero exit by itself.
            if result.returncode != 0:
                print(f"Reconciliation failed: exit code {result.returncode}")
        except Exception as e:
            print(f"Reconciliation failed: {e}")

    background_tasks.add_task(run_reconcile)
    return JSONResponse({"started": True, "fix": fix})
|
||||||
|
|
||||||
# ─── Scheduler ──────────────────────────────────────────────────────────────
|
# ─── Scheduler ──────────────────────────────────────────────────────────────
|
||||||
scheduler = BackgroundScheduler()
|
scheduler = BackgroundScheduler()
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,230 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
corpus_integrity.py — BirdAI Corpus Integrity Check
|
||||||
|
|
||||||
|
Compares three sources of truth:
|
||||||
|
1. Filesystem (Nextcloud) — what files exist
|
||||||
|
2. pgvector (embeddings table) — what's been through Stage 1
|
||||||
|
3. Graphiti (migration state + stage_3_queue) — what's been through Stage 3
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 corpus_integrity.py # report only
|
||||||
|
python3 corpus_integrity.py --fix # report + auto-queue gaps for retry
|
||||||
|
python3 corpus_integrity.py --json # output JSON to stdout
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Project directory that holds the .env file, the migration state and the report.
_PROJECT_DIR = Path.home() / "aaronai"

# Values from the project .env override anything already in the environment.
load_dotenv(_PROJECT_DIR / ".env", override=True)

# Root directory scanned for corpus files.
NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
MIGRATION_STATE = str(_PROJECT_DIR / "experiments" / "tier1_migration_state.json")
REPORT_PATH = str(_PROJECT_DIR / "corpus_integrity_report.json")
# Lower-cased file suffixes that count as corpus documents.
SUPPORTED = {".pdf", ".docx", ".pptx", ".txt", ".md"}
# Connection string handed to psycopg2.connect(); None when PG_DSN is unset.
PG_DSN = os.environ.get("PG_DSN")
|
||||||
|
|
||||||
|
|
||||||
|
def get_pg():
    """Open and return a new psycopg2 connection built from PG_DSN.

    The caller owns the connection and is responsible for closing it.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||||
|
|
||||||
|
|
||||||
|
def get_filesystem_files():
    """List the corpus documents found under NEXTCLOUD_PATH.

    A path is included when it is a regular file with a suffix in SUPPORTED,
    its name does not start with "." or "~$", and it is not under a Backups
    or Journal/Media location.

    Returns:
        list of dicts with keys ``source`` (file name), ``filepath`` (str),
        ``size`` (bytes) and ``mtime`` (timestamp from stat).
    """
    files = []
    for path in Path(NEXTCLOUD_PATH).rglob("*"):
        # Cheap name-based filters first; they need no filesystem access.
        if path.suffix.lower() not in SUPPORTED:
            continue
        if path.name.startswith((".", "~$")):
            continue
        if "Admin/Backups" in str(path) or "Backups" in path.parts:
            continue
        if "Journal/Media" in str(path):
            continue
        try:
            if not path.is_file():
                continue
            # One stat call supplies both size and mtime.
            info = path.stat()
        except OSError as e:
            # The file may vanish or be unreadable mid-scan; skip it rather
            # than abort the whole reconciliation.
            print(f"WARNING: cannot stat {path}: {e}", file=sys.stderr)
            continue
        files.append({"source": path.name, "filepath": str(path),
                      "size": info.st_size, "mtime": info.st_mtime})
    return files
|
||||||
|
|
||||||
|
|
||||||
|
def get_pgvector_sources():
    """Return the set of distinct non-NULL sources in the embeddings table.

    On any database error the problem is printed to stderr and an empty set
    is returned.
    """
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute("SELECT DISTINCT source FROM embeddings WHERE source IS NOT NULL")
            return {row[0] for row in cur.fetchall()}
        finally:
            # Close the connection even when the query raises.
            pg.close()
    except Exception as e:
        print(f"ERROR: pgvector: {e}", file=sys.stderr)
        return set()
|
||||||
|
|
||||||
|
|
||||||
|
def get_graphiti_sources():
    """Return the set of sources considered ingested into Graphiti.

    Combines the file names listed under "ingested" in the migration state
    file with the sources of completed rows in stage_3_queue. Each half is
    best-effort: a failure is printed to stderr and the other half is still
    used.
    """
    sources = set()
    try:
        state_path = Path(MIGRATION_STATE)
        if state_path.exists():
            state = json.loads(state_path.read_text())
            for filepath in state.get("ingested", []):
                sources.add(Path(filepath).name)
    except Exception as e:
        print(f"WARNING: migration state: {e}", file=sys.stderr)
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute("SELECT DISTINCT source FROM stage_3_queue WHERE completed_at IS NOT NULL")
            # A NULL source is not a document name; keep it out of the set.
            sources.update(row[0] for row in cur.fetchall() if row[0] is not None)
        finally:
            # Close the connection even when the query raises.
            pg.close()
    except Exception as e:
        print(f"WARNING: stage_3_queue: {e}", file=sys.stderr)
    return sources
|
||||||
|
|
||||||
|
|
||||||
|
def get_ingest_failures():
    """Return unresolved ingest failures keyed by source.

    Each value is a dict with ``source``, ``filepath``, ``error``,
    ``retry_count``, ``first_failed_at`` and ``last_failed_at`` (timestamps
    rendered with str()). On a database error a warning is printed to stderr
    and whatever was collected so far is returned.
    """
    failures = {}
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute("""
                SELECT source, filepath, error, retry_count, first_failed_at, last_failed_at
                FROM ingest_failures WHERE resolved = FALSE ORDER BY last_failed_at DESC
            """)
            for row in cur.fetchall():
                failures[row[0]] = {"source": row[0], "filepath": row[1], "error": row[2],
                                    "retry_count": row[3], "first_failed_at": str(row[4]),
                                    "last_failed_at": str(row[5])}
        finally:
            # Close the connection even when the query raises.
            pg.close()
    except Exception as e:
        print(f"WARNING: ingest_failures: {e}", file=sys.stderr)
    return failures
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_for_retry(filepath):
    """Extract plain text from a document so it can be re-queued.

    Args:
        filepath: path (str or Path) of a .docx, .pdf, .pptx, .txt or .md file.

    Returns:
        The extracted text, or "" when the suffix is unsupported or
        extraction raises (the error is printed to stderr).
    """
    path = Path(filepath)
    suffix = path.suffix.lower()
    try:
        if suffix == ".docx":
            from docx import Document as D
            return "\n".join(p.text for p in D(path).paragraphs if p.text.strip())
        if suffix == ".pdf":
            from pypdf import PdfReader
            # Extract each page once; empty/None pages are skipped.
            page_texts = (page.extract_text() for page in PdfReader(path).pages)
            return "".join(page_text + "\n" for page_text in page_texts if page_text)
        if suffix == ".pptx":
            from pptx import Presentation
            prs = Presentation(path)
            return "\n".join(shape.text for slide in prs.slides for shape in slide.shapes
                             if hasattr(shape, "text") and shape.text.strip())
        if suffix in {".txt", ".md"}:
            return path.read_text(encoding="utf-8", errors="ignore")
    except Exception as e:
        print(f"WARNING: extraction failed {path.name}: {e}", file=sys.stderr)
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def queue_for_retry(source, full_text, filepath):
    """Upsert a document into stage_2_queue so it is processed again.

    Args:
        source: queue key (the file name).
        full_text: extracted text; at most the first 50000 characters are
            stored, while char_length records the full length.
        filepath: accepted for the caller's convenience; not used here.

    Returns:
        True when the row was committed, False when the database operation
        failed (a warning is printed to stderr).
    """
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute("""
                INSERT INTO stage_2_queue (source, full_text, char_length)
                VALUES (%s, %s, %s)
                ON CONFLICT (source) DO UPDATE SET
                    full_text = EXCLUDED.full_text, char_length = EXCLUDED.char_length,
                    enqueued_at = NOW(), completed_at = NULL, failed_at = NULL, attempts = 0
            """, (source, full_text[:50000], len(full_text)))
            pg.commit()
        finally:
            # Close the connection even when the statement raises.
            pg.close()
        return True
    except Exception as e:
        print(f"WARNING: queue failed {source}: {e}", file=sys.stderr)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def run_reconciliation(fix=False):
    """Compare filesystem, pgvector and Graphiti and write a JSON report.

    Files are matched by file name (``source``). Each filesystem file is
    classified as present in both stores, pgvector only, Graphiti only, or
    neither (a corpus gap). Sources known to a store but absent from the
    filesystem are counted as orphans.

    Args:
        fix: when true, text is extracted from every gap file and the file
            is queued for retry; unreadable files are skipped.

    Returns:
        The report dict that was also written to REPORT_PATH.
    """
    print(f"BirdAI Corpus Integrity Check — {datetime.now().isoformat()}")
    print()
    print("Scanning filesystem...")
    fs_files = get_filesystem_files()
    # Keyed by file name, so same-named files in different folders collapse.
    fs_sources = {f["source"]: f for f in fs_files}
    print(f" Filesystem: {len(fs_files)} files")
    print("Querying pgvector...")
    pv_sources = get_pgvector_sources()
    print(f" pgvector: {len(pv_sources)} distinct sources")
    print("Querying Graphiti...")
    gr_sources = get_graphiti_sources()
    print(f" Graphiti: {len(gr_sources)} sources")
    print("Querying ingest failures...")
    failures = get_ingest_failures()
    print(f" Failures: {len(failures)} unresolved")
    print()

    both, pv_only, neither, gr_only = [], [], [], []
    for source, finfo in fs_sources.items():
        in_pv = source in pv_sources
        in_gr = source in gr_sources
        if in_pv and in_gr:
            both.append(finfo)
        elif in_pv:
            pv_only.append(finfo)
        elif in_gr:
            gr_only.append(finfo)
        else:
            neither.append(finfo)

    # dict key views support set difference directly.
    orphans_pv = pv_sources - fs_sources.keys()
    orphans_gr = gr_sources - fs_sources.keys()

    print("Results:")
    print(f" Both (pgvector + Graphiti): {len(both)}")
    print(f" pgvector only: {len(pv_only)}")
    print(f" Neither (corpus gap): {len(neither)}")
    print(f" Graphiti only: {len(gr_only)}")
    print(f" Ingest failures: {len(failures)}")
    print(f" pgvector orphans: {len(orphans_pv)}")
    print(f" Graphiti orphans: {len(orphans_gr)}")
    print()

    auto_queued = []
    if fix and neither:
        print(f"Auto-queuing {len(neither)} gap files...")
        for finfo in neither:
            text = extract_text_for_retry(finfo["filepath"])
            if text.strip():
                if queue_for_retry(finfo["source"], text, finfo["filepath"]):
                    auto_queued.append(finfo["source"])
                    print(f" Queued: {finfo['source']}")
            else:
                print(f" Skipped (unreadable): {finfo['source']}")
        print()

    report = {
        "timestamp": datetime.now().isoformat(),
        "summary": {
            "filesystem_total": len(fs_files), "pgvector_total": len(pv_sources),
            "graphiti_total": len(gr_sources), "both": len(both),
            "pgvector_only": len(pv_only), "neither": len(neither),
            "graphiti_only": len(gr_only), "failures": len(failures),
            "orphans_pgvector": len(orphans_pv), "orphans_graphiti": len(orphans_gr),
        },
        "gaps": [f["source"] for f in neither],
        "failures": list(failures.values()),
        "auto_queued": auto_queued,
        "pgvector_only_sample": [f["source"] for f in pv_only[:20]],
        "graphiti_only": list(gr_only),
    }
    # Write to a sibling temp file and swap it in, so a concurrent reader of
    # REPORT_PATH never sees a half-written report.
    tmp_path = REPORT_PATH + ".tmp"
    Path(tmp_path).write_text(json.dumps(report, indent=2))
    os.replace(tmp_path, REPORT_PATH)
    print(f"Report written to: {REPORT_PATH}")
    return report
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Flags:
        --fix   queue gap files for retry in addition to reporting.
        --json  print the report as JSON on stdout; the human-readable
                progress output is sent to stderr so stdout stays parseable.
    """
    import contextlib

    parser = argparse.ArgumentParser()
    parser.add_argument("--fix", action="store_true")
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()
    if args.json:
        # Keep progress text out of stdout so it holds only the JSON document.
        with contextlib.redirect_stdout(sys.stderr):
            report = run_reconciliation(fix=args.fix)
        print(json.dumps(report, indent=2))
    else:
        run_reconciliation(fix=args.fix)


if __name__ == "__main__":
    main()
|
||||||
@@ -99,6 +99,7 @@ def extract_text(path: Path) -> str:
|
|||||||
return path.read_text(encoding="utf-8", errors="ignore")
|
return path.read_text(encoding="utf-8", errors="ignore")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"Text extraction failed for {path.name}: {e}")
|
log.warning(f"Text extraction failed for {path.name}: {e}")
|
||||||
|
record_ingest_failure(path, f"Text extraction failed: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
@@ -141,6 +142,38 @@ def enqueue_stage2(source: str, full_text: str):
|
|||||||
log.warning(f"Stage 2 enqueue failed (non-fatal): {e}")
|
log.warning(f"Stage 2 enqueue failed (non-fatal): {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def record_ingest_failure(filepath: Path, error: str):
    """Write extraction or ingest failure to ingest_failures table for UI visibility.

    The row is keyed by the file name. A repeated failure for the same name
    replaces the stored error, increments retry_count, refreshes
    last_failed_at and marks the row unresolved again. The stored error is
    truncated to 1000 characters. Any database problem is logged as a
    warning and never raised.
    """
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute("""
                INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at)
                VALUES (%s, %s, %s, 0, NOW(), NOW())
                ON CONFLICT (source) DO UPDATE SET
                    error = EXCLUDED.error,
                    retry_count = ingest_failures.retry_count + 1,
                    last_failed_at = NOW(),
                    resolved = FALSE
            """, (filepath.name, str(filepath), error[:1000]))
            pg.commit()
        finally:
            # Close the connection even when the statement raises.
            pg.close()
    except Exception as e:
        log.warning(f"Could not record ingest failure (non-fatal): {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_ingest_failure(source: str):
    """Mark a previously failed file as resolved after successful ingest.

    Sets resolved = TRUE on the ingest_failures row for ``source``; a source
    with no failure row is a no-op. Any database problem is logged as a
    warning and never raised.
    """
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute("UPDATE ingest_failures SET resolved = TRUE WHERE source = %s", (source,))
            pg.commit()
        finally:
            # Close the connection even when the statement raises.
            pg.close()
    except Exception as e:
        log.warning(f"Could not resolve ingest failure record (non-fatal): {e}")
|
||||||
|
|
||||||
|
|
||||||
def ingest_file(filepath: Path, embedder) -> int:
|
def ingest_file(filepath: Path, embedder) -> int:
|
||||||
if filepath.name.startswith(("~$", ".")):
|
if filepath.name.startswith(("~$", ".")):
|
||||||
return 0
|
return 0
|
||||||
@@ -156,6 +189,7 @@ def ingest_file(filepath: Path, embedder) -> int:
|
|||||||
embeddings = embedder.encode(chunks).tolist()
|
embeddings = embedder.encode(chunks).tolist()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Embedding failed for {filepath.name}: {e}")
|
log.error(f"Embedding failed for {filepath.name}: {e}")
|
||||||
|
record_ingest_failure(filepath, f"Embedding failed: {e}")
|
||||||
return 0
|
return 0
|
||||||
source = filepath.name
|
source = filepath.name
|
||||||
try:
|
try:
|
||||||
@@ -177,8 +211,10 @@ def ingest_file(filepath: Path, embedder) -> int:
|
|||||||
pg.close()
|
pg.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"pgvector write failed for {filepath.name}: {e}")
|
log.error(f"pgvector write failed for {filepath.name}: {e}")
|
||||||
|
record_ingest_failure(filepath, f"pgvector write failed: {e}")
|
||||||
return 0
|
return 0
|
||||||
log.info(f"Indexed {len(chunks)} chunks: {filepath.name}")
|
log.info(f"Indexed {len(chunks)} chunks: {filepath.name}")
|
||||||
|
resolve_ingest_failure(source)
|
||||||
enqueue_stage2(source, text)
|
enqueue_stage2(source, text)
|
||||||
return len(chunks)
|
return len(chunks)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user