corpus integrity: ingest_failures tracking in watcher, reconciliation script, corpus status/retry/reconcile endpoints
This commit is contained in:
+173
@@ -957,6 +957,179 @@ async def clear_all_conversations(auth: str = Depends(require_auth)):
|
||||
return JSONResponse({"cleared": True})
|
||||
|
||||
|
||||
|
||||
# ─── Corpus Integrity Endpoints ─────────────────────────────────────────────
|
||||
|
||||
# Path (as a string) of the reconciliation script that the
# /api/corpus/reconcile endpoint launches in a subprocess.
CORPUS_INTEGRITY_SCRIPT = str(
    Path.home().joinpath("aaronai", "scripts", "corpus_integrity.py")
)
|
||||
# JSON report that get_corpus_status_data() reads to summarise the last
# reconciliation run (presumably written by the integrity script — confirm).
CORPUS_REPORT_PATH = Path.home().joinpath(
    "aaronai", "corpus_integrity_report.json"
)
|
||||
# Lower-cased file suffixes that count as corpus documents when the
# filesystem is scanned.
SUPPORTED_EXTS = {
    ".pdf",
    ".docx",
    ".pptx",
    ".txt",
    ".md",
}
|
||||
# JSON state file whose "ingested" list of file paths is folded into the
# graphiti source count by get_corpus_status_data().
MIGRATION_STATE_PATH = Path.home().joinpath(
    "aaronai", "experiments", "tier1_migration_state.json"
)
|
||||
|
||||
|
||||
def _corpus_fetch_rows(sql):
    """Run *sql* on a fresh connection from ``get_pg()`` and return all rows.

    The connection is closed in a ``finally`` block so that a failing query
    cannot leak it.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()
        cur.execute(sql)
        return cur.fetchall()
    finally:
        pg.close()


def _count_corpus_files():
    """Count supported documents under ``NEXTCLOUD_PATH`` (best effort)."""
    count = 0
    try:
        for path in Path(NEXTCLOUD_PATH).rglob("*"):
            if not path.is_file() or path.suffix.lower() not in SUPPORTED_EXTS:
                continue
            # Skip hidden files and Office lock files ("~$name.docx").
            if path.name.startswith((".", "~$")):
                continue
            if "Admin/Backups" in str(path) or "Backups" in path.parts:
                continue
            if "Journal/Media" in str(path):
                continue
            count += 1
    except Exception as e:
        # Keep whatever was counted before the failure, but do not hide it.
        print(f"Corpus status: filesystem scan failed: {e}")
    return count


def _count_pgvector_sources():
    """Return the number of distinct non-NULL sources in ``embeddings``."""
    try:
        rows = _corpus_fetch_rows(
            "SELECT COUNT(DISTINCT source) FROM embeddings WHERE source IS NOT NULL"
        )
        return rows[0][0]
    except Exception as e:
        print(f"Corpus status: pgvector count failed: {e}")
        return 0


def _collect_graphiti_sources():
    """Collect source names from the migration state file and stage_3_queue."""
    sources = set()
    try:
        if MIGRATION_STATE_PATH.exists():
            state = json.loads(MIGRATION_STATE_PATH.read_text())
            # The state file lists full paths; only the file name is kept.
            sources.update(Path(fp).name for fp in state.get("ingested", []))
    except Exception as e:
        print(f"Corpus status: migration state read failed: {e}")
    try:
        rows = _corpus_fetch_rows(
            "SELECT DISTINCT source FROM stage_3_queue WHERE completed_at IS NOT NULL"
        )
        # A NULL source must not be counted as a document (the embeddings
        # count excludes NULLs as well).
        sources.update(row[0] for row in rows if row[0] is not None)
    except Exception as e:
        print(f"Corpus status: stage_3_queue query failed: {e}")
    return sources


def _fetch_unresolved_failures():
    """Return up to 50 unresolved ``ingest_failures`` rows, newest first."""
    try:
        rows = _corpus_fetch_rows("""
            SELECT source, filepath, error, retry_count, first_failed_at, last_failed_at
            FROM ingest_failures WHERE resolved = FALSE
            ORDER BY last_failed_at DESC LIMIT 50
        """)
    except Exception as e:
        print(f"Corpus status: ingest_failures query failed: {e}")
        return []
    return [
        {
            "source": row[0], "filepath": row[1], "error": row[2],
            # Timestamps are stringified so the payload is JSON-serialisable.
            "retry_count": row[3], "first_failed_at": str(row[4]),
            "last_failed_at": str(row[5]),
        }
        for row in rows
    ]


def _load_last_reconciliation():
    """Summarise the report at ``CORPUS_REPORT_PATH``; ``None`` if unavailable."""
    try:
        if not CORPUS_REPORT_PATH.exists():
            return None
        report = json.loads(CORPUS_REPORT_PATH.read_text())
        return {
            "timestamp": report.get("timestamp"),
            "gaps": report.get("summary", {}).get("neither", 0),
            "auto_queued": len(report.get("auto_queued", [])),
        }
    except Exception as e:
        print(f"Corpus status: reconciliation report read failed: {e}")
        return None


def get_corpus_status_data():
    """Gather a best-effort snapshot of corpus integrity.

    Each data source is queried independently; a failure in one is logged and
    replaced by its empty default (``0``, ``[]`` or ``None``) so the others
    are still reported.

    Returns:
        A dict with the keys:
        ``filesystem`` (number of supported files under ``NEXTCLOUD_PATH``),
        ``pgvector`` (distinct sources in ``embeddings``),
        ``graphiti`` (distinct names from the migration state file plus
        completed ``stage_3_queue`` sources),
        ``failures`` (list of unresolved ingest failures, at most 50),
        ``failure_count`` (length of ``failures``), and
        ``last_reconciliation`` (summary of the last report, or ``None``).
    """
    failures = _fetch_unresolved_failures()
    return {
        "filesystem": _count_corpus_files(),
        "pgvector": _count_pgvector_sources(),
        "graphiti": len(_collect_graphiti_sources()),
        "failures": failures,
        "failure_count": len(failures),
        "last_reconciliation": _load_last_reconciliation(),
    }
|
||||
|
||||
|
||||
@app.get("/api/corpus/status")
async def corpus_status(auth: str = Depends(require_auth)):
    """Return the corpus integrity snapshot as JSON.

    ``get_corpus_status_data`` walks the filesystem and runs several blocking
    database queries, so it is executed in the default thread-pool executor
    instead of directly on the event loop.

    Returns:
        A ``JSONResponse`` with the status dict, or ``{"error": ...}`` with
        status 500 if building or serialising the snapshot raises.
    """
    # Function-scope import: the module's import block is not visible here.
    import asyncio

    try:
        loop = asyncio.get_running_loop()
        data = await loop.run_in_executor(None, get_corpus_status_data)
        return JSONResponse(data)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
|
||||
|
||||
|
||||
def _extract_retry_text(filepath):
    """Extract plain text from *filepath* according to its suffix.

    Handles ``.txt``/``.md``, ``.pdf``, ``.docx`` and ``.pptx``; any other
    suffix yields an empty string. Parser exceptions propagate to the caller.
    """
    suffix = filepath.suffix.lower()
    if suffix in {".txt", ".md"}:
        return filepath.read_text(encoding="utf-8", errors="ignore")
    if suffix == ".pdf":
        from pypdf import PdfReader
        # Extract each page exactly once; pages with no text are skipped.
        page_texts = (page.extract_text() for page in PdfReader(filepath).pages)
        return "".join(page_text + "\n" for page_text in page_texts if page_text)
    if suffix == ".docx":
        from docx import Document as DocxDocument
        return "\n".join(
            p.text for p in DocxDocument(filepath).paragraphs if p.text.strip()
        )
    if suffix == ".pptx":
        from pptx import Presentation
        return "\n".join(
            shape.text
            for slide in Presentation(filepath).slides
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        )
    return ""


@app.post("/api/corpus/retry")
async def corpus_retry(request: Request, auth: str = Depends(require_auth)):
    """Re-queue a previously failed document for ingestion.

    Expects a JSON object body with a ``source`` key. The file path recorded
    in ``ingest_failures`` for that source is re-read, its text is extracted,
    and the text is upserted into ``stage_2_queue``; the failure row's
    ``retry_count`` and ``last_failed_at`` are updated in the same transaction.

    Returns:
        ``{"queued": True, "source": ...}`` on success, otherwise an
        ``{"error": ...}`` payload with status 400 (invalid body or missing
        source), 404 (unknown source or missing file), 422 (no extractable
        text) or 500 (extraction or database failure).
    """
    try:
        try:
            body = await request.json()
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors:
            # a client error, not a server fault.
            return JSONResponse({"error": "invalid JSON body"}, status_code=400)
        source = body.get("source", "") if isinstance(body, dict) else ""
        if not source:
            return JSONResponse({"error": "source required"}, status_code=400)

        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute("SELECT filepath FROM ingest_failures WHERE source = %s", (source,))
            row = cur.fetchone()
        finally:
            # Close even when the lookup raises, so the connection is not leaked.
            pg.close()
        if not row:
            return JSONResponse({"error": "source not found in failures"}, status_code=404)

        filepath = Path(row[0])
        if not filepath.exists():
            return JSONResponse({"error": f"file not found: {filepath}"}, status_code=404)

        try:
            text = _extract_retry_text(filepath)
        except Exception as e:
            return JSONResponse({"error": f"extraction failed: {e}"}, status_code=500)
        if not text.strip():
            return JSONResponse({"error": "file produces empty text — may be corrupt"}, status_code=422)

        pg = get_pg()
        try:
            cur = pg.cursor()
            # Stored text is capped at 50,000 characters; char_length records
            # the full, untruncated length.
            cur.execute("""
                INSERT INTO stage_2_queue (source, full_text, char_length)
                VALUES (%s, %s, %s)
                ON CONFLICT (source) DO UPDATE SET
                    full_text = EXCLUDED.full_text, char_length = EXCLUDED.char_length,
                    enqueued_at = NOW(), completed_at = NULL, failed_at = NULL, attempts = 0
            """, (source, text[:50000], len(text)))
            cur.execute("""
                UPDATE ingest_failures SET retry_count = retry_count + 1, last_failed_at = NOW()
                WHERE source = %s
            """, (source,))
            pg.commit()
        finally:
            pg.close()
        return JSONResponse({"queued": True, "source": source})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
|
||||
|
||||
|
||||
@app.post("/api/corpus/reconcile")
async def corpus_reconcile(request: Request, background_tasks: BackgroundTasks, auth: str = Depends(require_auth)):
    """Start the corpus reconciliation script as a background task.

    The optional JSON body may carry a ``fix`` flag (default ``True``); when
    it is truthy the script is invoked with ``--fix``. The response is sent
    immediately and does not wait for the script to finish.

    Returns:
        ``{"started": True, "fix": <flag>}``.
    """
    try:
        body = await request.json()
        fix = body.get("fix", True)
    except Exception:
        # Missing, malformed, or non-object body: fall back to the default.
        fix = True

    def run_reconcile():
        """Run the integrity script and report any failure."""
        try:
            cmd = [PYTHON, CORPUS_INTEGRITY_SCRIPT]
            if fix:
                cmd.append("--fix")
            # Capped at 300 seconds; a timeout raises and is reported below.
            result = subprocess.run(cmd, cwd=str(Path.home() / "aaronai"), timeout=300)
        except Exception as e:
            print(f"Reconciliation failed: {e}")
            return
        # subprocess.run does not raise on a non-zero exit unless check=True,
        # so a failing script must be reported explicitly.
        if result.returncode != 0:
            print(f"Reconciliation failed: exit code {result.returncode}")

    background_tasks.add_task(run_reconcile)
    return JSONResponse({"started": True, "fix": fix})
|
||||
|
||||
# ─── Scheduler ──────────────────────────────────────────────────────────────
|
||||
# Module-level scheduler instance, created at import time. Job registration
# and start-up are not visible in this part of the file.
scheduler = BackgroundScheduler()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user