feat: stage2/3 pipeline, taxonomy-free cascade, E1.8/E4 experiments, corpus migration state

This commit is contained in:
2026-04-30 04:04:31 +00:00
parent 62b5b5453a
commit 2b9a1782c1
14 changed files with 6145 additions and 5 deletions
+31 -3
View File
@@ -11,7 +11,7 @@ from docx import Document
from pypdf import PdfReader
from pptx import Presentation
load_dotenv(Path.home() / "aaronai" / ".env")
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
@@ -63,11 +63,34 @@ def make_id(filepath, chunk_index):
path_hash = hashlib.md5(str(filepath).encode()).hexdigest()[:8]
return f"{path_hash}_{chunk_index}"
def enqueue_stage2(source, full_text):
    """Enqueue a document for Stage 2 (Mistral orientation) → Stage 3 (Graphiti ingest).

    Upserts one row into ``stage_2_queue`` keyed on ``source``. Enqueueing a
    source that already has a row replaces its text and resets the row
    (``enqueued_at`` to now, ``completed_at``/``failed_at`` to NULL,
    ``attempts`` to 0).

    TEMPORARY: this queue feed will be removed when pgvector is decommissioned
    and the watcher calls Stage 2 directly.

    Args:
        source: Value written to the ``source`` column; also the conflict key.
        full_text: Document text. Only the first 50000 characters are stored,
            while ``char_length`` records the length of the untruncated text.

    Returns:
        None. This is best-effort: any failure is printed and swallowed so the
        caller is never interrupted by a queue problem.
    """
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            # Parameterized upsert; the SQL text is passed through unchanged.
            cur.execute("""
INSERT INTO stage_2_queue (source, full_text, char_length)
VALUES (%s, %s, %s)
ON CONFLICT (source) DO UPDATE SET
full_text = EXCLUDED.full_text,
char_length = EXCLUDED.char_length,
enqueued_at = NOW(),
completed_at = NULL,
failed_at = NULL,
attempts = 0
""", (source, full_text[:50000], len(full_text)))
            pg.commit()
        finally:
            # Release the connection even when the insert or commit raised;
            # previously a failure skipped close() and leaked the connection.
            pg.close()
    except Exception as e:
        # Deliberately non-fatal: report the problem and carry on.
        print(f" Stage 2 queue insert failed (non-fatal): {e}")
def ingest_file(filepath):
path = Path(filepath)
suffix = path.suffix.lower()
# Skip temp files
if path.name.startswith("~$") or path.name.startswith("."):
return 0
@@ -98,6 +121,7 @@ def ingest_file(filepath):
"folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent))
} for _ in chunks]
# STAGE 1: Write to pgvector (TEMPORARY — remove when chat agent migrates to Graphiti)
pg = get_pg()
cur = pg.cursor()
for chunk_id, chunk, embedding, meta in zip(ids, chunks, embeddings, metadatas):
@@ -111,12 +135,16 @@ def ingest_file(filepath):
metadata = EXCLUDED.metadata
""", (
chunk_id, chunk, embedding,
meta.get('source'), 'document', None,
meta.get("source"), "document", None,
json.dumps(meta)
))
pg.commit()
pg.close()
print(f" Indexed {len(chunks)} chunks: {path.name}")
# Enqueue for Stage 2 → Stage 3 (Graphiti pipeline)
enqueue_stage2(path.name, text)
return len(chunks)
except Exception as e: