embeddings: enforce type/created_at on writers; manifests carry type_distribution (Improvement #2 part B+C)
Writers now enforce type and created_at:
- encoding.py: ValueError raised at write_embeddings_batch if row dict lacks
'type'. created_at remains SQL-supplied (NOW() server-side). ON CONFLICT
DO UPDATE now also rewrites type=EXCLUDED.type and preserves the original
created_at via COALESCE(embeddings.created_at, EXCLUDED.created_at) — a
re-ingest re-classifies type but does not overwrite a backfilled mtime.
- ingest_conversations.py: same assertion. ON CONFLICT intentionally keeps
EXCLUDED.created_at semantics (Aaron-AI conversation created_at tracks
convo.updated_at; re-runs should refresh).
- Column-level NOT NULL is not added; application-layer raise gives a
faster, more debuggable failure than a Postgres constraint error.
Retrieval propagates type into chunks:
- retrieve() SELECT now includes type; chunk dicts carry "type": etype.
- WHERE clause built dynamically from excluded_sources and the new
--type-filter CLI arg (experimental, default None, pgvector retrieval
only — Graphiti chunks have no embeddings.type to filter on).
- retrieve_graphiti unchanged; its chunks lack the type field.
Manifests carry type_distribution per stage:
- dream_pipeline writes stage_data[<stage>]["type_distribution"] for nrem,
early_rem, late_rem — a Counter over chunk types, filtering None so
Graphiti chunks (when DREAMER_SUBSTRATE=graphiti) don't pollute the
distribution. Pgvector chunks always carry type post-backfill; if None
appears, the backfill or writer enforcement has regressed.
Verification:
B1 force re-ingest of "Finite and infinite games -- James Carse.pdf":
all 84 chunks preserved created_at=2026-04-27T06:11:55Z
B2 missing-type assertion raises ValueError, no row leaked to embeddings
B3 ast.parse(*) clean; EXPLAIN renders for {no excl/no filter,
type_filter only, excl 2 elems, excl 1 elem edge case, both};
all five plans use HNSW index scan with correct Filter clauses
C1 retrieve("nrem") returns 8 chunks each carrying "type" key
C2 type_distribution = {'document': 5, 'chatgpt_conversation': 3} —
2 distinct types, 62.5/37.5 split (looser bar: >=2 types,
no single type >=90%)
The type and created_at fields are now load-bearing: every dream manifest
emits type_distribution per stage. Reverting the backfill makes the
distribution show NULLs at every dream run.
This commit is contained in:
+44
-27
@@ -16,6 +16,7 @@ import os
|
||||
import json
|
||||
import sqlite3
|
||||
import argparse
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
from dotenv import load_dotenv
|
||||
@@ -282,9 +283,11 @@ def retrieve_graphiti(mode, task=None, n_results=8, excluded_sources=None):
|
||||
print(f"[Graphiti retrieval error: {e}] — falling back to empty.")
|
||||
return []
|
||||
|
||||
def retrieve(mode, task=None, n_results=8, excluded_sources=None):
|
||||
def retrieve(mode, task=None, n_results=8, excluded_sources=None, type_filter=None):
|
||||
# E3 experiment: DREAMER_SUBSTRATE=graphiti routes retrieval to Graphiti /search
|
||||
# Default behavior: pgvector similarity search (unchanged)
|
||||
# type_filter is experimental and applies to pgvector retrieval only — Graphiti
|
||||
# facts are not embeddings rows and have no embeddings.type to filter on.
|
||||
substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
|
||||
if substrate == "graphiti":
|
||||
return retrieve_graphiti(mode, task=task, n_results=n_results, excluded_sources=excluded_sources)
|
||||
@@ -311,23 +314,23 @@ def retrieve(mode, task=None, n_results=8, excluded_sources=None):
|
||||
pg = get_pg()
|
||||
cur = pg.cursor()
|
||||
excluded_sources = excluded_sources or set()
|
||||
where, params = [], []
|
||||
if excluded_sources:
|
||||
cur.execute("""
|
||||
SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
|
||||
FROM embeddings
|
||||
WHERE source NOT IN %s
|
||||
ORDER BY embedding <=> %s::vector
|
||||
LIMIT %s
|
||||
""", (embedding, tuple(excluded_sources), embedding, n_results * 3))
|
||||
else:
|
||||
cur.execute("""
|
||||
SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
|
||||
FROM embeddings
|
||||
ORDER BY embedding <=> %s::vector
|
||||
LIMIT %s
|
||||
""", (embedding, embedding, n_results * 3))
|
||||
where.append("source NOT IN %s")
|
||||
params.append(tuple(excluded_sources))
|
||||
if type_filter:
|
||||
where.append("type = ANY(%s)")
|
||||
params.append(list(type_filter))
|
||||
where_clause = ("WHERE " + " AND ".join(where)) if where else ""
|
||||
cur.execute(f"""
|
||||
SELECT document, source, type, 1 - (embedding <=> %s::vector) as similarity
|
||||
FROM embeddings
|
||||
{where_clause}
|
||||
ORDER BY embedding <=> %s::vector
|
||||
LIMIT %s
|
||||
""", [embedding, *params, embedding, n_results * 3])
|
||||
|
||||
for doc, source, similarity in cur.fetchall():
|
||||
for doc, source, etype, similarity in cur.fetchall():
|
||||
if not (low <= similarity <= high):
|
||||
continue
|
||||
if source in seen_sources:
|
||||
@@ -337,6 +340,7 @@ def retrieve(mode, task=None, n_results=8, excluded_sources=None):
|
||||
"content": doc,
|
||||
"relevance": similarity,
|
||||
"similarity": similarity,
|
||||
"type": etype,
|
||||
})
|
||||
seen_sources.add(source)
|
||||
if len(chunks) >= n_results:
|
||||
@@ -482,7 +486,7 @@ def write_manifest(date_str, stage_data, corpus_data):
|
||||
print(f"Manifest write failed (non-critical): {e}")
|
||||
|
||||
|
||||
def dream_pipeline():
|
||||
def dream_pipeline(type_filter=None):
|
||||
"""
|
||||
Full nightly pipeline — interdependent stages.
|
||||
NREM output feeds Early REM. Both feed Late REM. All three feed Synthesis.
|
||||
@@ -501,7 +505,7 @@ def dream_pipeline():
|
||||
print("\n[NREM] Retrieving...")
|
||||
# NREM is replay-and-consolidation — does not exclude prior traces.
|
||||
# Late REM and Early REM exclude prior content for novelty; NREM does not.
|
||||
nrem_chunks = retrieve("nrem", excluded_sources=None)
|
||||
nrem_chunks = retrieve("nrem", excluded_sources=None, type_filter=type_filter)
|
||||
session_retrieved.update(c["source"] for c in nrem_chunks)
|
||||
# Track sources that scored above Early REM ceiling — these are the only ones Early REM should exclude
|
||||
nrem_high_sources = {c["source"] for c in nrem_chunks if c["similarity"] > 0.55}
|
||||
@@ -523,6 +527,10 @@ def dream_pipeline():
|
||||
"sources": nrem_sources,
|
||||
"distinct_folders": nrem_folders,
|
||||
"folder_count": len(nrem_folders),
|
||||
# Counter filters None: Graphiti chunks lack `type` (facts, not embeddings rows).
|
||||
# Pgvector chunks always carry type post-Improvement-#2 backfill. If type
|
||||
# ever appears as None here, the backfill or writer enforcement has regressed.
|
||||
"type_distribution": dict(Counter(c.get("type") for c in nrem_chunks if c.get("type"))),
|
||||
"status": "ok",
|
||||
}
|
||||
}
|
||||
@@ -532,7 +540,7 @@ def dream_pipeline():
|
||||
print("\n[Early REM] Retrieving...")
|
||||
# Early REM excludes previously retrieved + NREM high-scorers only (not full session_retrieved)
|
||||
# Sources that scored in Early REM band during NREM remain available
|
||||
early_chunks = retrieve("early-rem", excluded_sources=nrem_high_sources)
|
||||
early_chunks = retrieve("early-rem", excluded_sources=nrem_high_sources, type_filter=type_filter)
|
||||
session_retrieved.update(c["source"] for c in early_chunks)
|
||||
if not early_chunks:
|
||||
print("[Early REM] No suitable chunks — skipping")
|
||||
@@ -551,13 +559,14 @@ def dream_pipeline():
|
||||
"sources": early_sources,
|
||||
"distinct_folders": early_folders,
|
||||
"folder_count": len(early_folders),
|
||||
"type_distribution": dict(Counter(c.get("type") for c in early_chunks if c.get("type"))),
|
||||
"status": "ok",
|
||||
}
|
||||
print(f"[Early REM] Done.\n{early_rem_output[:200]}...")
|
||||
|
||||
# ── Stage 3: Late REM — informed by NREM + Early REM ──────────────────
|
||||
print("\n[Late REM] Retrieving...")
|
||||
late_chunks = retrieve("late-rem", excluded_sources=session_retrieved)
|
||||
late_chunks = retrieve("late-rem", excluded_sources=session_retrieved, type_filter=type_filter)
|
||||
session_retrieved.update(c["source"] for c in late_chunks)
|
||||
if not late_chunks:
|
||||
print("[Late REM] No suitable chunks — skipping")
|
||||
@@ -582,6 +591,7 @@ def dream_pipeline():
|
||||
"distinct_folders": list(set(late_folders)),
|
||||
"folder_count": len(set(late_folders)),
|
||||
"cross_domain_pairs": cross_domain_pairs,
|
||||
"type_distribution": dict(Counter(c.get("type") for c in late_chunks if c.get("type"))),
|
||||
"status": "ok",
|
||||
}
|
||||
print(f"[Late REM] Done.\n{late_rem_output[:200]}...")
|
||||
@@ -628,10 +638,10 @@ def dream_pipeline():
|
||||
return synthesis_file
|
||||
|
||||
|
||||
def dream_lucid(task):
|
||||
def dream_lucid(task, type_filter=None):
|
||||
"""On-demand lucid dream — single mode, used by Dream Now in settings."""
|
||||
print(f"Lucid dream starting — task: {task[:80] if task else 'none'}")
|
||||
chunks = retrieve("lucid", task=task)
|
||||
chunks = retrieve("lucid", task=task, type_filter=type_filter)
|
||||
if not chunks:
|
||||
print("No suitable chunks — aborting")
|
||||
return None
|
||||
@@ -653,13 +663,13 @@ def dream_lucid(task):
|
||||
return filepath
|
||||
|
||||
|
||||
def dream_single(mode, task=None):
|
||||
def dream_single(mode, task=None, type_filter=None):
|
||||
"""
|
||||
Single mode — used by Dream Now for non-lucid modes.
|
||||
Runs one stage independently (for testing/tuning individual stages).
|
||||
"""
|
||||
print(f"Single mode dream: {mode}")
|
||||
chunks = retrieve(mode, task=task)
|
||||
chunks = retrieve(mode, task=task, type_filter=type_filter)
|
||||
if not chunks:
|
||||
print("No suitable chunks — aborting")
|
||||
return None
|
||||
@@ -696,12 +706,19 @@ if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Aaron AI Dreamer")
|
||||
parser.add_argument("--mode", choices=["nrem", "early-rem", "late-rem", "lucid", "pipeline"])
|
||||
parser.add_argument("--task", type=str)
|
||||
parser.add_argument(
|
||||
"--type-filter", type=str, default=None,
|
||||
help="Comma-separated embeddings.type allowlist (e.g. 'document,aaronai_conversation'). "
|
||||
"Applies to pgvector retrieval only; Graphiti chunks are not filtered. "
|
||||
"Experimental — default is no filter, no behavior change.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
type_filter = [t.strip() for t in args.type_filter.split(",")] if args.type_filter else None
|
||||
|
||||
if args.mode == "lucid":
|
||||
dream_lucid(args.task or "What should I be thinking about that I am not?")
|
||||
dream_lucid(args.task or "What should I be thinking about that I am not?", type_filter=type_filter)
|
||||
elif args.mode and args.mode != "pipeline":
|
||||
dream_single(args.mode, args.task)
|
||||
dream_single(args.mode, args.task, type_filter=type_filter)
|
||||
else:
|
||||
# Default: full pipeline
|
||||
dream_pipeline()
|
||||
dream_pipeline(type_filter=type_filter)
|
||||
|
||||
Reference in New Issue
Block a user