dream.py: enrich manifest with retrieval breadth metrics

This commit is contained in:
2026-04-30 06:14:55 +00:00
parent 2b9a1782c1
commit b53717af5b
+34
View File
@@ -63,6 +63,11 @@ def prompt_hash(prompts: list[str]) -> str:
combined = "".join(prompts) combined = "".join(prompts)
return hashlib.md5(combined.encode()).hexdigest()[:8] return hashlib.md5(combined.encode()).hexdigest()[:8]
def extract_folder(source_path):
    """Extract top-level Nextcloud folder from source path.

    Backslashes are treated as path separators. Empty components (produced
    by leading or doubled separators) are skipped so that the first real
    path segment is returned.

    Args:
        source_path: Path string identifying where a chunk came from. May be
            empty or None.

    Returns:
        The first non-empty path component, or "unknown" when the path has
        no usable component.
    """
    if not source_path:
        return "unknown"
    # str.split always yields at least one element (e.g. "".split("/") is
    # [""]), so checking the list for emptiness never reaches the fallback.
    # Pick the first non-empty component instead.
    parts = source_path.replace("\\", "/").split("/")
    return next((part for part in parts if part), "unknown")
# ─── Stage 1: Observe ─────────────────────────────────────────────────────── # ─── Stage 1: Observe ───────────────────────────────────────────────────────
def observe_corpus(): def observe_corpus():
@@ -408,12 +413,17 @@ def dream_pipeline():
print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...") print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
nrem_output = synthesize_nrem(nrem_chunks) nrem_output = synthesize_nrem(nrem_chunks)
nrem_file = deliver(nrem_output, "nrem") nrem_file = deliver(nrem_output, "nrem")
nrem_sources = [c["source"] for c in nrem_chunks]
nrem_folders = list({extract_folder(s) for s in nrem_sources})
stage_data = { stage_data = {
"nrem": { "nrem": {
"chunks_retrieved": len(nrem_chunks), "chunks_retrieved": len(nrem_chunks),
"avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3), "avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
"query": "research fabrication teaching practice recent work", "query": "research fabrication teaching practice recent work",
"word_count": len(nrem_output.split()), "word_count": len(nrem_output.split()),
"sources": nrem_sources,
"distinct_folders": nrem_folders,
"folder_count": len(nrem_folders),
"status": "ok", "status": "ok",
} }
} }
@@ -430,11 +440,16 @@ def dream_pipeline():
print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...") print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
early_rem_output = synthesize_early_rem(early_chunks, nrem_output) early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
deliver(early_rem_output, "early-rem") deliver(early_rem_output, "early-rem")
early_sources = [c["source"] for c in early_chunks]
early_folders = list({extract_folder(s) for s in early_sources})
stage_data["early_rem"] = { stage_data["early_rem"] = {
"chunks_retrieved": len(early_chunks), "chunks_retrieved": len(early_chunks),
"avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3), "avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
"query": "career decision personal change what matters next", "query": "career decision personal change what matters next",
"word_count": len(early_rem_output.split()), "word_count": len(early_rem_output.split()),
"sources": early_sources,
"distinct_folders": early_folders,
"folder_count": len(early_folders),
"status": "ok", "status": "ok",
} }
print(f"[Early REM] Done.\n{early_rem_output[:200]}...") print(f"[Early REM] Done.\n{early_rem_output[:200]}...")
@@ -450,11 +465,22 @@ def dream_pipeline():
print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...") print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output) late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
deliver(late_rem_output, "late-rem") deliver(late_rem_output, "late-rem")
late_sources = [c["source"] for c in late_chunks]
late_folders = [extract_folder(s) for s in late_sources]
cross_domain_pairs = sum(
1 for i in range(len(late_folders))
for j in range(i+1, len(late_folders))
if late_folders[i] != late_folders[j]
)
stage_data["late_rem"] = { stage_data["late_rem"] = {
"chunks_retrieved": len(late_chunks), "chunks_retrieved": len(late_chunks),
"avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3), "avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
"query": "practice place memory making", "query": "practice place memory making",
"word_count": len(late_rem_output.split()), "word_count": len(late_rem_output.split()),
"sources": late_sources,
"distinct_folders": list(set(late_folders)),
"folder_count": len(set(late_folders)),
"cross_domain_pairs": cross_domain_pairs,
"status": "ok", "status": "ok",
} }
print(f"[Late REM] Done.\n{late_rem_output[:200]}...") print(f"[Late REM] Done.\n{late_rem_output[:200]}...")
@@ -474,10 +500,18 @@ def dream_pipeline():
print(f"{'='*60}") print(f"{'='*60}")
# Write manifest # Write manifest
all_session_sources = list(session_retrieved)
all_session_folders = list({extract_folder(s) for s in all_session_sources})
corpus_data = { corpus_data = {
"total_chunks": delta.get("new_chunks", 0), "total_chunks": delta.get("new_chunks", 0),
"new_chunks_since_last_dream": delta.get("new_chunks", 0), "new_chunks_since_last_dream": delta.get("new_chunks", 0),
"days_since_last_dream": round(delta.get("days_since_dream", 0), 2), "days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
"substrate": "pgvector",
"aggregate": {
"total_distinct_sources": len(all_session_sources),
"total_distinct_folders": len(all_session_folders),
"folders_touched": all_session_folders,
}
} }
write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data) write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)