dream.py: enrich manifest with retrieval breadth metrics
This commit is contained in:
@@ -63,6 +63,11 @@ def prompt_hash(prompts: list[str]) -> str:
|
||||
combined = "".join(prompts)
|
||||
return hashlib.md5(combined.encode()).hexdigest()[:8]
|
||||
|
||||
def extract_folder(source_path) -> str:
    """Extract top-level Nextcloud folder from source path.

    Backslashes are treated as path separators, and empty components
    (produced by leading or repeated separators) are skipped, so
    ``"/Research/a.md"`` and ``"Research\\a.md"`` both yield ``"Research"``.

    Args:
        source_path: Source path string of a retrieved chunk. May be empty
            or ``None``.

    Returns:
        The first non-empty path component, or ``"unknown"`` when the path
        is missing, empty, or consists only of separators.
    """
    if not source_path:
        return "unknown"
    # str.split("/") always returns at least one element (possibly ""), so
    # a plain truthiness check on its result can never reach the fallback;
    # dropping empty components makes the "unknown" branch reachable.
    parts = [part for part in source_path.replace("\\", "/").split("/") if part]
    return parts[0] if parts else "unknown"
|
||||
|
||||
# ─── Stage 1: Observe ───────────────────────────────────────────────────────
|
||||
|
||||
def observe_corpus():
|
||||
@@ -408,12 +413,17 @@ def dream_pipeline():
|
||||
print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
|
||||
nrem_output = synthesize_nrem(nrem_chunks)
|
||||
nrem_file = deliver(nrem_output, "nrem")
|
||||
nrem_sources = [c["source"] for c in nrem_chunks]
|
||||
nrem_folders = list({extract_folder(s) for s in nrem_sources})
|
||||
stage_data = {
|
||||
"nrem": {
|
||||
"chunks_retrieved": len(nrem_chunks),
|
||||
"avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
|
||||
"query": "research fabrication teaching practice recent work",
|
||||
"word_count": len(nrem_output.split()),
|
||||
"sources": nrem_sources,
|
||||
"distinct_folders": nrem_folders,
|
||||
"folder_count": len(nrem_folders),
|
||||
"status": "ok",
|
||||
}
|
||||
}
|
||||
@@ -430,11 +440,16 @@ def dream_pipeline():
|
||||
print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
|
||||
early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
|
||||
deliver(early_rem_output, "early-rem")
|
||||
early_sources = [c["source"] for c in early_chunks]
|
||||
early_folders = list({extract_folder(s) for s in early_sources})
|
||||
stage_data["early_rem"] = {
|
||||
"chunks_retrieved": len(early_chunks),
|
||||
"avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
|
||||
"query": "career decision personal change what matters next",
|
||||
"word_count": len(early_rem_output.split()),
|
||||
"sources": early_sources,
|
||||
"distinct_folders": early_folders,
|
||||
"folder_count": len(early_folders),
|
||||
"status": "ok",
|
||||
}
|
||||
print(f"[Early REM] Done.\n{early_rem_output[:200]}...")
|
||||
@@ -450,11 +465,22 @@ def dream_pipeline():
|
||||
print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
|
||||
late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
|
||||
deliver(late_rem_output, "late-rem")
|
||||
late_sources = [c["source"] for c in late_chunks]
|
||||
late_folders = [extract_folder(s) for s in late_sources]
|
||||
cross_domain_pairs = sum(
|
||||
1 for i in range(len(late_folders))
|
||||
for j in range(i+1, len(late_folders))
|
||||
if late_folders[i] != late_folders[j]
|
||||
)
|
||||
stage_data["late_rem"] = {
|
||||
"chunks_retrieved": len(late_chunks),
|
||||
"avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
|
||||
"query": "practice place memory making",
|
||||
"word_count": len(late_rem_output.split()),
|
||||
"sources": late_sources,
|
||||
"distinct_folders": list(set(late_folders)),
|
||||
"folder_count": len(set(late_folders)),
|
||||
"cross_domain_pairs": cross_domain_pairs,
|
||||
"status": "ok",
|
||||
}
|
||||
print(f"[Late REM] Done.\n{late_rem_output[:200]}...")
|
||||
@@ -474,10 +500,18 @@ def dream_pipeline():
|
||||
print(f"{'='*60}")
|
||||
|
||||
# Write manifest
|
||||
all_session_sources = list(session_retrieved)
|
||||
all_session_folders = list({extract_folder(s) for s in all_session_sources})
|
||||
corpus_data = {
|
||||
"total_chunks": delta.get("new_chunks", 0),
|
||||
"new_chunks_since_last_dream": delta.get("new_chunks", 0),
|
||||
"days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
|
||||
"substrate": "pgvector",
|
||||
"aggregate": {
|
||||
"total_distinct_sources": len(all_session_sources),
|
||||
"total_distinct_folders": len(all_session_folders),
|
||||
"folders_touched": all_session_folders,
|
||||
}
|
||||
}
|
||||
write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user