dream.py: enrich manifest with retrieval breadth metrics
This commit is contained in:
@@ -63,6 +63,11 @@ def prompt_hash(prompts: list[str]) -> str:
|
|||||||
combined = "".join(prompts)
|
combined = "".join(prompts)
|
||||||
return hashlib.md5(combined.encode()).hexdigest()[:8]
|
return hashlib.md5(combined.encode()).hexdigest()[:8]
|
||||||
|
|
||||||
|
def extract_folder(source_path):
    """Extract top-level Nextcloud folder from source path.

    Backslashes are treated as path separators. Empty segments (produced by
    a leading separator, doubled separators, or an empty string) are skipped
    so they are never reported as a folder name.

    Args:
        source_path: Path string of a source, using "/" or "\\" separators.

    Returns:
        The first non-empty path segment, or "unknown" when the path is
        empty or contains only separators.
    """
    if not source_path:
        return "unknown"
    # str.split() never returns an empty list, so a "parts is empty" check
    # can never trigger; the fallback must key off empty segments instead.
    segments = source_path.replace("\\", "/").split("/")
    return next((segment for segment in segments if segment), "unknown")
|
||||||
|
|
||||||
# ─── Stage 1: Observe ───────────────────────────────────────────────────────
|
# ─── Stage 1: Observe ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
def observe_corpus():
|
def observe_corpus():
|
||||||
@@ -408,12 +413,17 @@ def dream_pipeline():
|
|||||||
print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
|
print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
|
||||||
nrem_output = synthesize_nrem(nrem_chunks)
|
nrem_output = synthesize_nrem(nrem_chunks)
|
||||||
nrem_file = deliver(nrem_output, "nrem")
|
nrem_file = deliver(nrem_output, "nrem")
|
||||||
|
nrem_sources = [c["source"] for c in nrem_chunks]
|
||||||
|
nrem_folders = list({extract_folder(s) for s in nrem_sources})
|
||||||
stage_data = {
|
stage_data = {
|
||||||
"nrem": {
|
"nrem": {
|
||||||
"chunks_retrieved": len(nrem_chunks),
|
"chunks_retrieved": len(nrem_chunks),
|
||||||
"avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
|
"avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
|
||||||
"query": "research fabrication teaching practice recent work",
|
"query": "research fabrication teaching practice recent work",
|
||||||
"word_count": len(nrem_output.split()),
|
"word_count": len(nrem_output.split()),
|
||||||
|
"sources": nrem_sources,
|
||||||
|
"distinct_folders": nrem_folders,
|
||||||
|
"folder_count": len(nrem_folders),
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -430,11 +440,16 @@ def dream_pipeline():
|
|||||||
print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
|
print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
|
||||||
early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
|
early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
|
||||||
deliver(early_rem_output, "early-rem")
|
deliver(early_rem_output, "early-rem")
|
||||||
|
early_sources = [c["source"] for c in early_chunks]
|
||||||
|
early_folders = list({extract_folder(s) for s in early_sources})
|
||||||
stage_data["early_rem"] = {
|
stage_data["early_rem"] = {
|
||||||
"chunks_retrieved": len(early_chunks),
|
"chunks_retrieved": len(early_chunks),
|
||||||
"avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
|
"avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
|
||||||
"query": "career decision personal change what matters next",
|
"query": "career decision personal change what matters next",
|
||||||
"word_count": len(early_rem_output.split()),
|
"word_count": len(early_rem_output.split()),
|
||||||
|
"sources": early_sources,
|
||||||
|
"distinct_folders": early_folders,
|
||||||
|
"folder_count": len(early_folders),
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
}
|
}
|
||||||
print(f"[Early REM] Done.\n{early_rem_output[:200]}...")
|
print(f"[Early REM] Done.\n{early_rem_output[:200]}...")
|
||||||
@@ -450,11 +465,22 @@ def dream_pipeline():
|
|||||||
print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
|
print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
|
||||||
late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
|
late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
|
||||||
deliver(late_rem_output, "late-rem")
|
deliver(late_rem_output, "late-rem")
|
||||||
|
late_sources = [c["source"] for c in late_chunks]
|
||||||
|
late_folders = [extract_folder(s) for s in late_sources]
|
||||||
|
cross_domain_pairs = sum(
|
||||||
|
1 for i in range(len(late_folders))
|
||||||
|
for j in range(i+1, len(late_folders))
|
||||||
|
if late_folders[i] != late_folders[j]
|
||||||
|
)
|
||||||
stage_data["late_rem"] = {
|
stage_data["late_rem"] = {
|
||||||
"chunks_retrieved": len(late_chunks),
|
"chunks_retrieved": len(late_chunks),
|
||||||
"avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
|
"avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
|
||||||
"query": "practice place memory making",
|
"query": "practice place memory making",
|
||||||
"word_count": len(late_rem_output.split()),
|
"word_count": len(late_rem_output.split()),
|
||||||
|
"sources": late_sources,
|
||||||
|
"distinct_folders": list(set(late_folders)),
|
||||||
|
"folder_count": len(set(late_folders)),
|
||||||
|
"cross_domain_pairs": cross_domain_pairs,
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
}
|
}
|
||||||
print(f"[Late REM] Done.\n{late_rem_output[:200]}...")
|
print(f"[Late REM] Done.\n{late_rem_output[:200]}...")
|
||||||
@@ -474,10 +500,18 @@ def dream_pipeline():
|
|||||||
print(f"{'='*60}")
|
print(f"{'='*60}")
|
||||||
|
|
||||||
# Write manifest
|
# Write manifest
|
||||||
|
all_session_sources = list(session_retrieved)
|
||||||
|
all_session_folders = list({extract_folder(s) for s in all_session_sources})
|
||||||
corpus_data = {
|
corpus_data = {
|
||||||
"total_chunks": delta.get("new_chunks", 0),
|
"total_chunks": delta.get("new_chunks", 0),
|
||||||
"new_chunks_since_last_dream": delta.get("new_chunks", 0),
|
"new_chunks_since_last_dream": delta.get("new_chunks", 0),
|
||||||
"days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
|
"days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
|
||||||
|
"substrate": "pgvector",
|
||||||
|
"aggregate": {
|
||||||
|
"total_distinct_sources": len(all_session_sources),
|
||||||
|
"total_distinct_folders": len(all_session_folders),
|
||||||
|
"folders_touched": all_session_folders,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)
|
write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user