dream.py: enrich manifest with retrieval breadth metrics

2026-04-30 06:14:55 +00:00
parent 2b9a1782c1
commit b53717af5b
1 changed files with 34 additions and 0 deletions
@@ -63,6 +63,11 @@ def prompt_hash(prompts: list[str]) -> str:
    combined = "".join(prompts)
    return hashlib.md5(combined.encode()).hexdigest()[:8]

+def extract_folder(source_path):
+    """Return the top-level Nextcloud folder of source_path, or "unknown" if it has none."""
+    parts = [p for p in (source_path or "").replace("\\", "/").split("/") if p]  # skip "" from leading/doubled "/"
+    return parts[0] if parts else "unknown"
+
 # ─── Stage 1: Observe ───────────────────────────────────────────────────────

 def observe_corpus():
@@ -408,12 +413,17 @@ def dream_pipeline():
    print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
    nrem_output = synthesize_nrem(nrem_chunks)
    nrem_file = deliver(nrem_output, "nrem")
+    nrem_sources = [c["source"] for c in nrem_chunks]
+    nrem_folders = list({extract_folder(s) for s in nrem_sources})
    stage_data = {
        "nrem": {
            "chunks_retrieved": len(nrem_chunks),
            "avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
            "query": "research fabrication teaching practice recent work",
            "word_count": len(nrem_output.split()),
+            "sources": nrem_sources,
+            "distinct_folders": nrem_folders,
+            "folder_count": len(nrem_folders),
            "status": "ok",
        }
    }
@@ -430,11 +440,16 @@ def dream_pipeline():
        print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
        early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
        deliver(early_rem_output, "early-rem")
+        early_sources = [c["source"] for c in early_chunks]
+        early_folders = list({extract_folder(s) for s in early_sources})
        stage_data["early_rem"] = {
            "chunks_retrieved": len(early_chunks),
            "avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
            "query": "career decision personal change what matters next",
            "word_count": len(early_rem_output.split()),
+            "sources": early_sources,
+            "distinct_folders": early_folders,
+            "folder_count": len(early_folders),
            "status": "ok",
        }
        print(f"[Early REM] Done.\n{early_rem_output[:200]}...")
@@ -450,11 +465,22 @@ def dream_pipeline():
        print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
        late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
        deliver(late_rem_output, "late-rem")
+        late_sources = [c["source"] for c in late_chunks]
+        late_folders = [extract_folder(s) for s in late_sources]
+        cross_domain_pairs = sum(
+            1 for i in range(len(late_folders))
+            for j in range(i+1, len(late_folders))
+            if late_folders[i] != late_folders[j]
+        )
        stage_data["late_rem"] = {
            "chunks_retrieved": len(late_chunks),
            "avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
            "query": "practice place memory making",
            "word_count": len(late_rem_output.split()),
+            "sources": late_sources,
+            "distinct_folders": list(set(late_folders)),
+            "folder_count": len(set(late_folders)),
+            "cross_domain_pairs": cross_domain_pairs,
            "status": "ok",
        }
        print(f"[Late REM] Done.\n{late_rem_output[:200]}...")
@@ -474,10 +500,18 @@ def dream_pipeline():
    print(f"{'='*60}")

    # Write manifest
+    all_session_sources = list(session_retrieved)
+    all_session_folders = list({extract_folder(s) for s in all_session_sources})
    corpus_data = {
        "total_chunks": delta.get("new_chunks", 0),
        "new_chunks_since_last_dream": delta.get("new_chunks", 0),
        "days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
+        "substrate": "pgvector",
+        "aggregate": {
+            "total_distinct_sources": len(all_session_sources),
+            "total_distinct_folders": len(all_session_folders),
+            "folders_touched": all_session_folders,
+        }
    }
    write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)