diff --git a/scripts/dream.py b/scripts/dream.py
index 7c34b4e..41a78a5 100644
--- a/scripts/dream.py
+++ b/scripts/dream.py
@@ -63,6 +63,18 @@ def prompt_hash(prompts: list[str]) -> str:
     combined = "".join(prompts)
     return hashlib.md5(combined.encode()).hexdigest()[:8]
 
+def extract_folder(source_path: str) -> str:
+    """Return the top-level Nextcloud folder of a source path.
+
+    Backslashes are treated as path separators, and empty segments (from a
+    leading or doubled slash) are skipped. Returns "unknown" when the path
+    is empty or has no non-empty segment.
+    """
+    # str.split() always returns at least one element, so a truthiness check
+    # on the list can never trigger the fallback; test the segments instead.
+    segments = (source_path or "").replace("\\", "/").split("/")
+    return next((seg for seg in segments if seg), "unknown")
+
 # ─── Stage 1: Observe ───────────────────────────────────────────────────────
 
 def observe_corpus():
@@ -408,12 +420,18 @@ def dream_pipeline():
     print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
     nrem_output = synthesize_nrem(nrem_chunks)
     nrem_file = deliver(nrem_output, "nrem")
+    nrem_sources = [c["source"] for c in nrem_chunks]
+    # Sorted so the manifest is deterministic; set iteration order is not.
+    nrem_folders = sorted({extract_folder(s) for s in nrem_sources})
     stage_data = {
         "nrem": {
             "chunks_retrieved": len(nrem_chunks),
             "avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
             "query": "research fabrication teaching practice recent work",
             "word_count": len(nrem_output.split()),
+            "sources": nrem_sources,
+            "distinct_folders": nrem_folders,
+            "folder_count": len(nrem_folders),
             "status": "ok",
         }
     }
@@ -430,11 +448,16 @@ def dream_pipeline():
     print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
     early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
     deliver(early_rem_output, "early-rem")
+    early_sources = [c["source"] for c in early_chunks]
+    early_folders = sorted({extract_folder(s) for s in early_sources})
     stage_data["early_rem"] = {
         "chunks_retrieved": len(early_chunks),
         "avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
         "query": "career decision personal change what matters next",
         "word_count": len(early_rem_output.split()),
+        "sources": early_sources,
+        "distinct_folders": early_folders,
+        "folder_count": len(early_folders),
         "status": "ok",
     }
     print(f"[Early REM] Done.\n{early_rem_output[:200]}...")
@@ -450,11 +473,25 @@ def dream_pipeline():
     print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
     late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
     deliver(late_rem_output, "late-rem")
+    late_sources = [c["source"] for c in late_chunks]
+    late_folders = [extract_folder(s) for s in late_sources]
+    late_distinct_folders = sorted(set(late_folders))
+    # Number of unordered chunk pairs whose folders differ.
+    cross_domain_pairs = sum(
+        1
+        for i, first in enumerate(late_folders)
+        for second in late_folders[i + 1:]
+        if first != second
+    )
     stage_data["late_rem"] = {
         "chunks_retrieved": len(late_chunks),
         "avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
         "query": "practice place memory making",
         "word_count": len(late_rem_output.split()),
+        "sources": late_sources,
+        "distinct_folders": late_distinct_folders,
+        "folder_count": len(late_distinct_folders),
+        "cross_domain_pairs": cross_domain_pairs,
         "status": "ok",
     }
     print(f"[Late REM] Done.\n{late_rem_output[:200]}...")
@@ -474,10 +511,18 @@ def dream_pipeline():
     print(f"{'='*60}")
 
     # Write manifest
+    all_session_sources = list(session_retrieved)
+    all_session_folders = sorted({extract_folder(s) for s in all_session_sources})
     corpus_data = {
         "total_chunks": delta.get("new_chunks", 0),
         "new_chunks_since_last_dream": delta.get("new_chunks", 0),
         "days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
+        "substrate": "pgvector",
+        "aggregate": {
+            "total_distinct_sources": len(all_session_sources),
+            "total_distinct_folders": len(all_session_folders),
+            "folders_touched": all_session_folders,
+        }
     }
     write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)
 