From d985f9e91e83faf2bace1530c33c1eac14261ca6 Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Mon, 4 May 2026 16:29:04 +0000 Subject: [PATCH] dream.py: raise_for_status on manifest writes; total_chunks as actual corpus count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two correctness bugs in dream_pipeline manifest assembly. write_manifest at lines 487-491 swallowed HTTP 4xx/5xx responses silently. requests.put() only raises on transport-level errors (DNS, connection refused, timeout); 401/403/500/507 come back as Response objects and never trigger the except. The code printed "Manifest written" while the manifest never persisted. The same file's deliver() function at line 434 already used response.raise_for_status() — the pattern was already established; write_manifest just skipped it. Fix: bind the response and call raise_for_status() before the success print. The except message changes from "(non-critical)" to "manifest not persisted" because an HTTP failure caught here means the manifest data was lost, which is critical and should not be reported as a quiet, ignorable event. corpus_data["total_chunks"] at lines 621-622 stored delta["new_chunks"], duplicating the sibling field new_chunks_since_last_dream. The field name claimed absolute corpus size; the value was a delta of recently-touched files. Verified in live manifests: total_chunks: 0 while pgvector held 11,379+ document embeddings. Fix: query SELECT COUNT(*) FROM embeddings inside dream_pipeline and store the result as total_chunks. The query uses a tightly scoped, one-shot connection via the existing get_pg() helper. A telemetry query failure is treated as non-critical and falls back to 0 — a pgvector hiccup should not crash an otherwise successful dream pipeline. Bonus finding (not fixed in this commit): new_chunks_since_last_dream is itself misnamed. observe_corpus() reads the watcher's mtime cache and counts files (not chunks) whose mtime is newer than last_dream. 
Both fields were "files touched since last dream" duplicated under two different names; this commit fixes only the total_chunks semantics. Renaming new_chunks_since_last_dream is out of scope — manifests are write-only telemetry today, no consumer reads either field, and the rename is a separate decision. Verification: real pipeline run produced manifest with total_chunks matching SELECT COUNT(*) directly; doubled as a smoke test for the embedder cache (single Loading weights line), type_distribution propagation, and the manifest write success path. --- scripts/dream.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/scripts/dream.py b/scripts/dream.py index e97deb2..dd9b26e 100644 --- a/scripts/dream.py +++ b/scripts/dream.py @@ -485,10 +485,11 @@ def write_manifest(date_str, stage_data, corpus_data): auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD) url = f"{DREAMS_WEBDAV}/dream-manifest-{date_str}.json" try: - requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30) + response = requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30) + response.raise_for_status() print(f"Manifest written: Journal/Dreams/dream-manifest-{date_str}.json") except Exception as e: - print(f"Manifest write failed (non-critical): {e}") + print(f"Manifest write failed — manifest not persisted: {e}") def dream_pipeline(type_filter=None): @@ -618,8 +619,20 @@ def dream_pipeline(type_filter=None): # Write manifest all_session_sources = list(session_retrieved) all_session_folders = list({extract_folder(s) for s in all_session_sources}) + total_chunks = 0 + pg = None + try: + pg = get_pg() + cur = pg.cursor() + cur.execute("SELECT COUNT(*) FROM embeddings") + total_chunks = cur.fetchone()[0] + except Exception as e: + print(f"total_chunks query failed (non-critical): {e}") + finally: + if pg is not None: + pg.close() corpus_data = { - "total_chunks": delta.get("new_chunks", 0), + "total_chunks": total_chunks, 
"new_chunks_since_last_dream": delta.get("new_chunks", 0), "days_since_last_dream": round(delta.get("days_since_dream", 0), 2), "substrate": "pgvector",