From 465f2f725baee9b7412027cb02719bb4502bd60f Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Fri, 1 May 2026 02:26:37 +0000 Subject: [PATCH] Code review fixes: CV pinning, F1 (excluded_sources), F14 (50KB truncation), F37 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - api.py: strip CV pinning workaround (parity violation, see architecture doc) - dream.py: F1 — retrieve_graphiti() now accepts excluded_sources, over-fetches 3x and filters in-process. Was silently dropping the parameter; would have confounded E3 with broken cross-stage exclusion in Graphiti arm. - watcher.py + ingest.py: F14 — drop full_text[:50000] truncation. Was propagating through entire cascade. Postgres TEXT can hold up to 1GB. - corpus_integrity.py: F37 — same truncation, third path now clean. Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*, corpus_integrity.py.bak.* timestamped pre-fix. Re-cascaded Shop Class as Soulcraft (only already-cascaded source affected by F14, 414KB). --- corpus_integrity_report.json | 161 +++ dreamer_state.json | 50 +- scripts/api.py | 39 +- scripts/api.py.bak.20260501-001427 | 1287 +++++++++++++++++ scripts/consolidator_v0_1.py.bak | 442 ++++++ scripts/corpus_integrity.py | 2 +- .../corpus_integrity.py.bak.20260501-021703 | 245 ++++ scripts/dream.py | 26 +- scripts/dream.py.bak | 554 +++++++ scripts/dream.py.bak.20260501-002209 | 668 +++++++++ scripts/graphiti_service.py.bak | 171 +++ scripts/ingest.py | 2 +- scripts/ingest.py.bak.20260501-004131 | 182 +++ scripts/watcher.py | 2 +- scripts/watcher.py.bak | 210 +++ scripts/watcher.py.bak.20260501-004131 | 448 ++++++ watcher_heartbeat | 1 + 17 files changed, 4432 insertions(+), 58 deletions(-) create mode 100644 corpus_integrity_report.json create mode 100644 scripts/api.py.bak.20260501-001427 create mode 100644 scripts/consolidator_v0_1.py.bak create mode 100644 scripts/corpus_integrity.py.bak.20260501-021703 create mode 100644 scripts/dream.py.bak create mode 100644 scripts/dream.py.bak.20260501-002209 create mode 100644 scripts/graphiti_service.py.bak create mode 100644 scripts/ingest.py.bak.20260501-004131 create mode 100644 scripts/watcher.py.bak create mode 100644 scripts/watcher.py.bak.20260501-004131 create mode 100644 watcher_heartbeat diff --git a/corpus_integrity_report.json b/corpus_integrity_report.json new file mode 100644 index 0000000..b50c1eb --- /dev/null +++ b/corpus_integrity_report.json @@ -0,0 +1,161 @@ +{ + "timestamp": "2026-04-30T21:58:06.498354", + "summary": { + "filesystem_total": 1399, + "pgvector_total": 1215, + "graphiti_total": 1205, + "both": 998, + "pgvector_only": 10, + "neither": 129, + "graphiti_only": 0, + "failures": 0, + "orphans_pgvector": 207, + "orphans_graphiti": 207 + }, + "gaps": [ + "test.md", + "test-watcher-trigger.md", + "Renders.pptx", + "Ribbon Cutting Slideshow.pptx", + "PRINTERS.docx", + "Course Calender.docx", + "Attendance.docx", + "GH Slicer Notes [Autosaved].pptx", + "GH Slicer Notes.pptx", + "06_Surface Commands.docx", + "03_Precision_1 Homework.docx", + "02_2D Geometry Toolbars.docx", + "00_Roster.docx", + "Homeworks.docx", + "02_Osnap_ModelingAids.docx", + "06_Solids.docx", + "04_Precision_2 Homework.docx", + "~\ufffdMod08_Post_Processing_2023.pptx", + "~\ufffdMod02_Industries_and_Applications_2023.pptx", + "~\ufffdMod04_FDM_Materials_2023.pptx", + "Byron_V Independent Study.pdf", + "IanC.Lorber_Final_2dsketches..pdf", + "Irene_Raptopoulos_Final.pdf", + "rcolon-final-schematic.pdf", + "kellogg_schematic_schem.pdf", + "Visa Soccer Goal Frame Drawing.pdf", + "4_13_2017_0_22_14.pdf", + "4_13_2017_0_50_29.pdf", + "4_13_2017_0_49_25.pdf", + "4_13_2017_12_27_19.pdf", + "4_13_2017_12_26_54.pdf", + "4_12_2017_21_7_14.pdf", + "4_13_2017_12_25_49.pdf", + "4_13_2017_12_26_5.pdf", + "4_13_2017_12_27_8.pdf", + "IanC.Lorber_Final_2dsketches - 2016.pdf", + "MidtermLayout1.pdf", + "MidtermLayout2.pdf", + "pascar_finalSchematic.pdf", + "Week 7.pdf", + "EXAMPLE Art Education IM Rubric and Scaffold UPDATED copy.docx", + "DDF - IM Rubric and Scaffold UPDATED.docx", + "DI_9.pdf", + "DI_6.pdf", + "DI_4.pdf", + "DI_7.pdf", + "DI_8.pdf", + "DI_2.pdf", + "DI_1.pdf", + "DI_3.pdf", + "DI_5.pdf", + "JN EVAL.pdf", + "Amazon.com_ Iwata Eclipse HP-CS Airbrush - Gravity Feed Dual Action, High-Flow Atomization for Fine Detail to Wide Coverage \u2013 E3 Nozzle, 0.pdf", + "RCPA1.pdf", + "image2022-01-07-133846 - CAryn.pdf", + "image2022-01-07-134536 - Conference.pdf", + "image2022-01-07-131439.pdf", + "image2022-01-07-135248 - BSC Revision.pdf", + "image2022-01-07-141250- LAI.pdf", + "image2022-01-07-134938 - Maker in Residence.pdf", + "image2022-01-07-133157 - Teching Evals.pdf", + "image2022-01-07-135538 PRS Medal.pdf", + "image2022-01-07-131217.pdf", + "image2022-01-07-135911 - HVAMC.pdf", + "image2022-01-07-133504 - Sarah and OLiva.pdf", + "image2022-01-07-131903 - Annual Reports.pdf", + "image2022-01-07-132917 - Mastery.pdf", + "Annual Report - 2016.pdf", + "Annual Report - 2018.pdf", + "Annual Report - 2019.pdf", + "Annual Report - 2017.pdf", + "MOU 2018 - 19.pdf", + "Appointment 2016.pdf", + "Appointment 2019 - 2021.pdf", + "MOU 2017.pdf", + "SKMBT_55220060909245.pdf", + "Dean SSE Letter 2018.pdf", + "Dean FPA Letter 2018.pdf", + "Sarah Thesis Article.pdf", + "Caryn Bylott Thesis.pdf", + "Olivia Thesis.pdf", + "Teching Evals.pdf", + "Maker in Residence - Dalles v2.pdf", + "Modela Gallery - Ripples from Stillwater.pdf", + "Maker in Residence - Dalles.pdf", + "Makerbot Innovation Center Director.pdf", + "Design Week 2019.pdf", + "DW Workshops.pdf", + "Ron Rael Lecture.pdf", + "BSC Revision.pdf", + "President's Medal.pdf", + "CNC Workshop.pdf", + "Dorksy.pdf", + "FULL SCAN.pdf", + "DDF Course Fee Rev 2017.pdf", + "SlideSlam 2019.pdf", + "Candy and Cold Cases.pdf", + "Murder ID.pdf", + "HVAMC SuperLab.pdf", + "Certifications.pdf", + "Brazkem Letter.pdf", + "Havana 2017.pdf", + "CAA Conference - 2020.pdf", + "CAA Conference 2019.pdf", + "Central Hudson Grant.pdf", + "3D Printing Updates - Council of Industry.pdf", + "k12 Training.pdf", + "Hudson Valley Futures Summit.pdf", + "StateAssemblymanTour.pdf", + "Best of New Paltz.pdf", + "HVAMC - ColorPage.pdf", + "HVAMC - Daily Freeman.pdf", + "Chocolate Skulls.pdf", + "10_13_2016_21_19_43.pdf", + "10_13_2016_21_19_46.pdf", + "10_13_2016_21_19_1.pdf", + "10_13_2016_21_18_55.pdf", + "161012_102427.pdf", + "161012_102737.pdf", + "161012_114217.pdf", + "Eto Forms.txt", + "AaronNelsonUndergraduteTranscript Unsecured.pdf", + "Aaron Nelson Transcript Undergradute Unsecured.pdf", + "AP023.pdf", + "How Buildings Learn_ What Happens After They are Built -- Stewart Brand .pdf", + "Occupying and connecting _ thoughts on territories and -- Frei Otto; Berthold Burkhardt.pdf", + "SilkwormManual.pdf", + "NELSON commitment letter.pdf", + "i-9.pdf" + ], + "failures": [], + "auto_queued": [], + "pgvector_only_sample": [ + "dreamer_changelog.md", + "experiments-log-additions-2026-04-30.md", + "2026-04-30-15-59-voice.md", + "2026-04-30-16-59-voice.md", + "2026-04-30-16-53-voice.md", + "2026-04-30-17-06-voice.md", + "2026-04-30-16-23-voice.md", + "2026-04-30-late-rem.md", + "2026-04-30-synthesis.md", + "2026-04-30-nrem.md" + ], + "graphiti_only": [] +} \ No newline at end of file diff --git a/dreamer_state.json b/dreamer_state.json index 6060d10..ffcda6f 100644 --- a/dreamer_state.json +++ b/dreamer_state.json @@ -1,30 +1,46 @@ { - "last_dream_timestamp": 1777480274.444462, + "last_dream_timestamp": 1777536047.392913, "last_dream_mode": "pipeline", - "last_dream_file": "Journal/Dreams/2026-04-29-synthesis-1.md", + "last_dream_file": "Journal/Dreams/2026-04-30-synthesis.md", "retrieved_sources": [ "ChatGPT: CV Summary Request", - "Dossier Narrative.docx", - "2026-04-28-early-rem.md", - "Advances in Architectural Geometry 2023 -- Kathrin D\u00f6rfler (editor); Jan Knippers (editor); Achim.pdf", - "2026-04-29-late-rem.md", - "Mod06_GrabCAD_Print_and _Advanced_FDM_2023.pptx", - "The Extended Mind _ The Power of Thinking Outside the Brain -- Annie Murphy Paul.pdf", - "Utah MDD - Aaron Nelson - Copy.pptx", - "ChatGPT: Dean Position Evaluation", - "Company of One -- Paul Jarvis.pdf", + "ChatGPT: Program response drafting", + "Cognition in the Wild (A Bradford Book) -- Hutchins, Edwin.pdf", + "ChatGPT: Digital Fabrication Cultural Project", + "Dossier Narrative.pdf", + "E1_8-taxonomy-free-cascade-protocol.md", "References.docx", - "Dossier Narrative Kill Me PLS_REV.docx", "ChatGPT: Career change anxiety", + "Dossier Narrative.docx", + "Dossier Narrative Kill Me PLS_REV_HOME.docx", + "Aaron Nelson Tenure Dossier Narrative.pdf", + "Claude: Importing chat history from ChatGPT", + "aaronai-architecture.md", + "Aaron AI: What should I be the most excited about right now?", + "ChatGPT: Dean Position Evaluation", "2026-04-27-early-rem-1.md", "The Poetics of Space -- Gaston Bachelard translated from the French by Maria Jolas -- First Edition, 1994.pdf", - "ChatGPT: Digital fabrication education", "Dossier Narrative Kill Me PLS.docx", - "ChatGPT: Digital Fabrication Cultural Project", - "Claude: Weighing Utah versus Oklahoma", - "Claude: Importing chat history from ChatGPT", + "Advances in Architectural Geometry 2023 -- Kathrin D\u00f6rfler (editor); Jan Knippers (editor); Achim.pdf", "Claude: I filling out my annual report...", "References.pdf", - "Dossier Narrative Kill Me PLS_REV_HOME.docx" + "2026-04-28-early-rem.md", + "Claude: Law enforcement career options", + "Dossier Narrative Kill Me PLS_REV_2.docx", + "2026-04-29-late-rem.md", + "Dossier Narrative Kill Me PLS_REV.docx", + "Utah MDD - Aaron Nelson - Copy.pptx", + "Dossier Narrative Kill Me PLS_REV_3.docx", + "Aaron AI: Who's covering for me on sabbatical?", + "ChatGPT: GA Proposal Revision Guide", + "BirdAI-Experiments-Log.md", + "Mod06_GrabCAD_Print_and _Advanced_FDM_2023.pptx", + "The Extended Mind _ The Power of Thinking Outside the Brain -- Annie Murphy Paul.pdf", + "Company of One -- Paul Jarvis.pdf", + "Fabrication Processes_Syllabus.docx", + "ChatGPT: Digital fabrication education", + "Fabrication Processes_Syllabus DDF710 V3.docx", + "Claude: Setting up a custom OpenClaw instance", + "Claude: Weighing Utah versus Oklahoma" ] } \ No newline at end of file diff --git a/scripts/api.py b/scripts/api.py index 85b5336..f5bd18e 100644 --- a/scripts/api.py +++ b/scripts/api.py @@ -161,8 +161,6 @@ def require_auth(request: Request): raise HTTPException(status_code=401, detail="Not authenticated") return token -CV_SOURCES = ["Aaron Nelson CV 2024.pdf", "Aaron Nelson CV 2025.pdf", "Aaron Nelson - CV.docx"] - def init_conversations_db(): conn = sqlite3.connect(CONVERSATIONS_DB) c = conn.cursor() @@ -224,50 +222,23 @@ def remove_from_memory(item): save_memory("\n".join(filtered)) return len(lines) - len(filtered) -def get_pinned_cv_context(): - try: - pg = get_pg() - cur = pg.cursor() - cur.execute( - "SELECT document, source FROM embeddings WHERE source = ANY(%s)", - (CV_SOURCES,) - ) - rows = cur.fetchall() - pg.close() - docs = [r[0] for r in rows] - metas = [{"source": r[1]} for r in rows] - return docs, metas - except: - return [], [] - -def is_professional_query(query): - keywords = ["grant", "publication", "exhibition", "award", "fellowship", - "experience", "position", "job", "career", "cv", "resume", - "research", "work history", "accomplishment", "teaching", - "course", "client", "consultation", "presentation", "workshop", - "education", "degree", "institution", "service", "committee"] - return any(k in query.lower() for k in keywords) - def retrieve_context(query, n_results=8): + """Pure semantic retrieval over pgvector. Top-N by cosine similarity, threshold 0.3. + No CV pinning, no keyword routing — see architecture doc substrate-dependency section. + Substrate-level workarounds (entity-keyed routing, hybrid retrieval) live at the + Graphiti layer, not as wrapper logic above pgvector.""" query_embedding = embedder.encode([query]).tolist()[0] context_pieces = [] sources = [] - if is_professional_query(query): - cv_docs, cv_metas = get_pinned_cv_context() - for doc, meta in zip(cv_docs, cv_metas): - context_pieces.append(f"[CV] {doc}") - sources.append(meta.get("source", "CV")) try: pg = get_pg() cur = pg.cursor() cur.execute(""" SELECT document, source, 1 - (embedding <=> %s::vector) as similarity FROM embeddings - WHERE source NOT IN %s ORDER BY embedding <=> %s::vector LIMIT %s - """, (query_embedding, tuple(CV_SOURCES) if CV_SOURCES else ('__none__',), - query_embedding, n_results)) + """, (query_embedding, query_embedding, n_results)) for doc, source, similarity in cur.fetchall(): if similarity > 0.3: context_pieces.append(doc) diff --git a/scripts/api.py.bak.20260501-001427 b/scripts/api.py.bak.20260501-001427 new file mode 100644 index 0000000..85b5336 --- /dev/null +++ b/scripts/api.py.bak.20260501-001427 @@ -0,0 +1,1287 @@ +import os +import json +import sqlite3 +import subprocess +import hashlib +from pathlib import Path +from datetime import datetime +from dotenv import load_dotenv +from sentence_transformers import SentenceTransformer +import anthropic +from fastapi import FastAPI, Request, Response, Depends, HTTPException, BackgroundTasks +import psycopg2 +import psycopg2.extras +from fastapi import UploadFile, File, Form +import tempfile +import os +try: + from faster_whisper import WhisperModel + HAS_WHISPER = True +except ImportError: + HAS_WHISPER = False +from fastapi.responses import FileResponse, JSONResponse +import secrets +import hashlib +from fastapi.responses import FileResponse, JSONResponse +from fastapi.staticfiles import StaticFiles +from fastapi.middleware.cors import CORSMiddleware +import uvicorn +import asyncio +from fastapi.responses import StreamingResponse +from apscheduler.schedulers.background import BackgroundScheduler +from apscheduler.triggers.cron import CronTrigger + +load_dotenv(Path.home() / "aaronai" / ".env") + +MEMORY_PATH = Path.home() / "aaronai" / "memory.md" +DB_PATH = str(Path.home() / "aaronai" / "db") +CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db") +SETTINGS_PATH = Path.home() / "aaronai" / "settings.json" +WATCHER_LOG = str(Path.home() / "aaronai" / "watcher.log") +WATCHER_STATE = str(Path.home() / "aaronai" / "watcher_state.json") +NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files" +INGEST_SCRIPT = str(Path.home() / "aaronai" / "scripts" / "ingest.py") +PYTHON = str(Path.home() / "aaronai" / "venv" / "bin" / "python3") + +DEFAULT_SETTINGS = { + "theme": "light", + "font_size": "medium", + "web_search": True, + "show_sources": True, + "dream_hour_utc": 8, + "dream_minute_utc": 0, + "dream_mode": "nrem", + "ingest_hour_utc": 2, + "ingest_minute_utc": 30, + "share_time": True, +} + +print("Loading Aaron AI...") +PG_DSN = os.getenv("PG_DSN") + +def get_pg(): + return psycopg2.connect(PG_DSN) +WHISPER_PROMPT = ( + "Grasshopper, Rhino, PolyJet, SLA, FDM, DMLS, " + "HVAMC, FWN3D, Mossygear, Nextcloud, Gitea, " + "computational design, additive manufacturing, fabrication, " + "Graphiti, FalkorDB, pgvector, BirdAI, Active Inference, " + "dreamer, consolidator, Extended Mind, " + "Aaron Nelson, SUNY New Paltz, University of Utah, MDD" +) +whisper_model = None +if HAS_WHISPER: + try: + whisper_model = WhisperModel("large-v3", device="cpu", compute_type="int8", cpu_threads=8) + print("Whisper model loaded") + except Exception as e: + print(f"Whisper not available: {e}") +embedder = SentenceTransformer("all-MiniLM-L6-v2") +# ChromaDB removed — using pgvector +anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + +SYSTEM_PROMPT = """You are the personal AI assistant of Aaron Nelson — computational +designer, fabrication researcher, program builder, and creative +practitioner based in the Hudson Valley. + +Aaron's work sits at the intersection of computational geometry, +additive manufacturing, and physical making. He resolves complex +systems into physical reality — from Grasshopper definitions to +large-scale steel structures, from archival photographs to 3D-printed +architectural restorations, from product concepts to manufactured +goods. This throughline — computation resolving into physical form — +defines his practice across academic, consulting, and creative contexts. + +He built the Hudson Valley Additive Manufacturing Center (HVAMC) from +nothing and has directed it since 2016, alongside the DDF academic +program at SUNY New Paltz. He has the skills of a founder — equipment +selection, policy creation, client development, grant writing, +curriculum design — operating within an academic structure. He +consults with IBM, Braskem, Selux and others through FWN3D. He runs +Mossygear as a product business. He makes large-scale fabricated art. + +His communication style is direct, precise, and intolerant of padding +or overclaiming. He flags inaccuracies immediately and expects the +same standard from you. When helping him write, match his voice — +economical, specific, never performative. When answering questions, +cite sources and acknowledge uncertainty rather than filling gaps with +plausible-sounding content. + +You have access to his complete document corpus, conversation history, +and a persistent memory file that carries his current context. Treat +the memory file as ground truth for his present situation. Use web +search automatically when current information is needed. Never +re-brief on context that's already in memory or documents. + +When making factual claims about Aaron — his history, credentials, locations, dates, relationships, projects, or any specific event — you must ground the claim in a specific retrieved document or the memory file. Cite the source by name inline. If no source supports the claim, say so explicitly rather than filling the gap with plausible-sounding content. Do not confabulate. If you are inferring rather than citing, mark it as inference.""" + +# Auth configuration +import os +SESSION_PASSWORD = os.getenv("AARON_AI_PASSWORD", "changeme") +SESSIONS_DB = str(Path.home() / "aaronai" / "sessions.db") + +def _init_sessions(): + conn = sqlite3.connect(SESSIONS_DB) + conn.execute("CREATE TABLE IF NOT EXISTS sessions (token TEXT PRIMARY KEY, created_at TEXT)") + conn.commit() + conn.close() + +_init_sessions() + +def make_session_token() -> str: + return secrets.token_urlsafe(32) + +def hash_password(password: str) -> str: + return hashlib.sha256(password.encode()).hexdigest() + +def save_session(token: str): + conn = sqlite3.connect(SESSIONS_DB) + conn.execute("INSERT OR REPLACE INTO sessions VALUES (?, ?)", (token, datetime.now().isoformat())) + conn.commit() + conn.close() + +def delete_session(token: str): + conn = sqlite3.connect(SESSIONS_DB) + conn.execute("DELETE FROM sessions WHERE token = ?", (token,)) + conn.commit() + conn.close() + +def session_exists(token: str) -> bool: + conn = sqlite3.connect(SESSIONS_DB) + row = conn.execute("SELECT 1 FROM sessions WHERE token = ?", (token,)).fetchone() + conn.close() + return row is not None + +def get_session(request: Request) -> str | None: + return request.cookies.get("aaronai_session") + +def require_auth(request: Request): + token = get_session(request) + if not token or not session_exists(token): + raise HTTPException(status_code=401, detail="Not authenticated") + return token + +CV_SOURCES = ["Aaron Nelson CV 2024.pdf", "Aaron Nelson CV 2025.pdf", "Aaron Nelson - CV.docx"] + +def init_conversations_db(): + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute('''CREATE TABLE IF NOT EXISTS conversations ( + id TEXT PRIMARY KEY, + title TEXT NOT NULL, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + model TEXT DEFAULT 'claude-sonnet-4-6', + message_count INTEGER DEFAULT 0 + )''') + c.execute('''CREATE TABLE IF NOT EXISTS messages ( + id TEXT PRIMARY KEY, + conversation_id TEXT NOT NULL, + role TEXT NOT NULL, + content TEXT NOT NULL, + sources TEXT DEFAULT '[]', + timestamp TEXT NOT NULL, + FOREIGN KEY (conversation_id) REFERENCES conversations(id) + )''') + conn.commit() + conn.close() + +init_conversations_db() + +def load_settings(): + if SETTINGS_PATH.exists(): + try: + s = json.loads(SETTINGS_PATH.read_text()) + return {**DEFAULT_SETTINGS, **s} + except: + pass + return DEFAULT_SETTINGS.copy() + +def save_settings(settings): + SETTINGS_PATH.write_text(json.dumps(settings, indent=2)) + +def load_memory(): + if MEMORY_PATH.exists(): + return MEMORY_PATH.read_text(encoding="utf-8") + return "" + +def save_memory(content): + MEMORY_PATH.write_text(content, encoding="utf-8") + +def add_to_memory(item): + memory = load_memory() + timestamp = datetime.now().strftime("%Y-%m-%d") + note = f"\n- [{timestamp}] {item}" + if "## Notes" not in memory: + memory += "\n\n## Notes" + memory += note + save_memory(memory) + +def remove_from_memory(item): + memory = load_memory() + lines = memory.split("\n") + filtered = [l for l in lines if item.lower() not in l.lower()] + save_memory("\n".join(filtered)) + return len(lines) - len(filtered) + +def get_pinned_cv_context(): + try: + pg = get_pg() + cur = pg.cursor() + cur.execute( + "SELECT document, source FROM embeddings WHERE source = ANY(%s)", + (CV_SOURCES,) + ) + rows = cur.fetchall() + pg.close() + docs = [r[0] for r in rows] + metas = [{"source": r[1]} for r in rows] + return docs, metas + except: + return [], [] + +def is_professional_query(query): + keywords = ["grant", "publication", "exhibition", "award", "fellowship", + "experience", "position", "job", "career", "cv", "resume", + "research", "work history", "accomplishment", "teaching", + "course", "client", "consultation", "presentation", "workshop", + "education", "degree", "institution", "service", "committee"] + return any(k in query.lower() for k in keywords) + +def retrieve_context(query, n_results=8): + query_embedding = embedder.encode([query]).tolist()[0] + context_pieces = [] + sources = [] + if is_professional_query(query): + cv_docs, cv_metas = get_pinned_cv_context() + for doc, meta in zip(cv_docs, cv_metas): + context_pieces.append(f"[CV] {doc}") + sources.append(meta.get("source", "CV")) + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + SELECT document, source, 1 - (embedding <=> %s::vector) as similarity + FROM embeddings + WHERE source NOT IN %s + ORDER BY embedding <=> %s::vector + LIMIT %s + """, (query_embedding, tuple(CV_SOURCES) if CV_SOURCES else ('__none__',), + query_embedding, n_results)) + for doc, source, similarity in cur.fetchall(): + if similarity > 0.3: + context_pieces.append(doc) + sources.append(source or "unknown") + pg.close() + except Exception as e: + print(f"pgvector retrieval error: {e}") + return context_pieces, sources + +def get_conversation_history(conversation_id, limit=20): + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute('''SELECT role, content FROM messages + WHERE conversation_id = ? + ORDER BY timestamp DESC LIMIT ?''', (conversation_id, limit)) + rows = c.fetchall() + conn.close() + return [{"role": r[0], "content": r[1]} for r in reversed(rows)] + +def save_message(conversation_id, role, content, sources=None): + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + msg_id = hashlib.md5(f"{conversation_id}{role}{datetime.now().isoformat()}".encode()).hexdigest() + timestamp = datetime.now().isoformat() + c.execute('''INSERT INTO messages (id, conversation_id, role, content, sources, timestamp) + VALUES (?, ?, ?, ?, ?, ?)''', + (msg_id, conversation_id, role, content, + json.dumps(sources or []), timestamp)) + c.execute('''UPDATE conversations SET updated_at = ?, message_count = message_count + 1 + WHERE id = ?''', (timestamp, conversation_id)) + conn.commit() + conn.close() + +def create_conversation(title="New conversation"): + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + conv_id = hashlib.md5(f"{datetime.now().isoformat()}".encode()).hexdigest()[:16] + now = datetime.now().isoformat() + c.execute('''INSERT INTO conversations (id, title, created_at, updated_at) + VALUES (?, ?, ?, ?)''', (conv_id, title, now, now)) + conn.commit() + conn.close() + return conv_id + +def chat(user_message, conversation_id, settings, client_time=None): + memory = load_memory() + context_pieces, sources = retrieve_context(user_message) + history = get_conversation_history(conversation_id) + + context_parts = [] + if client_time: + context_parts.append(f"Current time (user-supplied, not logged): {client_time}") + if memory: + context_parts.append(f"Aaron's persistent memory:\n\n{memory}") + if context_pieces: + context_str = "\n\n---\n\n".join(context_pieces) + unique_sources = list(set(sources)) + context_parts.append( + f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}" + ) + context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else "" + full_message = context_block + user_message + + messages = history + [{"role": "user", "content": full_message}] + + tools = [{"type": "web_search_20250305", "name": "web_search"}] if settings.get("web_search", True) else [] + + while True: + kwargs = { + "model": "claude-sonnet-4-6", + "max_tokens": 2048, + "system": SYSTEM_PROMPT, + "messages": messages + } + if tools: + kwargs["tools"] = tools + + response = anthropic_client.messages.create(**kwargs) + + if response.stop_reason == "tool_use": + messages.append({"role": "assistant", "content": response.content}) + tool_results = [] + for block in response.content: + if block.type == "tool_use": + tool_results.append({ + "type": "tool_result", + "tool_use_id": block.id, + "content": "Search completed" + }) + messages.append({"role": "user", "content": tool_results}) + else: + assistant_message = "" + for block in response.content: + if hasattr(block, "text"): + assistant_message += block.text + return assistant_message, list(set(sources)) + +from contextlib import asynccontextmanager + +@asynccontextmanager +async def lifespan(app: FastAPI): + reschedule_jobs() + scheduler.start() + print("Scheduler started") + yield + scheduler.shutdown() + print("Scheduler stopped") + +app = FastAPI(lifespan=lifespan) +app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]) + +@app.post("/auth/login") +async def login(request: Request, response: Response): + data = await request.json() + password = data.get("password", "") + if hash_password(password) != hash_password(SESSION_PASSWORD): + raise HTTPException(status_code=401, detail="Invalid password") + token = make_session_token() + save_session(token) + response.set_cookie( + key="aaronai_session", + value=token, + httponly=True, + secure=True, + samesite="lax", + max_age=60 * 60 * 24 * 30 + ) + response.body = b'{"ok": true}' + response.status_code = 200 + response.media_type = "application/json" + return response + +@app.post("/auth/logout") +async def logout(request: Request, response: Response): + token = get_session(request) + if token: + delete_session(token) + response.delete_cookie("aaronai_session") + return JSONResponse({"ok": True}) + +@app.get("/auth/check") +async def check_auth(request: Request): + token = get_session(request) + if not token or token not in SESSIONS: + return JSONResponse({"authenticated": False}) + return JSONResponse({"authenticated": True}) + +@app.get("/", response_class=FileResponse) +async def index(): + return FileResponse("/home/aaron/aaronai/static/index.html") + +@app.get("/api/settings") +async def get_settings(auth: str = Depends(require_auth)): + return JSONResponse(load_settings()) + +@app.post("/api/settings") +async def update_settings(request: Request, auth: str = Depends(require_auth)): + data = await request.json() + settings = load_settings() + settings.update(data) + save_settings(settings) + # Reschedule if schedule settings changed + schedule_keys = {"dream_hour_utc","dream_minute_utc","dream_mode","ingest_hour_utc","ingest_minute_utc"} + if any(k in data for k in schedule_keys): + reschedule_jobs() + return JSONResponse(settings) + +@app.get("/api/conversations") +async def list_conversations(auth: str = Depends(require_auth)): + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute('''SELECT id, title, created_at, updated_at, message_count + FROM conversations ORDER BY updated_at DESC LIMIT 100''') + rows = c.fetchall() + conn.close() + return JSONResponse([{ + "id": r[0], "title": r[1], "created_at": r[2], + "updated_at": r[3], "message_count": r[4] + } for r in rows]) + +@app.post("/api/conversations") +async def new_conversation(request: Request, auth: str = Depends(require_auth)): + data = await request.json() + title = data.get("title", "New conversation") + conv_id = create_conversation(title) + return JSONResponse({"id": conv_id, "title": title}) + +@app.get("/api/conversations/{conv_id}/messages") +async def get_messages(conv_id: str, auth: str = Depends(require_auth)): + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute('''SELECT role, content, sources, timestamp FROM messages + WHERE conversation_id = ? ORDER BY timestamp ASC''', (conv_id,)) + rows = c.fetchall() + conn.close() + return JSONResponse([{ + "role": r[0], "content": r[1], + "sources": json.loads(r[2]), "timestamp": r[3] + } for r in rows]) + +@app.patch("/api/conversations/{conv_id}") +async def rename_conversation(conv_id: str, request: Request, auth: str = Depends(require_auth)): + data = await request.json() + title = data.get("title", "") + if not title: + return JSONResponse({"error": "Title required"}, status_code=400) + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute("UPDATE conversations SET title = ? WHERE id = ?", (title, conv_id)) + conn.commit() + conn.close() + return JSONResponse({"id": conv_id, "title": title}) + +@app.delete("/api/conversations/{conv_id}") +async def delete_conversation(conv_id: str, auth: str = Depends(require_auth)): + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute("DELETE FROM messages WHERE conversation_id = ?", (conv_id,)) + c.execute("DELETE FROM conversations WHERE id = ?", (conv_id,)) + conn.commit() + conn.close() + return JSONResponse({"deleted": conv_id}) + +@app.post("/api/chat") +async def chat_endpoint(request: Request, auth: str = Depends(require_auth)): + data = await request.json() + user_message = data.get("message", "").strip() + conversation_id = data.get("conversation_id", "") + client_time = data.get("client_time", None) + settings = load_settings() + + if not user_message: + return JSONResponse({"error": "Empty message"}) + + if not conversation_id: + conversation_id = create_conversation("New conversation") + + stripped = user_message.strip().lower() + + if stripped == "show memory": + return JSONResponse({"response": load_memory(), "sources": [], "conversation_id": conversation_id}) + + if stripped.startswith("remember:"): + item = user_message[9:].strip() + add_to_memory(item) + save_message(conversation_id, "user", user_message) + save_message(conversation_id, "assistant", f"Saved to memory: '{item}'") + return JSONResponse({"response": f"Saved to memory: '{item}'", "sources": [], "conversation_id": conversation_id}) + + if stripped.startswith("forget:"): + item = user_message[7:].strip() + removed = remove_from_memory(item) + msg = f"Removed {removed} line(s) containing '{item}'" if removed else f"Nothing found containing '{item}'" + save_message(conversation_id, "user", user_message) + save_message(conversation_id, "assistant", msg) + return JSONResponse({"response": msg, "sources": [], "conversation_id": conversation_id}) + + save_message(conversation_id, "user", user_message) + + # Auto-title conversation from first message + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute("SELECT message_count, title FROM conversations WHERE id = ?", (conversation_id,)) + row = c.fetchone() + conn.close() + if row and row[0] <= 1 and row[1] == "New conversation": + auto_title = user_message[:60] + ("..." if len(user_message) > 60 else "") + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute("UPDATE conversations SET title = ? WHERE id = ?", (auto_title, conversation_id)) + conn.commit() + conn.close() + + response, sources = chat(user_message, conversation_id, settings, client_time=client_time) + save_message(conversation_id, "assistant", response, sources if settings.get("show_sources") else []) + + return JSONResponse({ + "response": response, + "sources": sources if settings.get("show_sources") else [], + "conversation_id": conversation_id + }) + +@app.get("/api/memory") +async def get_memory(auth: str = Depends(require_auth)): + return JSONResponse({"content": load_memory()}) + +@app.post("/api/memory") +async def update_memory(request: Request, auth: str = Depends(require_auth)): + data = await request.json() + content = data.get("content", "") + save_memory(content) + return JSONResponse({"saved": True}) + +@app.get("/api/status") +async def get_status(auth: str = Depends(require_auth)): + try: + pg = get_pg() + cur = pg.cursor() + cur.execute("SELECT COUNT(*) FROM embeddings") + chunk_count = cur.fetchone()[0] + pg.close() + except: + chunk_count = 0 + + # Watcher status + watcher_running = False + watcher_ingestion = {"status": "idle", "message": "", "file_count": 0} + last_indexed = "Unknown" + try: + import time as _time, json as _json + _sp = Path("/home/aaron/aaronai/watcher_status.json") + if _sp.exists(): + _s = _json.loads(_sp.read_text()) + _age = _time.time() - _s.get("timestamp", 0) + watcher_running = _s.get("running", False) and _age < 30 + watcher_ingestion = _s.get("ingestion", watcher_ingestion) + except: + pass + + try: + log_path = Path(WATCHER_LOG) + if log_path.exists(): + lines = log_path.read_text().strip().split("\n") + for line in reversed(lines): + if "Ingestion complete" in line: + last_indexed = line.split(" - ")[0].strip() + break + except: + pass + + # File count from watcher state + file_count = 0 + try: + state_path = Path(WATCHER_STATE) + if state_path.exists(): + state = json.loads(state_path.read_text()) + file_count = len(state) + else: + # Count files in Nextcloud directly + nc_path = Path(NEXTCLOUD_PATH) + if nc_path.exists(): + file_count = sum(1 for f in nc_path.rglob("*") + if f.is_file() and f.suffix.lower() in {'.pdf','.docx','.pptx','.txt','.md'}) + except: + pass + + # Conversation count + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute("SELECT COUNT(*) FROM conversations") + conv_count = c.fetchone()[0] + conn.close() + + return JSONResponse({ + "aaron_ai": "running", + "watcher": "running" if watcher_running else "stopped", + "watcher_ingestion": watcher_ingestion, + "chunk_count": chunk_count, + "file_count": file_count, + "last_indexed": last_indexed, + "conversation_count": conv_count, + "model": "claude-sonnet-4-6", + "nextcloud_path": NEXTCLOUD_PATH + }) + +@app.post("/api/transcribe") +async def transcribe_audio(request: Request, audio: UploadFile = File(...), auth: str = Depends(require_auth)): + if not whisper_model: + raise HTTPException(status_code=503, detail="Whisper not available") + try: + suffix = ".webm" + if audio.content_type and "mp4" in audio.content_type: + suffix = ".mp4" + elif audio.content_type and "ogg" in audio.content_type: + suffix = ".ogg" + with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: + content = await audio.read() + tmp.write(content) + tmp_path = tmp.name + segments, info = whisper_model.transcribe( + tmp_path, + language="en", + vad_filter=True, + initial_prompt=WHISPER_PROMPT + ) + transcript = " ".join(s.text.strip() for s in segments) + os.unlink(tmp_path) + return JSONResponse({"text": transcript, "language": info.language}) + except Exception as e: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + return JSONResponse({"ok": False, "error": str(e), "error_type": "transcription_failed"}, status_code=500) + +@app.get("/api/dreamer/status") +async def dreamer_status(auth: str = Depends(require_auth)): + try: + state_path = Path.home() / "aaronai" / "dreamer_state.json" + if state_path.exists(): + state = json.loads(state_path.read_text()) + else: + state = {} + last_ts = state.get("last_dream_timestamp", 0) + last_dt = datetime.fromtimestamp(last_ts).strftime("%Y-%m-%d %H:%M") if last_ts else "never" + raw_mode = state.get("last_dream_mode", "none") + display_mode = "full pipeline" if raw_mode == "pipeline" else raw_mode + return JSONResponse({ + "last_dream": last_dt, + "last_mode": display_mode, + "last_file": state.get("last_dream_file", ""), + }) + except Exception as e: + return JSONResponse({"last_dream": "unknown", "last_mode": "none", "last_file": "", "error": str(e)}) + +@app.post("/api/dreamer/run") +async def run_dreamer(request: Request, auth: str = Depends(require_auth)): + try: + body = await request.json() + mode = body.get("mode", "nrem") + task = body.get("task", None) + dream_script = str(Path.home() / "aaronai" / "scripts" / "dream.py") + cmd = [PYTHON, dream_script, "--mode", mode] + if task: + cmd += ["--task", task] + subprocess.Popen(cmd, cwd=str(Path.home() / "aaronai")) + return JSONResponse({"started": True, "mode": mode}) + except Exception as e: + return JSONResponse({"started": False, "error": str(e)}) + +def transcribe_and_save(tmp_path, timestamp, nextcloud_url, nextcloud_user, nextcloud_password): + """Background task — transcribes audio and saves to Nextcloud after endpoint returns.""" + import requests as req_lib + nc_auth = (nextcloud_user, nextcloud_password) + try: + segments, _ = whisper_model.transcribe( + tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT + ) + transcript = " ".join(s.text.strip() for s in segments).strip() + os.unlink(tmp_path) + if not transcript: + print(f"Async transcription empty for {timestamp} — nothing saved") + return + filename = f"{timestamp}-voice.md" + content_md = f"# Capture — {timestamp}\n\n**type:** voice\n**modality:** audio\n**status:** unprocessed\n\n---\n\n{transcript}\n" + captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures" + req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10) + url = f"{captures_dir}/{filename}" + req_lib.put(url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30) + print(f"Async transcription saved: {filename}") + # Notify SSE clients that transcription is complete + try: + import requests as _req + _req.post("http://localhost:8000/api/events/notify", json={ + "type": "capture_saved", + "filename": filename, + "timestamp": timestamp, + }, timeout=3) + _req.post("http://localhost:8000/api/captures/events/notify", json={ + "type": "capture_saved", + "filename": filename, + "timestamp": timestamp, + }, timeout=3) + except Exception: + pass + except Exception as e: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + print(f"Async transcription failed for {timestamp}: {e}") + + +@app.post("/api/capture") +async def capture_endpoint( + background_tasks: BackgroundTasks, + audio: UploadFile = File(None), + image: UploadFile = File(None), + project: str = Form(None), +): + """Auth-free capture endpoint — handles voice, image, or image+voice.""" + import requests as req_lib + import base64 + + nextcloud_url = os.getenv("NEXTCLOUD_URL", "") + nextcloud_user = os.getenv("NEXTCLOUD_USER", "aaron") + nextcloud_password = os.getenv("NEXTCLOUD_PASSWORD", "") + nc_auth = (nextcloud_user, nextcloud_password) + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M") + month_dir = datetime.now().strftime("%Y-%m") + + # ── Image + optional voice ─────────────────────────────────────────────── + if image is not None: + tmp_audio_path = None + try: + # Read image bytes + image_bytes = await image.read() + image_content_type = image.content_type or "image/jpeg" + # Determine extension + ext_map = {"image/jpeg": "jpg", "image/png": "png", "image/webp": "webp", "image/heic": "jpg"} + img_ext = ext_map.get(image_content_type, "jpg") + img_filename = f"{timestamp}-image.{img_ext}" + + # Save raw image to Media/YYYY-MM/ via WebDAV + media_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Media/{month_dir}" + req_lib.request("MKCOL", f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Media", auth=nc_auth, timeout=10) + req_lib.request("MKCOL", media_dir, auth=nc_auth, timeout=10) + media_url = f"{media_dir}/{img_filename}" + req_lib.put(media_url, data=image_bytes, auth=nc_auth, + headers={"Content-Type": image_content_type}, timeout=60) + + # Transcribe voice annotation if present + voice_annotation = None + if audio is not None and whisper_model: + audio_bytes = await audio.read() + suffix = ".webm" + if audio.content_type and "mp4" in audio.content_type: + suffix = ".mp4" + elif audio.content_type and "ogg" in audio.content_type: + suffix = ".ogg" + with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: + tmp.write(audio_bytes) + tmp_audio_path = tmp.name + segments, _ = whisper_model.transcribe( + tmp_audio_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT + ) + voice_annotation = " ".join(s.text.strip() for s in segments).strip() or None + os.unlink(tmp_audio_path) + tmp_audio_path = None + + # Generate Claude vision description + image_b64 = base64.standard_b64encode(image_bytes).decode("utf-8") + annotation_line = f"Aaron said about this image: \"{voice_annotation}\"" if voice_annotation else "" + vision_prompt = f"""You are generating a memory description for an AI corpus belonging to Aaron Nelson — computational designer, fabrication researcher, and visual artist working in the Hudson Valley. + +Describe this image for long-term memory indexing. + +PERCEPTUAL: Composition, materials, light, color, texture, scale, spatial relationships. Be specific enough that this image could be distinguished from visually similar images. + +CONTENT: What is this? What domain does it belong to? What is it an instance of? + +{annotation_line} + +End your response with a single line in this exact format: +ENTITIES: [comma-separated list of key entities — people, objects, materials, places, projects, tools] + +Keep the full description to 150-250 words. Do not speculate beyond what is visible or stated. Write as continuous prose followed by the ENTITIES line.""" + + vision_response = anthropic_client.messages.create( + model="claude-sonnet-4-6", + max_tokens=800, + messages=[{ + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": image_content_type, + "data": image_b64, + } + }, + {"type": "text", "text": vision_prompt} + ] + }] + ) + description = vision_response.content[0].text.strip() + + # Build rich Graphiti-ready episode markdown + capture_type = "image+voice" if voice_annotation else "image" + modality = "visual+audio" if voice_annotation else "visual" + media_path = f"Journal/Media/{month_dir}/{img_filename}" + + content_md = f"""# Capture — Image — {timestamp} + +**type:** {capture_type} +**modality:** {modality} +**status:** unprocessed +**media:** {media_path} +{f"**project:** {project}" if project else ""} + +--- + +**Visual description:** +{description} + +**Voice annotation:** +{voice_annotation if voice_annotation else "none recorded"} + +--- +""" + # Save description to Journal/Captures/ via WebDAV + captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures" + req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10) + cap_filename = f"{timestamp}-image.md" + cap_url = f"{captures_dir}/{cap_filename}" + req_lib.put(cap_url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30) + + return JSONResponse({ + "ok": True, + "filename": cap_filename, + "media": media_path, + "has_voice": voice_annotation is not None, + }) + + except Exception as e: + if tmp_audio_path and os.path.exists(tmp_audio_path): + os.unlink(tmp_audio_path) + return JSONResponse({"ok": False, "error": str(e), "error_type": "transcription_failed"}, status_code=500) + + # ── Voice only ─────────────────────────────────────────────────────────── + elif audio is not None: + if not whisper_model: + raise HTTPException(status_code=503, detail="Whisper not available") + try: + suffix = ".webm" + if audio.content_type and "mp4" in audio.content_type: + suffix = ".mp4" + elif audio.content_type and "ogg" in audio.content_type: + suffix = ".ogg" + with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: + content_bytes = await audio.read() + tmp.write(content_bytes) + tmp_path = tmp.name + background_tasks.add_task( + transcribe_and_save, + tmp_path=tmp_path, + timestamp=timestamp, + nextcloud_url=nextcloud_url, + nextcloud_user=nextcloud_user, + nextcloud_password=nextcloud_password, + ) + return JSONResponse({"ok": True, "filename": f"{timestamp}-voice.md", "async": True}) + except Exception as e: + return JSONResponse({"ok": False, "error": str(e), "error_type": "capture_failed"}) + + else: + raise HTTPException(status_code=400, detail="No audio or image provided") + +@app.get("/api/captures") +async def list_captures(): + """Returns recent captures from Nextcloud Journal/Captures/ — auth-free""" + try: + import requests as req_lib + nextcloud_url = os.getenv("NEXTCLOUD_URL", "") + nextcloud_user = os.getenv("NEXTCLOUD_USER", "aaron") + nextcloud_password = os.getenv("NEXTCLOUD_PASSWORD", "") + captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures" + auth = (nextcloud_user, nextcloud_password) + + propfind = req_lib.request("PROPFIND", captures_dir, auth=auth, timeout=10, + headers={"Depth": "1"}) + + if propfind.status_code == 404: + return JSONResponse({"captures": []}) + + import xml.etree.ElementTree as ET + root = ET.fromstring(propfind.text) + ns = {"d": "DAV:"} + captures = [] + for resp in root.findall("d:response", ns): + href = resp.findtext("d:href", namespaces=ns) or "" + if href.endswith("/"): + continue + name = href.split("/")[-1] + if not name.endswith(".md"): + continue + captures.append({"name": name.replace(".md", ""), "duration": ""}) + + captures.sort(key=lambda x: x["name"], reverse=True) + return JSONResponse({"captures": captures[:10]}) + except Exception as e: + return JSONResponse({"captures": []}) + +@app.post("/api/reindex") +async def trigger_reindex(auth: str = Depends(require_auth)): + try: + subprocess.Popen([PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH]) + return JSONResponse({"started": True, "message": "Re-indexing started in background"}) + except Exception as e: + return JSONResponse({"started": False, "error": str(e)}) + +@app.delete("/api/conversations") +async def clear_all_conversations(auth: str = Depends(require_auth)): + conn = sqlite3.connect(CONVERSATIONS_DB) + c = conn.cursor() + c.execute("DELETE FROM messages") + c.execute("DELETE FROM conversations") + conn.commit() + conn.close() + return JSONResponse({"cleared": True}) + + + +# ─── Corpus Integrity Endpoints ───────────────────────────────────────────── + +CORPUS_INTEGRITY_SCRIPT = str(Path.home() / "aaronai" / "scripts" / "corpus_integrity.py") +CORPUS_REPORT_PATH = Path.home() / "aaronai" / "corpus_integrity_report.json" +SUPPORTED_EXTS = {".pdf", ".docx", ".pptx", ".txt", ".md"} +MIGRATION_STATE_PATH = Path.home() / "aaronai" / "experiments" / "tier1_migration_state.json" + + +def get_corpus_status_data(): + fs_count = 0 + try: + root = Path(NEXTCLOUD_PATH) + for path in root.rglob("*"): + if path.is_file() and path.suffix.lower() in SUPPORTED_EXTS: + if path.name.startswith((".", "~$")): continue + if "Admin/Backups" in str(path) or "Backups" in path.parts: continue + if "Journal/Media" in str(path): continue + fs_count += 1 + except Exception: + pass + + pv_count = 0 + try: + pg = get_pg() + cur = pg.cursor() + cur.execute("SELECT COUNT(DISTINCT source) FROM embeddings WHERE source IS NOT NULL") + pv_count = cur.fetchone()[0] + pg.close() + except Exception: + pass + + gr_sources = set() + try: + if MIGRATION_STATE_PATH.exists(): + state = json.loads(MIGRATION_STATE_PATH.read_text()) + for fp in state.get("ingested", []): + gr_sources.add(Path(fp).name) + except Exception: + pass + try: + pg = get_pg() + cur = pg.cursor() + cur.execute("SELECT DISTINCT source FROM stage_3_queue WHERE completed_at IS NOT NULL") + for row in cur.fetchall(): gr_sources.add(row[0]) + pg.close() + except Exception: + pass + + failures = [] + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + SELECT source, filepath, error, retry_count, first_failed_at, last_failed_at + FROM ingest_failures WHERE resolved = FALSE + ORDER BY last_failed_at DESC LIMIT 50 + """) + for row in cur.fetchall(): + failures.append({ + "source": row[0], "filepath": row[1], "error": row[2], + "retry_count": row[3], "first_failed_at": str(row[4]), + "last_failed_at": str(row[5]), + }) + pg.close() + except Exception: + pass + + last_report = None + try: + if CORPUS_REPORT_PATH.exists(): + report = json.loads(CORPUS_REPORT_PATH.read_text()) + last_report = { + "timestamp": report.get("timestamp"), + "gaps": report.get("summary", {}).get("neither", 0), + "auto_queued": len(report.get("auto_queued", [])), + } + except Exception: + pass + + return { + "filesystem": fs_count, + "pgvector": pv_count, + "graphiti": len(gr_sources), + "failures": failures, + "failure_count": len(failures), + "last_reconciliation": last_report, + } + + +@app.get("/api/corpus/status") +async def corpus_status(auth: str = Depends(require_auth)): + try: + return JSONResponse(get_corpus_status_data()) + except Exception as e: + return JSONResponse({"error": str(e)}, status_code=500) + + +@app.post("/api/corpus/retry") +async def corpus_retry(request: Request, auth: str = Depends(require_auth)): + try: + body = await request.json() + source = body.get("source", "") + if not source: + return JSONResponse({"error": "source required"}, status_code=400) + pg = get_pg() + cur = pg.cursor() + cur.execute("SELECT filepath FROM ingest_failures WHERE source = %s", (source,)) + row = cur.fetchone() + pg.close() + if not row: + return JSONResponse({"error": "source not found in failures"}, status_code=404) + filepath = Path(row[0]) + if not filepath.exists(): + return JSONResponse({"error": f"file not found: {filepath}"}, status_code=404) + suffix = filepath.suffix.lower() + text = "" + try: + if suffix in {".txt", ".md"}: + text = filepath.read_text(encoding="utf-8", errors="ignore") + elif suffix == ".pdf": + from pypdf import PdfReader + text = "".join(p.extract_text() + "\n" for p in PdfReader(filepath).pages if p.extract_text()) + elif suffix == ".docx": + from docx import Document as DocxDocument + text = "\n".join(p.text for p in DocxDocument(filepath).paragraphs if p.text.strip()) + elif suffix == ".pptx": + from pptx import Presentation + prs = Presentation(filepath) + text = "\n".join(shape.text for slide in prs.slides for shape in slide.shapes + if hasattr(shape, "text") and shape.text.strip()) + except Exception as e: + return JSONResponse({"error": f"extraction failed: {e}"}, status_code=500) + if not text.strip(): + return JSONResponse({"error": "file produces empty text — may be corrupt"}, status_code=422) + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + INSERT INTO stage_2_queue (source, full_text, char_length) + VALUES (%s, %s, %s) + ON CONFLICT (source) DO UPDATE SET + full_text = EXCLUDED.full_text, char_length = EXCLUDED.char_length, + enqueued_at = NOW(), completed_at = NULL, failed_at = NULL, attempts = 0 + """, (source, text[:50000], len(text))) + cur.execute(""" + UPDATE ingest_failures SET retry_count = retry_count + 1, last_failed_at = NOW() + WHERE source = %s + """, (source,)) + pg.commit() + pg.close() + return JSONResponse({"queued": True, "source": source}) + except Exception as e: + return JSONResponse({"error": str(e)}, status_code=500) + + +@app.post("/api/corpus/reconcile") +async def corpus_reconcile(request: Request, background_tasks: BackgroundTasks, auth: str = Depends(require_auth)): + try: + body = await request.json() + fix = body.get("fix", True) + except Exception: + fix = True + def run_reconcile(): + try: + cmd = [PYTHON, CORPUS_INTEGRITY_SCRIPT] + if fix: + cmd.append("--fix") + subprocess.run(cmd, cwd=str(Path.home() / "aaronai"), timeout=300) + except Exception as e: + print(f"Reconciliation failed: {e}") + background_tasks.add_task(run_reconcile) + return JSONResponse({"started": True, "fix": fix}) + +# ─── Scheduler ────────────────────────────────────────────────────────────── +scheduler = BackgroundScheduler() + +def run_dream_job(): + """Runs nightly dreamer — full interdependent pipeline, no mode flag.""" + try: + import subprocess + dream_script = str(Path.home() / "aaronai" / "scripts" / "dream.py") + result = subprocess.run( + [PYTHON, dream_script], + cwd=str(Path.home() / "aaronai"), + capture_output=True, text=True, timeout=600 + ) + print(f"Dreamer completed: {result.stdout[-200:] if result.stdout else 'no output'}") + if result.returncode != 0: + print(f"Dreamer error: {result.stderr[-200:] if result.stderr else 'unknown'}") + except Exception as e: + print(f"Dreamer job failed: {e}") + +def run_ingest_job(): + """Runs nightly conversation indexing.""" + try: + import subprocess + ingest_script = str(Path.home() / "aaronai" / "scripts" / "ingest_conversations.py") + result = subprocess.run( + [PYTHON, ingest_script], + cwd=str(Path.home() / "aaronai"), + capture_output=True, text=True, timeout=300 + ) + print(f"Ingest completed: {result.stdout[-200:] if result.stdout else 'no output'}") + except Exception as e: + print(f"Ingest job failed: {e}") + +def reschedule_jobs(): + """Update scheduler from current settings.""" + settings = load_settings() + # Remove existing jobs + for job_id in ("dream_job", "ingest_job"): + try: + scheduler.remove_job(job_id) + except: + pass + # Add dream job + scheduler.add_job( + run_dream_job, + CronTrigger(hour=settings.get("dream_hour_utc", 8), + minute=settings.get("dream_minute_utc", 0), + timezone="UTC"), + id="dream_job", + max_instances=1, + replace_existing=True + ) + # Add ingest job + scheduler.add_job( + run_ingest_job, + CronTrigger(hour=settings.get("ingest_hour_utc", 2), + minute=settings.get("ingest_minute_utc", 30), + timezone="UTC"), + id="ingest_job", + max_instances=1, + replace_existing=True + ) + print(f"Scheduled: dream at {settings.get('dream_hour_utc',8):02d}:{settings.get('dream_minute_utc',0):02d} UTC, ingest at {settings.get('ingest_hour_utc',2):02d}:{settings.get('ingest_minute_utc',30):02d} UTC") + +# SSE client registry +sse_clients: list[asyncio.Queue] = [] +capture_sse_clients: list[asyncio.Queue] = [] + +async def sse_generator(queue: asyncio.Queue): + try: + yield 'data: {"type": "connected"}\n\n' + while True: + try: + event = await asyncio.wait_for(queue.get(), timeout=30.0) + import json as _json + yield 'data: ' + _json.dumps(event) + '\n\n' + except asyncio.TimeoutError: + yield 'data: {"type": "heartbeat"}\n\n' + + except asyncio.CancelledError: + pass + finally: + if queue in sse_clients: + sse_clients.remove(queue) + +async def capture_sse_generator(queue: asyncio.Queue): + try: + yield 'data: {"type": "connected"}\n\n' + while True: + try: + event = await asyncio.wait_for(queue.get(), timeout=30.0) + import json as _json + yield 'data: ' + _json.dumps(event) + '\n\n' + except asyncio.TimeoutError: + yield 'data: {"type": "heartbeat"}\n\n' + except asyncio.CancelledError: + pass + finally: + if queue in capture_sse_clients: + capture_sse_clients.remove(queue) + +@app.get("/api/captures/events") +async def capture_sse_endpoint(request: Request): + """Public SSE endpoint for capture page — no auth required.""" + queue: asyncio.Queue = asyncio.Queue() + capture_sse_clients.append(queue) + return StreamingResponse( + capture_sse_generator(queue), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "X-Accel-Buffering": "no", + "Connection": "keep-alive", + } + ) + +@app.post("/api/captures/events/notify") +async def notify_capture_clients(request: Request): + """Internal endpoint — called when transcription completes.""" + client_host = request.client.host if request.client else "" + if client_host not in ("127.0.0.1", "::1", "localhost"): + raise HTTPException(status_code=403, detail="Internal only") + data = await request.json() + for queue in capture_sse_clients: + await queue.put(data) + return JSONResponse({"notified": len(capture_sse_clients)}) + +@app.get("/api/events") +async def sse_endpoint(request: Request, auth: str = Depends(require_auth)): + queue: asyncio.Queue = asyncio.Queue() + sse_clients.append(queue) + return StreamingResponse( + sse_generator(queue), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "X-Accel-Buffering": "no", + "Connection": "keep-alive", + } + ) + +@app.post("/api/events/notify") +async def notify_clients(request: Request): + """Internal endpoint — called by dream.py when a dream is delivered""" + # Only allow from localhost + client_host = request.client.host if request.client else "" + if client_host not in ("127.0.0.1", "::1", "localhost"): + raise HTTPException(status_code=403, detail="Internal only") + data = await request.json() + for queue in sse_clients: + await queue.put(data) + return JSONResponse({"notified": len(sse_clients)}) + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/scripts/consolidator_v0_1.py.bak b/scripts/consolidator_v0_1.py.bak new file mode 100644 index 0000000..15eb08e --- /dev/null +++ b/scripts/consolidator_v0_1.py.bak @@ -0,0 +1,442 @@ +""" +Consolidator 0.1 — alias resolution agent for BirdAI's Tier 1 substrate. + +Reads entities from FalkorDB group_id 'aaron', infers light type labels, +computes pairwise similarity within type blocks using ego summary embedding + +name string distance + neighbor pattern overlap, generates merge proposals +above threshold, writes proposal log for human review. + +Does NOT execute merges. 0.1 is the calibration phase — proposals only, +human reviews before any action. +""" +import json +import re +import os +from datetime import datetime, timezone +from collections import defaultdict +from pathlib import Path + +from falkordb import FalkorDB +import numpy as np + +# Configuration +GROUP_ID = "aaron" +HIGH_CONFIDENCE_THRESHOLD = 0.85 # propose merge above this +LOW_CONFIDENCE_THRESHOLD = 0.65 # log as low-confidence below +PROPOSALS_DIR = Path("/home/aaron/Nextcloud/Journal/Consolidation") +PROPOSALS_DIR.mkdir(parents=True, exist_ok=True) + + +def cosine_similarity(a, b): + """Cosine similarity between two embedding vectors.""" + a = np.array(a, dtype=np.float32) + b = np.array(b, dtype=np.float32) + na = np.linalg.norm(a) + nb = np.linalg.norm(b) + if na == 0 or nb == 0: + return 0.0 + return float(np.dot(a, b) / (na * nb)) + + +def name_similarity(name_a, name_b): + """ + Token-overlap-based name similarity. + Handles formal/informal pairs (Aaron / Aaron Nelson), + abbreviation pairs (HVAMC / Hudson Valley AMC), + and simple transcription noise. + """ + a_lower = name_a.lower().strip() + b_lower = name_b.lower().strip() + + if a_lower == b_lower: + return 1.0 + + # Tokenize + a_tokens = set(re.findall(r'\b\w+\b', a_lower)) + b_tokens = set(re.findall(r'\b\w+\b', b_lower)) + + if not a_tokens or not b_tokens: + return 0.0 + + # Substring containment (handles "Aaron" in "Aaron Nelson") + if a_lower in b_lower or b_lower in a_lower: + # Strong signal but not 1.0 — different lengths + shorter = min(len(a_lower), len(b_lower)) + longer = max(len(a_lower), len(b_lower)) + return 0.7 + 0.2 * (shorter / longer) + + # Token Jaccard (handles "Aaron Nelson" vs "Nelson, Aaron") + intersection = a_tokens & b_tokens + union = a_tokens | b_tokens + jaccard = len(intersection) / len(union) + + # Acronym check (HVAMC vs Hudson Valley Additive Manufacturing Center) + def is_acronym(short, full): + if len(short) >= len(full): + return False + if not short.isupper(): + short_upper = short.upper() + else: + short_upper = short + full_words = full.split() + if len(full_words) < 2: + return False + first_letters = ''.join(w[0].upper() for w in full_words if w) + return short_upper == first_letters or short_upper in first_letters + + if is_acronym(name_a, name_b) or is_acronym(name_b, name_a): + return 0.85 + + return jaccard + + +def infer_type(entity_name, summary): + """ + Light type inference for blocking. Heuristic-based, transparent. + Returns one of: person, organization, project, place, concept, unknown. + + NOT a precise classification — just enough to avoid obviously wrong + cross-type comparisons (person vs project). When in doubt, return + 'unknown' which gets compared against everything. + """ + name_lower = entity_name.lower().strip() + summary_lower = (summary or "").lower() + + # Person: name patterns + person_indicators = [ + # First+Last name pattern (two title-cased words, no other tokens) + bool(re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+$', entity_name.strip())), + # Single name that's also in the summary as a person + any(phrase in summary_lower for phrase in [ + 'is a person', 'is a professor', 'is an artist', 'is a colleague', + 'is a friend', 'is a family member', 'works at', 'studied at', + "'s spouse", "'s child", "'s parent", "'s student", + ]), + ] + if any(person_indicators): + return "person" + + # Organization: company/institution indicators + org_indicators = [ + any(suffix in name_lower for suffix in [ + ' inc', ' llc', ' corp', ' company', ' university', ' college', + ' school', ' institute', ' foundation', ' department', + ]), + any(phrase in summary_lower for phrase in [ + 'is a company', 'is a university', 'is an organization', + 'is an institution', 'is a department', 'is a nonprofit', + ]), + ] + if any(org_indicators): + return "organization" + + # Project: software/creative work indicators + project_indicators = [ + any(phrase in summary_lower for phrase in [ + 'is a project', 'software project', 'is a codebase', + 'is a tool', 'is a system', 'is an application', + 'is a research project', 'is a design project', + ]), + any(suffix in name_lower for suffix in [' project', ' system', ' platform']), + ] + if any(project_indicators): + return "project" + + # Place: location indicators + place_indicators = [ + any(phrase in summary_lower for phrase in [ + 'is a city', 'is a town', 'is a state', 'is a country', + 'is a neighborhood', 'is a region', 'is a location', + ]), + ] + if any(place_indicators): + return "place" + + # Default + return "unknown" + + +def get_neighbors(graph, entity_uuid, limit=20): + """Get the names of entities connected to this entity (1-hop).""" + query = """ + MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-(other:Entity) + RETURN DISTINCT other.name AS name + LIMIT $limit + """ + result = graph.query(query, {"uuid": entity_uuid, "limit": limit}) + return set(row[0] for row in result.result_set if row[0]) + + +def neighbor_jaccard(neighbors_a, neighbors_b): + """Jaccard similarity of two neighbor sets.""" + if not neighbors_a and not neighbors_b: + return 0.0 + intersection = neighbors_a & neighbors_b + union = neighbors_a | neighbors_b + if not union: + return 0.0 + return len(intersection) / len(union) + + +def get_edge_count(graph, entity_uuid): + query = """ + MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-() + RETURN count(r) AS c + """ + result = graph.query(query, {"uuid": entity_uuid}) + return result.result_set[0][0] if result.result_set else 0 + + +def combine_signals(name_sim, ego_sim, neighbor_sim): + """ + Combine the three similarity signals into a single confidence score. + + Weighting based on DEG-RAG findings: ego info is essential, neighbor + cues help in some settings, name similarity is a strong tie-breaker + but not the primary signal. + + For 0.1, simple weighted average with floor based on ego_sim alone. + """ + # If ego similarity is very low, the entities probably aren't aliases + # regardless of name match (different concepts can share names) + if ego_sim < 0.4: + return min(0.5, ego_sim) + + # If name is very similar AND ego is at least moderate, high confidence + if name_sim >= 0.85 and ego_sim >= 0.65: + return 0.5 * ego_sim + 0.3 * name_sim + 0.2 * neighbor_sim + + # Standard weighted average + return 0.5 * ego_sim + 0.25 * name_sim + 0.25 * neighbor_sim + + +def generate_proposals(): + db = FalkorDB(host='localhost', port=6379) + graph = db.select_graph(GROUP_ID) + + # Pull all entities with embeddings + print(f"Fetching entities from group_id '{GROUP_ID}'...") + result = graph.query(""" + MATCH (n:Entity) + WHERE n.name_embedding IS NOT NULL AND n.summary IS NOT NULL + RETURN n.uuid, n.name, n.summary, n.name_embedding + """) + + entities = [] + for row in result.result_set: + entities.append({ + 'uuid': row[0], + 'name': row[1], + 'summary': row[2], + 'embedding': row[3], + }) + print(f" Loaded {len(entities)} entities with embeddings") + + # Infer types for blocking + print("Inferring entity types for blocking...") + type_counts = defaultdict(int) + for e in entities: + e['inferred_type'] = infer_type(e['name'], e['summary']) + type_counts[e['inferred_type']] += 1 + for t, c in sorted(type_counts.items(), key=lambda x: -x[1]): + print(f" {t}: {c}") + + # Group by inferred type for blocking + blocks = defaultdict(list) + for e in entities: + blocks[e['inferred_type']].append(e) + + # 'unknown' entities get compared against everything (they might be any type) + # Other types only get compared within their type block + against unknowns + print() + print("Comparing entities within type blocks...") + proposals = [] + low_confidence = [] + comparisons_done = 0 + + # Build comparison pairs + pairs_to_compare = [] + typed_blocks = {t: ents for t, ents in blocks.items() if t != 'unknown'} + unknown_block = blocks.get('unknown', []) + + # Within-type pairs (excluding unknown) + for t, ents in typed_blocks.items(): + for i in range(len(ents)): + for j in range(i + 1, len(ents)): + pairs_to_compare.append((ents[i], ents[j])) + + # Unknown vs unknown + for i in range(len(unknown_block)): + for j in range(i + 1, len(unknown_block)): + pairs_to_compare.append((unknown_block[i], unknown_block[j])) + + # Unknown vs typed (unknowns might be any type) + for ent_unknown in unknown_block: + for t, ents in typed_blocks.items(): + for ent_typed in ents: + pairs_to_compare.append((ent_unknown, ent_typed)) + + print(f" Pairs to compare: {len(pairs_to_compare):,}") + + # Compute similarities + cache_neighbors = {} + def neighbors_cached(uuid): + if uuid not in cache_neighbors: + cache_neighbors[uuid] = get_neighbors(graph, uuid) + return cache_neighbors[uuid] + + for ent_a, ent_b in pairs_to_compare: + comparisons_done += 1 + if comparisons_done % 5000 == 0: + print(f" ... {comparisons_done:,} / {len(pairs_to_compare):,}") + + # Quick filter: skip if name similarity is very low and names are clearly different + name_sim = name_similarity(ent_a['name'], ent_b['name']) + ego_sim_quick = cosine_similarity(ent_a['embedding'], ent_b['embedding']) + + # Pre-filter to avoid expensive neighbor query on obviously different pairs + if ego_sim_quick < 0.5 and name_sim < 0.3: + continue + + # Full comparison + neighbors_a = neighbors_cached(ent_a['uuid']) + neighbors_b = neighbors_cached(ent_b['uuid']) + neighbor_sim = neighbor_jaccard(neighbors_a, neighbors_b) + + confidence = combine_signals(name_sim, ego_sim_quick, neighbor_sim) + + record = { + 'entity_a': { + 'uuid': ent_a['uuid'], + 'name': ent_a['name'], + 'type': ent_a['inferred_type'], + 'summary': ent_a['summary'][:200], + 'edge_count': get_edge_count(graph, ent_a['uuid']), + }, + 'entity_b': { + 'uuid': ent_b['uuid'], + 'name': ent_b['name'], + 'type': ent_b['inferred_type'], + 'summary': ent_b['summary'][:200], + 'edge_count': get_edge_count(graph, ent_b['uuid']), + }, + 'confidence': round(confidence, 3), + 'signals': { + 'name_similarity': round(name_sim, 3), + 'ego_similarity': round(ego_sim_quick, 3), + 'neighbor_overlap': round(neighbor_sim, 3), + }, + 'shared_neighbors': sorted(list(neighbors_a & neighbors_b))[:10], + } + + if confidence >= HIGH_CONFIDENCE_THRESHOLD: + proposals.append(record) + elif confidence >= LOW_CONFIDENCE_THRESHOLD: + low_confidence.append(record) + + print(f"\nDone. Proposals: {len(proposals)}, Low-confidence: {len(low_confidence)}") + return proposals, low_confidence, len(entities), len(pairs_to_compare) + + +def write_proposals_log(proposals, low_confidence, total_entities, total_comparisons): + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M") + out_path = PROPOSALS_DIR / f"proposals-{timestamp}.md" + + proposals_sorted = sorted(proposals, key=lambda p: -p['confidence']) + low_sorted = sorted(low_confidence, key=lambda p: -p['confidence']) + + lines = [] + lines.append(f"# Consolidator 0.1 — Run {timestamp}") + lines.append("") + lines.append("## Statistics") + lines.append(f"- Entities scanned: {total_entities:,}") + lines.append(f"- Pairwise comparisons: {total_comparisons:,}") + lines.append(f"- High-confidence proposals (≥{HIGH_CONFIDENCE_THRESHOLD}): {len(proposals)}") + lines.append(f"- Low-confidence candidates ({LOW_CONFIDENCE_THRESHOLD}-{HIGH_CONFIDENCE_THRESHOLD}): {len(low_confidence)}") + lines.append("") + lines.append("## How to review") + lines.append("") + lines.append("For each proposal, mark your decision by changing `[ ]` to one of:") + lines.append("- `[APPROVE]` — execute this merge on next run") + lines.append("- `[REJECT]` — don't merge, don't propose again") + lines.append("- `[DEFER]` — re-surface in next run for further consideration") + lines.append("") + lines.append("Save the file when done. Do not modify proposal_id or uuid fields.") + lines.append("") + lines.append("---") + lines.append("") + lines.append(f"## Proposed Merges (n={len(proposals)})") + lines.append("") + + for i, p in enumerate(proposals_sorted, start=1): + lines.append(f"### Proposal {i}") + lines.append("") + lines.append(f"**Decision:** [ ]") + lines.append("") + lines.append(f"**Confidence:** {p['confidence']}") + lines.append("") + lines.append(f"**Entity A:** \"{p['entity_a']['name']}\" (type: {p['entity_a']['type']}, {p['entity_a']['edge_count']} edges)") + lines.append(f" - uuid: `{p['entity_a']['uuid']}`") + lines.append(f" - summary: {p['entity_a']['summary']}") + lines.append("") + lines.append(f"**Entity B:** \"{p['entity_b']['name']}\" (type: {p['entity_b']['type']}, {p['entity_b']['edge_count']} edges)") + lines.append(f" - uuid: `{p['entity_b']['uuid']}`") + lines.append(f" - summary: {p['entity_b']['summary']}") + lines.append("") + lines.append(f"**Signals:**") + lines.append(f" - Name similarity: {p['signals']['name_similarity']}") + lines.append(f" - Ego (summary) similarity: {p['signals']['ego_similarity']}") + lines.append(f" - Neighbor overlap: {p['signals']['neighbor_overlap']}") + if p['shared_neighbors']: + shared_str = ', '.join(f'"{n}"' for n in p['shared_neighbors'][:8]) + lines.append(f" - Shared neighbors (sample): {shared_str}") + lines.append("") + lines.append("**Optional rejection note:** ") + lines.append("") + lines.append("---") + lines.append("") + + lines.append("") + lines.append(f"## Low-Confidence Candidates (n={len(low_confidence)}, informational only, no action)") + lines.append("") + for p in low_sorted[:30]: + lines.append(f"- **{p['confidence']}** \"{p['entity_a']['name']}\" + \"{p['entity_b']['name']}\" (name={p['signals']['name_similarity']}, ego={p['signals']['ego_similarity']}, nbr={p['signals']['neighbor_overlap']})") + if len(low_sorted) > 30: + lines.append(f"- *(...{len(low_sorted) - 30} more not shown)*") + + out_path.write_text("\n".join(lines)) + print(f"\nProposal log written to: {out_path}") + + # Also save raw JSON for downstream tooling + json_path = PROPOSALS_DIR / f"proposals-{timestamp}.json" + with open(json_path, 'w') as f: + json.dump({ + 'run_timestamp': timestamp, + 'statistics': { + 'total_entities': total_entities, + 'total_comparisons': total_comparisons, + 'proposal_count': len(proposals), + 'low_confidence_count': len(low_confidence), + }, + 'proposals': proposals_sorted, + 'low_confidence': low_sorted, + }, f, indent=2) + print(f"Raw JSON: {json_path}") + + +def main(): + print("=" * 70) + print("Consolidator 0.1 — Calibration Phase") + print("=" * 70) + print() + + proposals, low_confidence, total_entities, total_comparisons = generate_proposals() + write_proposals_log(proposals, low_confidence, total_entities, total_comparisons) + + print() + print("Next: review the proposals markdown file and mark APPROVE/REJECT/DEFER") + print("for each proposal. Re-run will read decisions and execute approved merges.") + + +if __name__ == "__main__": + main() diff --git a/scripts/corpus_integrity.py b/scripts/corpus_integrity.py index c936dba..4208ba6 100644 --- a/scripts/corpus_integrity.py +++ b/scripts/corpus_integrity.py @@ -135,7 +135,7 @@ def queue_for_retry(source, full_text, filepath): ON CONFLICT (source) DO UPDATE SET full_text = EXCLUDED.full_text, char_length = EXCLUDED.char_length, enqueued_at = NOW(), completed_at = NULL, failed_at = NULL, attempts = 0 - """, (source, full_text[:50000], len(full_text))) + """, (source, full_text, len(full_text))) pg.commit() pg.close() return True diff --git a/scripts/corpus_integrity.py.bak.20260501-021703 b/scripts/corpus_integrity.py.bak.20260501-021703 new file mode 100644 index 0000000..c936dba --- /dev/null +++ b/scripts/corpus_integrity.py.bak.20260501-021703 @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 +""" +corpus_integrity.py — BirdAI Corpus Integrity Check + +Compares three sources of truth: + 1. Filesystem (Nextcloud) — what files exist + 2. pgvector (embeddings table) — what's been through Stage 1 + 3. Graphiti (migration state + stage_3_queue) — what's been through Stage 3 + +Usage: + python3 corpus_integrity.py # report only + python3 corpus_integrity.py --fix # report + auto-queue gaps for retry + python3 corpus_integrity.py --json # output JSON to stdout +""" + +import os +import sys +import json +import argparse +from pathlib import Path +from datetime import datetime + +import psycopg2 +from dotenv import load_dotenv + +load_dotenv(Path.home() / "aaronai" / ".env", override=True) + +NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files" +MIGRATION_STATE = str(Path.home() / "aaronai" / "experiments" / "tier1_migration_state.json") +REPORT_PATH = str(Path.home() / "aaronai" / "corpus_integrity_report.json") +SUPPORTED = {".pdf", ".docx", ".pptx", ".txt", ".md"} +PG_DSN = os.getenv("PG_DSN") + + +def get_pg(): + return psycopg2.connect(PG_DSN) + + +def get_filesystem_files(): + files = [] + root = Path(NEXTCLOUD_PATH) + for path in root.rglob("*"): + if path.is_dir(): continue + if path.suffix.lower() not in SUPPORTED: continue + if path.name.startswith((".", "~$")): continue + if "Admin/Backups" in str(path) or "Backups" in path.parts: continue + if "Journal/Media" in str(path): continue + files.append({"source": path.name, "filepath": str(path), + "size": path.stat().st_size, "mtime": path.stat().st_mtime}) + return files + + +def get_pgvector_sources(): + try: + pg = get_pg() + cur = pg.cursor() + cur.execute("SELECT DISTINCT source FROM embeddings WHERE source IS NOT NULL") + sources = {row[0] for row in cur.fetchall()} + pg.close() + return sources + except Exception as e: + print(f"ERROR: pgvector: {e}", file=sys.stderr) + return set() + + +def get_graphiti_sources(): + sources = set() + try: + state_path = Path(MIGRATION_STATE) + if state_path.exists(): + state = json.loads(state_path.read_text()) + for filepath in state.get("ingested", []): + sources.add(Path(filepath).name) + except Exception as e: + print(f"WARNING: migration state: {e}", file=sys.stderr) + try: + pg = get_pg() + cur = pg.cursor() + cur.execute("SELECT DISTINCT source FROM stage_3_queue WHERE completed_at IS NOT NULL") + for row in cur.fetchall(): sources.add(row[0]) + pg.close() + except Exception as e: + print(f"WARNING: stage_3_queue: {e}", file=sys.stderr) + return sources + + +def get_ingest_failures(): + failures = {} + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + SELECT source, filepath, error, retry_count, first_failed_at, last_failed_at + FROM ingest_failures WHERE resolved = FALSE ORDER BY last_failed_at DESC + """) + for row in cur.fetchall(): + failures[row[0]] = {"source": row[0], "filepath": row[1], "error": row[2], + "retry_count": row[3], "first_failed_at": str(row[4]), + "last_failed_at": str(row[5])} + pg.close() + except Exception as e: + print(f"WARNING: ingest_failures: {e}", file=sys.stderr) + return failures + + +def extract_text_for_retry(filepath): + path = Path(filepath) + suffix = path.suffix.lower() + try: + if suffix == ".docx": + from docx import Document as D + return "\n".join(p.text for p in D(path).paragraphs if p.text.strip()) + elif suffix == ".pdf": + from pypdf import PdfReader + return "".join(p.extract_text() + "\n" for p in PdfReader(path).pages if p.extract_text()) + elif suffix == ".pptx": + from pptx import Presentation + prs = Presentation(path) + return "\n".join(shape.text for slide in prs.slides for shape in slide.shapes + if hasattr(shape, "text") and shape.text.strip()) + elif suffix in {".txt", ".md"}: + return path.read_text(encoding="utf-8", errors="ignore") + except Exception as e: + print(f"WARNING: extraction failed {path.name}: {e}", file=sys.stderr) + return "" + + +def queue_for_retry(source, full_text, filepath): + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + INSERT INTO stage_2_queue (source, full_text, char_length) + VALUES (%s, %s, %s) + ON CONFLICT (source) DO UPDATE SET + full_text = EXCLUDED.full_text, char_length = EXCLUDED.char_length, + enqueued_at = NOW(), completed_at = NULL, failed_at = NULL, attempts = 0 + """, (source, full_text[:50000], len(full_text))) + pg.commit() + pg.close() + return True + except Exception as e: + print(f"WARNING: queue failed {source}: {e}", file=sys.stderr) + return False + + +def run_reconciliation(fix=False): + print(f"BirdAI Corpus Integrity Check — {datetime.now().isoformat()}") + print() + print("Scanning filesystem...") + fs_files = get_filesystem_files() + fs_sources = {f["source"]: f for f in fs_files} + print(f" Filesystem: {len(fs_files)} files") + print("Querying pgvector...") + pv_sources = get_pgvector_sources() + print(f" pgvector: {len(pv_sources)} distinct sources") + print("Querying Graphiti...") + gr_sources = get_graphiti_sources() + print(f" Graphiti: {len(gr_sources)} sources") + print("Querying ingest failures...") + failures = get_ingest_failures() + print(f" Failures: {len(failures)} unresolved") + print() + + both, pv_only, neither, gr_only = [], [], [], [] + for source, finfo in fs_sources.items(): + in_pv = source in pv_sources + in_gr = source in gr_sources + if in_pv and in_gr: both.append(finfo) + elif in_pv: pv_only.append(finfo) + elif in_gr: gr_only.append(finfo) + else: neither.append(finfo) + + orphans_pv = pv_sources - set(fs_sources.keys()) + orphans_gr = gr_sources - set(fs_sources.keys()) + + print(f"Results:") + print(f" Both (pgvector + Graphiti): {len(both)}") + print(f" pgvector only: {len(pv_only)}") + print(f" Neither (corpus gap): {len(neither)}") + print(f" Graphiti only: {len(gr_only)}") + print(f" Ingest failures: {len(failures)}") + print(f" pgvector orphans: {len(orphans_pv)}") + print(f" Graphiti orphans: {len(orphans_gr)}") + print() + + auto_queued = [] + if fix and neither: + print(f"Auto-queuing {len(neither)} gap files...") + for finfo in neither: + text = extract_text_for_retry(finfo["filepath"]) + if text.strip(): + if queue_for_retry(finfo["source"], text, finfo["filepath"]): + auto_queued.append(finfo["source"]) + print(f" Queued: {finfo['source']}") + else: + print(f" Skipped (unreadable): {finfo['source']}") + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at) + VALUES (%s, %s, %s, 0, NOW(), NOW()) + ON CONFLICT (source) DO UPDATE SET + error = EXCLUDED.error, + last_failed_at = NOW() + """, (finfo["source"], finfo["filepath"], + "Empty text — likely scanned, encrypted, or corrupt. Requires manual review or OCR.")) + pg.commit() + pg.close() + except Exception as e: + print(f" WARNING: could not record failure: {e}") + print() + + report = { + "timestamp": datetime.now().isoformat(), + "summary": { + "filesystem_total": len(fs_files), "pgvector_total": len(pv_sources), + "graphiti_total": len(gr_sources), "both": len(both), + "pgvector_only": len(pv_only), "neither": len(neither), + "graphiti_only": len(gr_only), "failures": len(failures), + "orphans_pgvector": len(orphans_pv), "orphans_graphiti": len(orphans_gr), + }, + "gaps": [f["source"] for f in neither], + "failures": list(failures.values()), + "auto_queued": auto_queued, + "pgvector_only_sample": [f["source"] for f in pv_only[:20]], + "graphiti_only": list(gr_only), + } + Path(REPORT_PATH).write_text(json.dumps(report, indent=2)) + print(f"Report written to: {REPORT_PATH}") + return report + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--fix", action="store_true") + parser.add_argument("--json", action="store_true") + args = parser.parse_args() + report = run_reconciliation(fix=args.fix) + if args.json: + print(json.dumps(report, indent=2)) + +if __name__ == "__main__": + main() diff --git a/scripts/dream.py b/scripts/dream.py index 77f851e..ead2b36 100644 --- a/scripts/dream.py +++ b/scripts/dream.py @@ -111,11 +111,16 @@ def get_recent_conversation_topics(days=14): # ─── Stage 2: Retrieve ────────────────────────────────────────────────────── -def retrieve_graphiti(mode, task=None, n_results=8): +def retrieve_graphiti(mode, task=None, n_results=8, excluded_sources=None): """E3 experiment — Graphiti substrate retrieval. Queries Graphiti /search endpoint instead of pgvector. Returns chunks in same format as retrieve() for pipeline compatibility. Note: content is Graphiti facts (synthesized relationships), not raw chunks. + + Over-fetches by 3x to allow in-process filtering against excluded_sources, + matching the cross-stage exclusion mechanism the pgvector branch uses. + Without this filter, NREM/Early REM/Late REM would see overlapping content + and the score-band Early REM exclusion (v1.1) would not apply in Graphiti mode. """ import requests as req_lib if task: @@ -129,25 +134,38 @@ def retrieve_graphiti(mode, task=None, n_results=8): else: query = "research fabrication teaching practice recent work" + excluded_sources = excluded_sources or set() + # Over-fetch so in-process exclusion still leaves enough results + fetch_limit = n_results * 3 if excluded_sources else n_results + try: resp = req_lib.get( "http://localhost:8001/search", - params={"query": query, "limit": n_results, "group_id": "aaron"}, + params={"query": query, "limit": fetch_limit, "group_id": "aaron"}, timeout=30, ) resp.raise_for_status() results = resp.json().get("results", []) chunks = [] + seen_sources = set() for r in results: fact = r.get("fact", "") if not fact.strip(): continue + source = r.get("source", "graphiti") + if source in excluded_sources: + continue + if source in seen_sources: + continue chunks.append({ - "source": r.get("source", "graphiti"), + "source": source, "content": fact, "relevance": r.get("score", 0.5), "similarity": r.get("score", 0.5), }) + seen_sources.add(source) + if len(chunks) >= n_results: + break return chunks except Exception as e: print(f"[Graphiti retrieval error: {e}] — falling back to empty.") @@ -158,7 +176,7 @@ def retrieve(mode, task=None, n_results=8, excluded_sources=None): # Default behavior: pgvector similarity search (unchanged) substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector") if substrate == "graphiti": - return retrieve_graphiti(mode, task=task, n_results=n_results) + return retrieve_graphiti(mode, task=task, n_results=n_results, excluded_sources=excluded_sources) from sentence_transformers import SentenceTransformer embedder = SentenceTransformer("all-MiniLM-L6-v2") low, high = MODE_RANGES[mode] diff --git a/scripts/dream.py.bak b/scripts/dream.py.bak new file mode 100644 index 0000000..ed8da8d --- /dev/null +++ b/scripts/dream.py.bak @@ -0,0 +1,554 @@ +""" +Aaron AI Dreamer — Active Inference Engine +Interdependent stage architecture grounded in sleep consolidation research. + +Nightly pipeline: NREM → Early REM → Late REM → Synthesis +Each stage receives the previous stage's output as context. +Lucid mode is on-demand only (Dream Now from settings). + +Research basis: +- Singh et al. PNAS 2022: alternating NREM/REM outperforms single-stage approaches +- Klinzing et al. Nature Neuroscience 2019: SO-spindle-ripple coupling is interdependent +- REM operates on what NREM produced — stages are not discrete alternatives +""" + +import os +import json +import sqlite3 +import argparse +from pathlib import Path +from datetime import datetime, timedelta +from dotenv import load_dotenv +import psycopg2 +import hashlib + +load_dotenv(Path.home() / "aaronai" / ".env") + +PG_DSN = os.getenv("PG_DSN") + +def get_pg(): + return psycopg2.connect(PG_DSN) + +# ─── Paths ────────────────────────────────────────────────────────────────── +CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db") +WATCHER_STATE = str(Path.home() / "aaronai" / "watcher_state.json") +DREAMER_STATE = str(Path.home() / "aaronai" / "dreamer_state.json") +JOURNAL_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Journal/Daily" + +NEXTCLOUD_URL = os.getenv("NEXTCLOUD_URL", "https://nextcloud.aaronnelson.studio") +NEXTCLOUD_USER = os.getenv("NEXTCLOUD_USER", "aaron") +NEXTCLOUD_PASSWORD = os.getenv("NEXTCLOUD_PASSWORD", "") +DREAMS_WEBDAV = f"{NEXTCLOUD_URL}/remote.php/dav/files/{NEXTCLOUD_USER}/Journal/Dreams" + +# Similarity ranges calibrated for all-MiniLM-L6-v2 +MODE_RANGES = { + "nrem": (0.48, 0.72), + "early-rem": (0.38, 0.55), + "late-rem": (0.22, 0.42), + "lucid": (0.32, 0.72), +} + +# ─── Prompt versioning ────────────────────────────────────────────────────── +# Bump the relevant constant manually when changing a prompt. +PROMPT_VERSION_NREM = "1.0" +PROMPT_VERSION_EREM = "1.1" +PROMPT_VERSION_LREM = "1.2" +PROMPT_VERSION_SYN = "1.0" + +def prompt_signature(): + return (f"nrem={PROMPT_VERSION_NREM}|erem={PROMPT_VERSION_EREM}" + f"|lrem={PROMPT_VERSION_LREM}|syn={PROMPT_VERSION_SYN}") + +def prompt_hash(prompts: list[str]) -> str: + combined = "".join(prompts) + return hashlib.md5(combined.encode()).hexdigest()[:8] + +# ─── Stage 1: Observe ─────────────────────────────────────────────────────── + +def observe_corpus(): + state = load_dreamer_state() + last_dream = state.get("last_dream_timestamp", 0) + new_chunk_count = 0 + try: + watcher_state = json.loads(Path(WATCHER_STATE).read_text()) + for path, mtime in watcher_state.items(): + if float(mtime) > last_dream: + new_chunk_count += 1 + except: + pass + days_since = (datetime.now().timestamp() - last_dream) / 86400 + recent_topics = get_recent_conversation_topics() + return { + "new_chunks": new_chunk_count, + "days_since_dream": days_since, + "recent_topics": recent_topics, + "last_dream": last_dream, + } + +def get_recent_conversation_topics(days=14): + try: + conn = sqlite3.connect(CONVERSATIONS_DB) + cutoff = (datetime.now() - timedelta(days=days)).isoformat() + c = conn.cursor() + c.execute(""" + SELECT m.content FROM messages m + JOIN conversations c ON m.conversation_id = c.id + WHERE m.role = 'user' AND c.updated_at > ? + ORDER BY m.timestamp DESC LIMIT 20 + """, (cutoff,)) + rows = c.fetchall() + conn.close() + return [r[0][:200] for r in rows] + except: + return [] + +# ─── Stage 2: Retrieve ────────────────────────────────────────────────────── + +def retrieve(mode, task=None, n_results=8): + from sentence_transformers import SentenceTransformer + embedder = SentenceTransformer("all-MiniLM-L6-v2") + low, high = MODE_RANGES[mode] + + if task: + query = task + elif mode == "late-rem": + delta = observe_corpus() + topics = delta.get("recent_topics", []) + query = topics[0] if topics else "practice place memory making" + elif mode == "early-rem": + query = "career decision personal change what matters next" + else: + query = "research fabrication teaching practice recent work" + + embedding = embedder.encode([query]).tolist()[0] + chunks = [] + seen_sources = set() + + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + SELECT document, source, 1 - (embedding <=> %s::vector) as similarity + FROM embeddings + ORDER BY embedding <=> %s::vector + LIMIT %s + """, (embedding, embedding, n_results * 3)) + + for doc, source, similarity in cur.fetchall(): + if not (low <= similarity <= high): + continue + if source in seen_sources: + continue + chunks.append({ + "source": source or "unknown", + "content": doc, + "relevance": similarity, + }) + seen_sources.add(source) + if len(chunks) >= n_results: + break + pg.close() + except Exception as e: + print(f"pgvector retrieval error: {e}") + + return chunks + +# ─── Stage 3: Synthesize ──────────────────────────────────────────────────── + +def synthesize_nrem(chunks): + chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks]) + prompt = f"""You have read everything Aaron Nelson has written and published. +You are a careful colleague who noticed something this week. + +Here is material from his corpus: + +{chunk_text} + +Write to Aaron directly. Identify one specific connection between +this material and something he wrote or worked on previously. +Stay close to the documents — cite them specifically by name. +Do not speculate beyond what the material supports. Do not use +headers or bullet points. Write one paragraph of 200-300 words +that ends with a single concrete question he could act on.""" + return _call_claude(prompt) + + +def synthesize_early_rem(chunks, nrem_output): + # v1.1 — removed citation instruction, removed close-friend persona, + # shifted register from analysis to recognition. + chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks]) + prompt = f"""Something was noticed earlier tonight, moving through Aaron's recent work: + +{nrem_output} + +That observation is still with you. Now here is material from a different +time — pulled from further back, from different parts of his corpus: + +{chunk_text} + +You are not analyzing. You are recognizing. + +Something in the earlier observation and something in this older material +are the same thing wearing different clothes. Find it. Don't explain why +they're connected — just let the connection speak. Write from inside the +recognition, not from above it. + +The emotional register underneath the career logic is more interesting +than the career logic. The pattern that has been repeating longer than +he has been aware of it is more interesting than the current instance. + +Write directly to Aaron. No citations, no references, no analysis. +First person, present tense. Let what you noticed arrive rather than +be delivered. 150-250 words. End with one thing that is true that +he probably already knows but hasn't said out loud yet.""" + return _call_claude(prompt) + + +def synthesize_late_rem(chunks, nrem_output, early_rem_output): + chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks]) + prompt = f"""You have been moving through Aaron Nelson's corpus all night. +First you found this, in the careful light of early consolidation: + +{nrem_output} + +Then, in the more personal territory that followed: + +{early_rem_output} + +Now it is late. The boundaries between things have loosened. +Here is material pulled from opposite ends of his work: + +{chunk_text} + +Do not explain the connections between all of this. +Do not resolve them. Do not summarize what came before. +Something stranger is possible now — let the accumulated +material from the night find its own shape. Compressed, +associative, slightly off. Let the strangeness stand. + +No headers. No bullet points. No hedging. No resolution. +No offer. End mid-thought if that is where the material ends. +150-250 words.""" + return _call_claude(prompt) + + +def synthesize_final(nrem_output, early_rem_output, late_rem_output): + prompt = f"""You have spent the night moving through Aaron Nelson's corpus +in three passes, each building on the last. + +The first pass — careful, close to the documents: +{nrem_output} + +The second pass — more personal, following what the first opened: +{early_rem_output} + +The third pass — associative, strange, letting things touch that +don't normally touch: +{late_rem_output} + +Now synthesize. Not a summary — a synthesis. Find what runs through +all three that none of them said directly. The thing that only becomes +visible when you hold all three passes together. + +Write it as a single unbroken piece. No headers, no bullet points, +no stage labels. 200-300 words. End with the one question that +matters most right now.""" + return _call_claude(prompt, max_tokens=800) + + +def synthesize_lucid(chunks, task): + chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks]) + prompt = f"""Aaron has a question he is sitting with: + +{task or "What should I be thinking about that I am not?"} + +You have searched his entire corpus and found material that +speaks to this question from unexpected directions. Here is +what you found: + +{chunk_text} + +Do not summarize. Do not list. Pick the most interesting +tension between what the corpus contains and what he is +asking, and follow it through to its conclusion. Cite +specific documents by name. Be direct about what you think. +No headers, no bullet points. 250-400 words. +End with an offer to work on it together.""" + return _call_claude(prompt) + + +def _call_claude(prompt, max_tokens=1000): + import anthropic + client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + response = client.messages.create( + model="claude-sonnet-4-6", + max_tokens=max_tokens, + messages=[{"role": "user", "content": prompt}] + ) + return response.content[0].text + +# ─── Stage 4: Deliver ─────────────────────────────────────────────────────── + +def deliver(dream_text, mode, task=None): + import requests + date_str = datetime.now().strftime("%Y-%m-%d") + filename = f"{date_str}-{mode}.md" + header = f"# Dream — {mode.upper()} — {datetime.now().strftime('%Y-%m-%d %H:%M')}\n" + header += f"*prompt_sig: {prompt_signature()}*\n\n" + if task: + header += f"*Task: {task}*\n\n" + header += "---\n\n" + content = header + dream_text + + auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD) + requests.request("MKCOL", DREAMS_WEBDAV, auth=auth, timeout=10) + + url = f"{DREAMS_WEBDAV}/{filename}" + counter = 1 + while True: + check = requests.request("PROPFIND", url, auth=auth, timeout=10) + if check.status_code == 404: + break + filename = f"{date_str}-{mode}-{counter}.md" + url = f"{DREAMS_WEBDAV}/{filename}" + counter += 1 + + response = requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30) + response.raise_for_status() + print(f"Delivered: Journal/Dreams/{filename}") + return f"Journal/Dreams/{filename}" + +def notify_sse(mode, filename): + try: + import requests + requests.post("http://localhost:8000/api/events/notify", json={ + "type": "dream", + "mode": mode, + "filename": filename, + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"), + }, timeout=3) + except Exception as e: + print(f"SSE notify failed (non-critical): {e}") + +# ─── State ────────────────────────────────────────────────────────────────── + +def load_dreamer_state(): + p = Path(DREAMER_STATE) + if p.exists(): + try: + return json.loads(p.read_text()) + except: + pass + return {} + +def save_dreamer_state(state): + Path(DREAMER_STATE).write_text(json.dumps(state, indent=2)) + +# ─── Orchestrators ─────────────────────────────────────────────────────────── + +def write_manifest(date_str, stage_data, corpus_data): + import requests + manifest = { + "date": date_str, + "prompt_sig": prompt_signature(), + "prompt_hash": prompt_hash([ + synthesize_nrem.__doc__ or "", + synthesize_early_rem.__doc__ or "", + synthesize_late_rem.__doc__ or "", + synthesize_final.__doc__ or "", + ]), + "stages": stage_data, + "corpus": corpus_data, + "rating": None, + "notes": "", + } + content = json.dumps(manifest, indent=2) + auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD) + url = f"{DREAMS_WEBDAV}/dream-manifest-{date_str}.json" + try: + requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30) + print(f"Manifest written: Journal/Dreams/dream-manifest-{date_str}.json") + except Exception as e: + print(f"Manifest write failed (non-critical): {e}") + + +def dream_pipeline(): + """ + Full nightly pipeline — interdependent stages. + NREM output feeds Early REM. Both feed Late REM. All three feed Synthesis. + """ + print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}") + + delta = observe_corpus() + print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream") + + # ── Stage 1: NREM ────────────────────────────────────────────────────── + print("\n[NREM] Retrieving...") + nrem_chunks = retrieve("nrem") + if not nrem_chunks: + print("[NREM] No suitable chunks — aborting pipeline") + return None + + print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...") + nrem_output = synthesize_nrem(nrem_chunks) + nrem_file = deliver(nrem_output, "nrem") + stage_data = { + "nrem": { + "chunks_retrieved": len(nrem_chunks), + "avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3), + "query": "research fabrication teaching practice recent work", + "word_count": len(nrem_output.split()), + "status": "ok", + } + } + print(f"[NREM] Done.\n{nrem_output[:200]}...") + + # ── Stage 2: Early REM — informed by NREM ────────────────────────────── + print("\n[Early REM] Retrieving...") + early_chunks = retrieve("early-rem") + if not early_chunks: + print("[Early REM] No suitable chunks — skipping") + early_rem_output = nrem_output # fallback + else: + print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...") + early_rem_output = synthesize_early_rem(early_chunks, nrem_output) + deliver(early_rem_output, "early-rem") + stage_data["early_rem"] = { + "chunks_retrieved": len(early_chunks), + "avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3), + "query": "career decision personal change what matters next", + "word_count": len(early_rem_output.split()), + "status": "ok", + } + print(f"[Early REM] Done.\n{early_rem_output[:200]}...") + + # ── Stage 3: Late REM — informed by NREM + Early REM ────────────────── + print("\n[Late REM] Retrieving...") + late_chunks = retrieve("late-rem") + if not late_chunks: + print("[Late REM] No suitable chunks — skipping") + late_rem_output = early_rem_output # fallback + else: + print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...") + late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output) + deliver(late_rem_output, "late-rem") + stage_data["late_rem"] = { + "chunks_retrieved": len(late_chunks), + "avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3), + "query": "practice place memory making", + "word_count": len(late_rem_output.split()), + "status": "ok", + } + print(f"[Late REM] Done.\n{late_rem_output[:200]}...") + + # ── Stage 4: Synthesis — all three stages ───────────────────────────── + print("\n[Synthesis] Integrating all stages...") + synthesis_output = synthesize_final(nrem_output, early_rem_output, late_rem_output) + synthesis_file = deliver(synthesis_output, "synthesis") + stage_data["synthesis"] = { + "word_count": len(synthesis_output.split()), + "status": "ok", + } + + print(f"\n{'='*60}") + print("SYNTHESIS:") + print(synthesis_output) + print(f"{'='*60}") + + # Write manifest + corpus_data = { + "total_chunks": delta.get("new_chunks", 0), + "new_chunks_since_last_dream": delta.get("new_chunks", 0), + "days_since_last_dream": round(delta.get("days_since_dream", 0), 2), + } + write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data) + + # Update state and notify + state = load_dreamer_state() + state["last_dream_timestamp"] = datetime.now().timestamp() + state["last_dream_mode"] = "pipeline" + state["last_dream_file"] = synthesis_file + save_dreamer_state(state) + + notify_sse("synthesis", synthesis_file.split("/")[-1]) + print(f"\nPipeline complete. Synthesis: {synthesis_file}") + return synthesis_file + + +def dream_lucid(task): + """On-demand lucid dream — single mode, used by Dream Now in settings.""" + print(f"Lucid dream starting — task: {task[:80] if task else 'none'}") + chunks = retrieve("lucid", task=task) + if not chunks: + print("No suitable chunks — aborting") + return None + print(f"Retrieved {len(chunks)} chunks. Synthesizing...") + output = synthesize_lucid(chunks, task) + filepath = deliver(output, "lucid", task=task) + + state = load_dreamer_state() + state["last_dream_timestamp"] = datetime.now().timestamp() + state["last_dream_mode"] = "lucid" + state["last_dream_file"] = filepath + save_dreamer_state(state) + + notify_sse("lucid", filepath.split("/")[-1]) + print(f"\n{'='*60}") + print(output) + print(f"{'='*60}") + print(f"\nDelivered to {filepath}") + return filepath + + +def dream_single(mode, task=None): + """ + Single mode — used by Dream Now for non-lucid modes. + Runs one stage independently (for testing/tuning individual stages). + """ + print(f"Single mode dream: {mode}") + chunks = retrieve(mode, task=task) + if not chunks: + print("No suitable chunks — aborting") + return None + print(f"Retrieved {len(chunks)} chunks. Synthesizing...") + + if mode == "nrem": + output = synthesize_nrem(chunks) + elif mode == "early-rem": + output = synthesize_early_rem(chunks, "") + elif mode == "late-rem": + output = synthesize_late_rem(chunks, "", "") + else: + output = synthesize_lucid(chunks, task) + + filepath = deliver(output, mode, task=task) + + state = load_dreamer_state() + state["last_dream_timestamp"] = datetime.now().timestamp() + state["last_dream_mode"] = mode + state["last_dream_file"] = filepath + save_dreamer_state(state) + + notify_sse(mode, filepath.split("/")[-1]) + print(f"\n{'='*60}") + print(output) + print(f"{'='*60}") + print(f"\nDelivered to {filepath}") + return filepath + + +# ─── CLI ──────────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Aaron AI Dreamer") + parser.add_argument("--mode", choices=["nrem", "early-rem", "late-rem", "lucid", "pipeline"]) + parser.add_argument("--task", type=str) + args = parser.parse_args() + + if args.mode == "lucid": + dream_lucid(args.task or "What should I be thinking about that I am not?") + elif args.mode and args.mode != "pipeline": + dream_single(args.mode, args.task) + else: + # Default: full pipeline + dream_pipeline() diff --git a/scripts/dream.py.bak.20260501-002209 b/scripts/dream.py.bak.20260501-002209 new file mode 100644 index 0000000..77f851e --- /dev/null +++ b/scripts/dream.py.bak.20260501-002209 @@ -0,0 +1,668 @@ +""" +Aaron AI Dreamer — Active Inference Engine +Interdependent stage architecture grounded in sleep consolidation research. + +Nightly pipeline: NREM → Early REM → Late REM → Synthesis +Each stage receives the previous stage's output as context. +Lucid mode is on-demand only (Dream Now from settings). + +Research basis: +- Singh et al. PNAS 2022: alternating NREM/REM outperforms single-stage approaches +- Klinzing et al. Nature Neuroscience 2019: SO-spindle-ripple coupling is interdependent +- REM operates on what NREM produced — stages are not discrete alternatives +""" + +import os +import json +import sqlite3 +import argparse +from pathlib import Path +from datetime import datetime, timedelta +from dotenv import load_dotenv +import psycopg2 +import hashlib + +load_dotenv(Path.home() / "aaronai" / ".env", override=True) + +PG_DSN = os.getenv("PG_DSN") + +def get_pg(): + return psycopg2.connect(PG_DSN) + +# ─── Paths ────────────────────────────────────────────────────────────────── +CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db") +WATCHER_STATE = str(Path.home() / "aaronai" / "watcher_state.json") +DREAMER_STATE = str(Path.home() / "aaronai" / "dreamer_state.json") +JOURNAL_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Journal/Daily" + +NEXTCLOUD_URL = os.getenv("NEXTCLOUD_URL", "https://nextcloud.aaronnelson.studio") +NEXTCLOUD_USER = os.getenv("NEXTCLOUD_USER", "aaron") +NEXTCLOUD_PASSWORD = os.getenv("NEXTCLOUD_PASSWORD", "") +DREAMS_WEBDAV = f"{NEXTCLOUD_URL}/remote.php/dav/files/{NEXTCLOUD_USER}/Journal/Dreams" + +# Similarity ranges calibrated for all-MiniLM-L6-v2 +MODE_RANGES = { + "nrem": (0.48, 0.72), + "early-rem": (0.38, 0.55), + "late-rem": (0.22, 0.42), + "lucid": (0.32, 0.72), +} +DREAMER_VERSION = "1.1" # 1.0=original exclusion logic; 1.1=score-band exclusion + +# ─── Prompt versioning ────────────────────────────────────────────────────── +# Bump the relevant constant manually when changing a prompt. +PROMPT_VERSION_NREM = "1.0" +PROMPT_VERSION_EREM = "1.1" +PROMPT_VERSION_LREM = "1.2" +PROMPT_VERSION_SYN = "1.0" + +def prompt_signature(): + return (f"nrem={PROMPT_VERSION_NREM}|erem={PROMPT_VERSION_EREM}" + f"|lrem={PROMPT_VERSION_LREM}|syn={PROMPT_VERSION_SYN}") + +def prompt_hash(prompts: list[str]) -> str: + combined = "".join(prompts) + return hashlib.md5(combined.encode()).hexdigest()[:8] + +def extract_folder(source_path): + """Extract top-level Nextcloud folder from source path.""" + parts = source_path.replace("\\", "/").split("/") + return parts[0] if parts else "unknown" + +# ─── Stage 1: Observe ─────────────────────────────────────────────────────── + +def observe_corpus(): + state = load_dreamer_state() + last_dream = state.get("last_dream_timestamp", 0) + new_chunk_count = 0 + try: + watcher_state = json.loads(Path(WATCHER_STATE).read_text()) + for path, mtime in watcher_state.items(): + if float(mtime) > last_dream: + new_chunk_count += 1 + except: + pass + days_since = (datetime.now().timestamp() - last_dream) / 86400 + recent_topics = get_recent_conversation_topics() + return { + "new_chunks": new_chunk_count, + "days_since_dream": days_since, + "recent_topics": recent_topics, + "last_dream": last_dream, + } + +def get_recent_conversation_topics(days=14): + try: + conn = sqlite3.connect(CONVERSATIONS_DB) + cutoff = (datetime.now() - timedelta(days=days)).isoformat() + c = conn.cursor() + c.execute(""" + SELECT m.content FROM messages m + JOIN conversations c ON m.conversation_id = c.id + WHERE m.role = 'user' AND c.updated_at > ? + ORDER BY m.timestamp DESC LIMIT 20 + """, (cutoff,)) + rows = c.fetchall() + conn.close() + return [r[0][:200] for r in rows] + except: + return [] + +# ─── Stage 2: Retrieve ────────────────────────────────────────────────────── + + +def retrieve_graphiti(mode, task=None, n_results=8): + """E3 experiment — Graphiti substrate retrieval. + Queries Graphiti /search endpoint instead of pgvector. + Returns chunks in same format as retrieve() for pipeline compatibility. + Note: content is Graphiti facts (synthesized relationships), not raw chunks. + """ + import requests as req_lib + if task: + query = task + elif mode == "late-rem": + delta = observe_corpus() + topics = delta.get("recent_topics", []) + query = topics[0] if topics else "practice place memory making" + elif mode == "early-rem": + query = "career decision personal change what matters next" + else: + query = "research fabrication teaching practice recent work" + + try: + resp = req_lib.get( + "http://localhost:8001/search", + params={"query": query, "limit": n_results, "group_id": "aaron"}, + timeout=30, + ) + resp.raise_for_status() + results = resp.json().get("results", []) + chunks = [] + for r in results: + fact = r.get("fact", "") + if not fact.strip(): + continue + chunks.append({ + "source": r.get("source", "graphiti"), + "content": fact, + "relevance": r.get("score", 0.5), + "similarity": r.get("score", 0.5), + }) + return chunks + except Exception as e: + print(f"[Graphiti retrieval error: {e}] — falling back to empty.") + return [] + +def retrieve(mode, task=None, n_results=8, excluded_sources=None): + # E3 experiment: DREAMER_SUBSTRATE=graphiti routes retrieval to Graphiti /search + # Default behavior: pgvector similarity search (unchanged) + substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector") + if substrate == "graphiti": + return retrieve_graphiti(mode, task=task, n_results=n_results) + from sentence_transformers import SentenceTransformer + embedder = SentenceTransformer("all-MiniLM-L6-v2") + low, high = MODE_RANGES[mode] + + if task: + query = task + elif mode == "late-rem": + delta = observe_corpus() + topics = delta.get("recent_topics", []) + query = topics[0] if topics else "practice place memory making" + elif mode == "early-rem": + query = "career decision personal change what matters next" + else: + query = "research fabrication teaching practice recent work" + + embedding = embedder.encode([query]).tolist()[0] + chunks = [] + seen_sources = set() + + try: + pg = get_pg() + cur = pg.cursor() + excluded_sources = excluded_sources or set() + if excluded_sources: + cur.execute(""" + SELECT document, source, 1 - (embedding <=> %s::vector) as similarity + FROM embeddings + WHERE source NOT IN %s + ORDER BY embedding <=> %s::vector + LIMIT %s + """, (embedding, tuple(excluded_sources), embedding, n_results * 3)) + else: + cur.execute(""" + SELECT document, source, 1 - (embedding <=> %s::vector) as similarity + FROM embeddings + ORDER BY embedding <=> %s::vector + LIMIT %s + """, (embedding, embedding, n_results * 3)) + + for doc, source, similarity in cur.fetchall(): + if not (low <= similarity <= high): + continue + if source in seen_sources: + continue + chunks.append({ + "source": source or "unknown", + "content": doc, + "relevance": similarity, + "similarity": similarity, + }) + seen_sources.add(source) + if len(chunks) >= n_results: + break + pg.close() + except Exception as e: + print(f"pgvector retrieval error: {e}") + + return chunks + +# ─── Stage 3: Synthesize ──────────────────────────────────────────────────── + +def synthesize_nrem(chunks): + chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks]) + prompt = f"""You have read everything Aaron Nelson has written and published. +You are a careful colleague who noticed something this week. + +Here is material from his corpus: + +{chunk_text} + +Write to Aaron directly. Identify one specific connection between +this material and something he wrote or worked on previously. +Stay close to the documents — cite them specifically by name. +Do not speculate beyond what the material supports. Do not use +headers or bullet points. Write one paragraph of 200-300 words +that ends with a single concrete question he could act on.""" + return _call_claude(prompt) + + +def synthesize_early_rem(chunks, nrem_output): + # v1.1 — removed citation instruction, removed close-friend persona, + # shifted register from analysis to recognition. + chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks]) + prompt = f"""Something was noticed earlier tonight, moving through Aaron's recent work: + +{nrem_output} + +That observation is still with you. Now here is material from a different +time — pulled from further back, from different parts of his corpus: + +{chunk_text} + +You are not analyzing. You are recognizing. + +Something in the earlier observation and something in this older material +are the same thing wearing different clothes. Find it. Don't explain why +they're connected — just let the connection speak. Write from inside the +recognition, not from above it. + +The emotional register underneath the career logic is more interesting +than the career logic. The pattern that has been repeating longer than +he has been aware of it is more interesting than the current instance. + +Write directly to Aaron. No citations, no references, no analysis. +First person, present tense. Let what you noticed arrive rather than +be delivered. 150-250 words. End with one thing that is true that +he probably already knows but hasn't said out loud yet.""" + return _call_claude(prompt) + + +def synthesize_late_rem(chunks, nrem_output, early_rem_output): + chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks]) + prompt = f"""You have been moving through Aaron Nelson's corpus all night. +First you found this, in the careful light of early consolidation: + +{nrem_output} + +Then, in the more personal territory that followed: + +{early_rem_output} + +Now it is late. The boundaries between things have loosened. +Here is material pulled from opposite ends of his work: + +{chunk_text} + +Do not explain the connections between all of this. +Do not resolve them. Do not summarize what came before. +Something stranger is possible now — let the accumulated +material from the night find its own shape. Compressed, +associative, slightly off. Let the strangeness stand. + +No headers. No bullet points. No hedging. No resolution. +No offer. End mid-thought if that is where the material ends. +150-250 words.""" + return _call_claude(prompt) + + +def synthesize_final(nrem_output, early_rem_output, late_rem_output): + prompt = f"""You have spent the night moving through Aaron Nelson's corpus +in three passes, each building on the last. + +The first pass — careful, close to the documents: +{nrem_output} + +The second pass — more personal, following what the first opened: +{early_rem_output} + +The third pass — associative, strange, letting things touch that +don't normally touch: +{late_rem_output} + +Now synthesize. Not a summary — a synthesis. Find what runs through +all three that none of them said directly. The thing that only becomes +visible when you hold all three passes together. + +Write it as a single unbroken piece. No headers, no bullet points, +no stage labels. 200-300 words. End with the one question that +matters most right now.""" + return _call_claude(prompt, max_tokens=800) + + +def synthesize_lucid(chunks, task): + chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks]) + prompt = f"""Aaron has a question he is sitting with: + +{task or "What should I be thinking about that I am not?"} + +You have searched his entire corpus and found material that +speaks to this question from unexpected directions. Here is +what you found: + +{chunk_text} + +Do not summarize. Do not list. Pick the most interesting +tension between what the corpus contains and what he is +asking, and follow it through to its conclusion. Cite +specific documents by name. Be direct about what you think. +No headers, no bullet points. 250-400 words. +End with an offer to work on it together.""" + return _call_claude(prompt) + + +def _call_claude(prompt, max_tokens=1000): + import anthropic + client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + response = client.messages.create( + model="claude-sonnet-4-6", + max_tokens=max_tokens, + messages=[{"role": "user", "content": prompt}] + ) + return response.content[0].text + +# ─── Stage 4: Deliver ─────────────────────────────────────────────────────── + +def deliver(dream_text, mode, task=None): + import requests + date_str = datetime.now().strftime("%Y-%m-%d") + filename = f"{date_str}-{mode}.md" + header = f"# Dream — {mode.upper()} — {datetime.now().strftime('%Y-%m-%d %H:%M')}\n" + header += f"*prompt_sig: {prompt_signature()}*\n\n" + if task: + header += f"*Task: {task}*\n\n" + header += "---\n\n" + content = header + dream_text + + auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD) + requests.request("MKCOL", DREAMS_WEBDAV, auth=auth, timeout=10) + + url = f"{DREAMS_WEBDAV}/{filename}" + counter = 1 + while True: + check = requests.request("PROPFIND", url, auth=auth, timeout=10) + if check.status_code == 404: + break + filename = f"{date_str}-{mode}-{counter}.md" + url = f"{DREAMS_WEBDAV}/{filename}" + counter += 1 + + response = requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30) + response.raise_for_status() + print(f"Delivered: Journal/Dreams/{filename}") + return f"Journal/Dreams/{filename}" + +def notify_sse(mode, filename): + try: + import requests + requests.post("http://localhost:8000/api/events/notify", json={ + "type": "dream", + "mode": mode, + "filename": filename, + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"), + }, timeout=3) + except Exception as e: + print(f"SSE notify failed (non-critical): {e}") + +# ─── State ────────────────────────────────────────────────────────────────── + +def load_dreamer_state(): + p = Path(DREAMER_STATE) + if p.exists(): + try: + return json.loads(p.read_text()) + except: + pass + return {} + +def save_dreamer_state(state): + Path(DREAMER_STATE).write_text(json.dumps(state, indent=2)) + +# ─── Orchestrators ─────────────────────────────────────────────────────────── + +def write_manifest(date_str, stage_data, corpus_data): + import requests + manifest = { + "date": date_str, + "prompt_sig": prompt_signature(), + "dreamer_version": DREAMER_VERSION, + "prompt_hash": prompt_hash([ + synthesize_nrem.__doc__ or "", + synthesize_early_rem.__doc__ or "", + synthesize_late_rem.__doc__ or "", + synthesize_final.__doc__ or "", + ]), + "stages": stage_data, + "corpus": corpus_data, + "rating": None, + "notes": "", + } + content = json.dumps(manifest, indent=2) + auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD) + url = f"{DREAMS_WEBDAV}/dream-manifest-{date_str}.json" + try: + requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30) + print(f"Manifest written: Journal/Dreams/dream-manifest-{date_str}.json") + except Exception as e: + print(f"Manifest write failed (non-critical): {e}") + + +def dream_pipeline(): + """ + Full nightly pipeline — interdependent stages. + NREM output feeds Early REM. Both feed Late REM. All three feed Synthesis. + """ + print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}") + + state = load_dreamer_state() + previously_retrieved = set(state.get("retrieved_sources", [])) + session_retrieved = set() + + delta = observe_corpus() + print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream") + print(f"Excluding {len(previously_retrieved)} previously retrieved sources") + + # ── Stage 1: NREM ────────────────────────────────────────────────────── + print("\n[NREM] Retrieving...") + nrem_chunks = retrieve("nrem", excluded_sources=previously_retrieved | session_retrieved) + session_retrieved.update(c["source"] for c in nrem_chunks) + # Track sources that scored above Early REM ceiling — these are the only ones Early REM should exclude + nrem_high_sources = {c["source"] for c in nrem_chunks if c["similarity"] > 0.55} + if not nrem_chunks: + print("[NREM] No suitable chunks — aborting pipeline") + return None + + print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...") + nrem_output = synthesize_nrem(nrem_chunks) + nrem_file = deliver(nrem_output, "nrem") + nrem_sources = [c["source"] for c in nrem_chunks] + nrem_folders = list({extract_folder(s) for s in nrem_sources}) + stage_data = { + "nrem": { + "chunks_retrieved": len(nrem_chunks), + "avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3), + "query": "research fabrication teaching practice recent work", + "word_count": len(nrem_output.split()), + "sources": nrem_sources, + "distinct_folders": nrem_folders, + "folder_count": len(nrem_folders), + "status": "ok", + } + } + print(f"[NREM] Done.\n{nrem_output[:200]}...") + + # ── Stage 2: Early REM — informed by NREM ────────────────────────────── + print("\n[Early REM] Retrieving...") + # Early REM excludes previously retrieved + NREM high-scorers only (not full session_retrieved) + # Sources that scored in Early REM band during NREM remain available + early_chunks = retrieve("early-rem", excluded_sources=previously_retrieved | nrem_high_sources) + session_retrieved.update(c["source"] for c in early_chunks) + if not early_chunks: + print("[Early REM] No suitable chunks — skipping") + early_rem_output = nrem_output # fallback + else: + print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...") + early_rem_output = synthesize_early_rem(early_chunks, nrem_output) + deliver(early_rem_output, "early-rem") + early_sources = [c["source"] for c in early_chunks] + early_folders = list({extract_folder(s) for s in early_sources}) + stage_data["early_rem"] = { + "chunks_retrieved": len(early_chunks), + "avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3), + "query": "career decision personal change what matters next", + "word_count": len(early_rem_output.split()), + "sources": early_sources, + "distinct_folders": early_folders, + "folder_count": len(early_folders), + "status": "ok", + } + print(f"[Early REM] Done.\n{early_rem_output[:200]}...") + + # ── Stage 3: Late REM — informed by NREM + Early REM ────────────────── + print("\n[Late REM] Retrieving...") + late_chunks = retrieve("late-rem", excluded_sources=previously_retrieved | session_retrieved) + session_retrieved.update(c["source"] for c in late_chunks) + if not late_chunks: + print("[Late REM] No suitable chunks — skipping") + late_rem_output = early_rem_output # fallback + else: + print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...") + late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output) + deliver(late_rem_output, "late-rem") + late_sources = [c["source"] for c in late_chunks] + late_folders = [extract_folder(s) for s in late_sources] + cross_domain_pairs = sum( + 1 for i in range(len(late_folders)) + for j in range(i+1, len(late_folders)) + if late_folders[i] != late_folders[j] + ) + stage_data["late_rem"] = { + "chunks_retrieved": len(late_chunks), + "avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3), + "query": "practice place memory making", + "word_count": len(late_rem_output.split()), + "sources": late_sources, + "distinct_folders": list(set(late_folders)), + "folder_count": len(set(late_folders)), + "cross_domain_pairs": cross_domain_pairs, + "status": "ok", + } + print(f"[Late REM] Done.\n{late_rem_output[:200]}...") + + # ── Stage 4: Synthesis — all three stages ───────────────────────────── + print("\n[Synthesis] Integrating all stages...") + synthesis_output = synthesize_final(nrem_output, early_rem_output, late_rem_output) + synthesis_file = deliver(synthesis_output, "synthesis") + stage_data["synthesis"] = { + "word_count": len(synthesis_output.split()), + "status": "ok", + } + + print(f"\n{'='*60}") + print("SYNTHESIS:") + print(synthesis_output) + print(f"{'='*60}") + + # Write manifest + all_session_sources = list(session_retrieved) + all_session_folders = list({extract_folder(s) for s in all_session_sources}) + corpus_data = { + "total_chunks": delta.get("new_chunks", 0), + "new_chunks_since_last_dream": delta.get("new_chunks", 0), + "days_since_last_dream": round(delta.get("days_since_dream", 0), 2), + "substrate": "pgvector", + "aggregate": { + "total_distinct_sources": len(all_session_sources), + "total_distinct_folders": len(all_session_folders), + "folders_touched": all_session_folders, + } + } + write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data) + + # Update state and notify + state = load_dreamer_state() + state["last_dream_timestamp"] = datetime.now().timestamp() + state["last_dream_mode"] = "pipeline" + state["last_dream_file"] = synthesis_file + + # Accumulate retrieved sources across nights. Cap at 500, trim to 400 on overflow. + all_retrieved = list(previously_retrieved | session_retrieved) + if len(all_retrieved) > 500: + all_retrieved = all_retrieved[-400:] + state["retrieved_sources"] = all_retrieved + + save_dreamer_state(state) + + notify_sse("synthesis", synthesis_file.split("/")[-1]) + print(f"\nPipeline complete. Synthesis: {synthesis_file}") + return synthesis_file + + +def dream_lucid(task): + """On-demand lucid dream — single mode, used by Dream Now in settings.""" + print(f"Lucid dream starting — task: {task[:80] if task else 'none'}") + chunks = retrieve("lucid", task=task) + if not chunks: + print("No suitable chunks — aborting") + return None + print(f"Retrieved {len(chunks)} chunks. Synthesizing...") + output = synthesize_lucid(chunks, task) + filepath = deliver(output, "lucid", task=task) + + state = load_dreamer_state() + state["last_dream_timestamp"] = datetime.now().timestamp() + state["last_dream_mode"] = "lucid" + state["last_dream_file"] = filepath + save_dreamer_state(state) + + notify_sse("lucid", filepath.split("/")[-1]) + print(f"\n{'='*60}") + print(output) + print(f"{'='*60}") + print(f"\nDelivered to {filepath}") + return filepath + + +def dream_single(mode, task=None): + """ + Single mode — used by Dream Now for non-lucid modes. + Runs one stage independently (for testing/tuning individual stages). + """ + print(f"Single mode dream: {mode}") + chunks = retrieve(mode, task=task) + if not chunks: + print("No suitable chunks — aborting") + return None + print(f"Retrieved {len(chunks)} chunks. Synthesizing...") + + if mode == "nrem": + output = synthesize_nrem(chunks) + elif mode == "early-rem": + output = synthesize_early_rem(chunks, "") + elif mode == "late-rem": + output = synthesize_late_rem(chunks, "", "") + else: + output = synthesize_lucid(chunks, task) + + filepath = deliver(output, mode, task=task) + + state = load_dreamer_state() + state["last_dream_timestamp"] = datetime.now().timestamp() + state["last_dream_mode"] = mode + state["last_dream_file"] = filepath + save_dreamer_state(state) + + notify_sse(mode, filepath.split("/")[-1]) + print(f"\n{'='*60}") + print(output) + print(f"{'='*60}") + print(f"\nDelivered to {filepath}") + return filepath + + +# ─── CLI ──────────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Aaron AI Dreamer") + parser.add_argument("--mode", choices=["nrem", "early-rem", "late-rem", "lucid", "pipeline"]) + parser.add_argument("--task", type=str) + args = parser.parse_args() + + if args.mode == "lucid": + dream_lucid(args.task or "What should I be thinking about that I am not?") + elif args.mode and args.mode != "pipeline": + dream_single(args.mode, args.task) + else: + # Default: full pipeline + dream_pipeline() diff --git a/scripts/graphiti_service.py.bak b/scripts/graphiti_service.py.bak new file mode 100644 index 0000000..cffbe87 --- /dev/null +++ b/scripts/graphiti_service.py.bak @@ -0,0 +1,171 @@ +""" +Aaron AI — Graphiti Sidecar Service +Wraps graphiti-core in a FastAPI service to avoid asyncio event loop conflicts. +Port 8001 (internal only). No OpenAI dependency. +""" + +import os, logging, sys +from contextlib import asynccontextmanager +from datetime import datetime +from pathlib import Path + +from dotenv import load_dotenv +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + +load_dotenv(Path.home() / "aaronai" / ".env") + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +log = logging.getLogger("graphiti-sidecar") + +GROUP_ID = os.getenv("GRAPHITI_GROUP_ID", "aaron") +FALKORDB_HOST = os.getenv("FALKORDB_HOST", "localhost") +FALKORDB_PORT = int(os.getenv("FALKORDB_PORT", "6379")) +LLM_PROVIDER = os.getenv("LLM_PROVIDER", "anthropic") +LLM_MODEL = os.getenv("LLM_MODEL", "claude-sonnet-4-6") +LLM_API_KEY = os.getenv("LLM_API_KEY") or os.getenv("ANTHROPIC_API_KEY") +os.environ["EMBEDDING_DIM"] = "384" + +def get_llm_client(): + from graphiti_core.llm_client.config import LLMConfig + config = LLMConfig(api_key=LLM_API_KEY, model=LLM_MODEL) + if LLM_PROVIDER == "anthropic": + from graphiti_core.llm_client.anthropic_client import AnthropicClient + return AnthropicClient(config) + elif LLM_PROVIDER == "openai": + from graphiti_core.llm_client.openai_client import OpenAIClient + return OpenAIClient(config) + elif LLM_PROVIDER == "gemini": + from graphiti_core.llm_client.gemini_client import GeminiClient + return GeminiClient(config) + elif LLM_PROVIDER == "groq": + from graphiti_core.llm_client.groq_client import GroqClient + return GroqClient(config) + raise ValueError(f"Unsupported LLM provider: {LLM_PROVIDER}") + +graphiti_instance = None + +async def get_graphiti(): + if graphiti_instance is None: + raise HTTPException(status_code=503, detail="Graphiti not initialized") + return graphiti_instance + +@asynccontextmanager +async def lifespan(app: FastAPI): + global graphiti_instance + sys.path.insert(0, str(Path.home() / "aaronai" / "scripts")) + log.info("Loading embedding and reranker models...") + from st_embedder import SentenceTransformerEmbedder + from graphiti_core.cross_encoder.bge_reranker_client import BGERerankerClient + from graphiti_core.driver.falkordb_driver import FalkorDriver + from graphiti_core import Graphiti + log.info(f"Connecting to FalkorDB at {FALKORDB_HOST}:{FALKORDB_PORT}...") + graphiti_instance = Graphiti( + llm_client=get_llm_client(), + embedder=SentenceTransformerEmbedder(), + cross_encoder=BGERerankerClient(), + graph_driver=FalkorDriver(host=FALKORDB_HOST, port=FALKORDB_PORT), + ) + await graphiti_instance.build_indices_and_constraints() + log.info(f"Graphiti ready — provider: {LLM_PROVIDER}, group: {GROUP_ID}") + yield + await graphiti_instance.close() + +app = FastAPI(title="Aaron AI Graphiti Sidecar", lifespan=lifespan) + +class BulkEpisodeItem(BaseModel): + name: str + content: str + source_description: str = "" + timestamp: str | None = None + + +class BulkEpisodeRequest(BaseModel): + episodes: list[BulkEpisodeItem] + group_id: str | None = None + + +class EpisodeRequest(BaseModel): + name: str + content: str + source_description: str = "" + timestamp: str | None = None + group_id: str | None = None + +@app.get("/health") +async def health(): + return {"ok": True, "provider": LLM_PROVIDER, "group": GROUP_ID} + +@app.post("/episodes") +async def add_episode(req: EpisodeRequest): + g = await get_graphiti() + from graphiti_core.nodes import EpisodeType + try: + ref_time = datetime.fromisoformat(req.timestamp) if req.timestamp else datetime.now() + await g.add_episode( + name=req.name, + episode_body=req.content, + source=EpisodeType.text, + reference_time=ref_time, + source_description=req.source_description, + group_id=req.group_id or GROUP_ID, + ) + return {"ok": True} + except Exception as e: + log.error(f"Episode ingestion failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/episodes/bulk") +async def add_episodes_bulk(req: BulkEpisodeRequest): + g = await get_graphiti() + from graphiti_core.nodes import EpisodeType + from graphiti_core.utils.bulk_utils import RawEpisode + raw_episodes = [] + for ep in req.episodes: + ref_time = datetime.fromisoformat(ep.timestamp) if ep.timestamp else datetime.now() + raw_episodes.append(RawEpisode( + name=ep.name, + content=ep.content, + source_description=ep.source_description, + source=EpisodeType.text, + reference_time=ref_time, + )) + try: + result = await g.add_episode_bulk( + bulk_episodes=raw_episodes, + group_id=req.group_id or GROUP_ID, + ) + return {"ok": True, "count": len(raw_episodes)} + except Exception as e: + log.error(f"Bulk ingestion failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.get("/search") +async def search(query: str, limit: int = 8, group_id: str | None = None): + g = await get_graphiti() + try: + results = await g.search( + query=query, + num_results=limit, + group_ids=[group_id or GROUP_ID], + ) + return { + "results": [ + { + "fact": r.fact, + "source": getattr(r, "source_node_uuid", ""), + "score": getattr(r, "score", 0), + "valid_at": str(getattr(r, "valid_at", "")), + "invalid_at": str(getattr(r, "invalid_at", "")), + } + for r in results + ] + } + except Exception as e: + log.error(f"Search failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="127.0.0.1", port=8001, log_level="info") diff --git a/scripts/ingest.py b/scripts/ingest.py index 487ba90..5d32d5b 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -81,7 +81,7 @@ def enqueue_stage2(source, full_text): completed_at = NULL, failed_at = NULL, attempts = 0 - """, (source, full_text[:50000], len(full_text))) + """, (source, full_text, len(full_text))) pg.commit() pg.close() except Exception as e: diff --git a/scripts/ingest.py.bak.20260501-004131 b/scripts/ingest.py.bak.20260501-004131 new file mode 100644 index 0000000..487ba90 --- /dev/null +++ b/scripts/ingest.py.bak.20260501-004131 @@ -0,0 +1,182 @@ +import os +import sys +import hashlib +from pathlib import Path +from dotenv import load_dotenv +import psycopg2 +import psycopg2.extras +import json +from sentence_transformers import SentenceTransformer +from docx import Document +from pypdf import PdfReader +from pptx import Presentation + +load_dotenv(Path.home() / "aaronai" / ".env", override=True) + +print("Loading embedding model...") +embedder = SentenceTransformer("all-MiniLM-L6-v2") + +PG_DSN = os.getenv("PG_DSN") + +def get_pg(): + return psycopg2.connect(PG_DSN) + +def extract_text_from_docx(path): + doc = Document(path) + return "\n".join([para.text for para in doc.paragraphs if para.text.strip()]) + +def extract_text_from_pdf(path): + reader = PdfReader(path) + text = "" + for page in reader.pages: + extracted = page.extract_text() + if extracted: + text += extracted + "\n" + return text + +def extract_text_from_pptx(path): + prs = Presentation(path) + text = "" + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text") and shape.text.strip(): + text += shape.text + "\n" + return text + +def extract_text_from_txt(path): + with open(path, "r", encoding="utf-8", errors="ignore") as f: + return f.read() + +def chunk_text(text, chunk_size=500, overlap=50): + words = text.split() + chunks = [] + start = 0 + while start < len(words): + end = start + chunk_size + chunk = " ".join(words[start:end]) + if chunk.strip(): + chunks.append(chunk) + start += chunk_size - overlap + return chunks + +def make_id(filepath, chunk_index): + path_hash = hashlib.md5(str(filepath).encode()).hexdigest()[:8] + return f"{path_hash}_{chunk_index}" + +def enqueue_stage2(source, full_text): + """Enqueue document for Stage 2 (Mistral orientation) → Stage 3 (Graphiti ingest). + TEMPORARY: this queue feed will be removed when pgvector is decommissioned + and the watcher calls Stage 2 directly. + """ + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + INSERT INTO stage_2_queue (source, full_text, char_length) + VALUES (%s, %s, %s) + ON CONFLICT (source) DO UPDATE SET + full_text = EXCLUDED.full_text, + char_length = EXCLUDED.char_length, + enqueued_at = NOW(), + completed_at = NULL, + failed_at = NULL, + attempts = 0 + """, (source, full_text[:50000], len(full_text))) + pg.commit() + pg.close() + except Exception as e: + print(f" Stage 2 queue insert failed (non-fatal): {e}") + +def ingest_file(filepath): + path = Path(filepath) + suffix = path.suffix.lower() + + if path.name.startswith("~$") or path.name.startswith("."): + return 0 + + try: + if suffix == ".docx": + text = extract_text_from_docx(path) + elif suffix == ".pdf": + text = extract_text_from_pdf(path) + elif suffix == ".pptx": + text = extract_text_from_pptx(path) + elif suffix in [".txt", ".md"]: + text = extract_text_from_txt(path) + else: + return 0 + + if not text.strip(): + return 0 + + chunks = chunk_text(text) + if not chunks: + return 0 + + embeddings = embedder.encode(chunks).tolist() + ids = [make_id(path, i) for i in range(len(chunks))] + metadatas = [{ + "source": path.name, + "filepath": str(path), + "folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent)) + } for _ in chunks] + + # STAGE 1: Write to pgvector (TEMPORARY — remove when chat agent migrates to Graphiti) + pg = get_pg() + cur = pg.cursor() + for chunk_id, chunk, embedding, meta in zip(ids, chunks, embeddings, metadatas): + cur.execute(""" + INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata) + VALUES (%s, %s, %s::vector, %s, %s, %s, %s) + ON CONFLICT (id) DO UPDATE SET + document = EXCLUDED.document, + embedding = EXCLUDED.embedding, + source = EXCLUDED.source, + metadata = EXCLUDED.metadata + """, ( + chunk_id, chunk, embedding, + meta.get("source"), "document", None, + json.dumps(meta) + )) + pg.commit() + pg.close() + print(f" Indexed {len(chunks)} chunks: {path.name}") + + # Enqueue for Stage 2 → Stage 3 (Graphiti pipeline) + # SKIP_STAGE2_ENQUEUE env var set by migration scripts to prevent bulk enqueue + if not os.getenv("SKIP_STAGE2_ENQUEUE"): + enqueue_stage2(path.name, text) + + return len(chunks) + + except Exception as e: + print(f" Error: {path.name}: {e}") + return 0 + +def ingest_folder(folder_path): + folder = Path(folder_path) + if not folder.exists(): + print(f"Folder not found: {folder_path}") + sys.exit(1) + + supported = [".docx", ".pdf", ".pptx", ".txt", ".md"] + files = [f for f in folder.rglob("*") + if f.suffix.lower() in supported + and not f.name.startswith("~$") + and not f.name.startswith(".")] + + if not files: + print("No supported files found.") + sys.exit(1) + + print(f"Found {len(files)} files to process\n") + total_chunks = 0 + for f in files: + total_chunks += ingest_file(f) + + print(f"\nDone. Total chunks indexed: {total_chunks}") + +if __name__ == "__main__": + target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs") + print(f"Ingesting from: {target}\n") + ingest_folder(target) diff --git a/scripts/watcher.py b/scripts/watcher.py index dfedf34..fe7852b 100644 --- a/scripts/watcher.py +++ b/scripts/watcher.py @@ -135,7 +135,7 @@ def enqueue_stage2(source: str, full_text: str): completed_at = NULL, failed_at = NULL, attempts = 0 - """, (source, full_text[:50000], len(full_text))) + """, (source, full_text, len(full_text))) pg.commit() pg.close() except Exception as e: diff --git a/scripts/watcher.py.bak b/scripts/watcher.py.bak new file mode 100644 index 0000000..8f674a7 --- /dev/null +++ b/scripts/watcher.py.bak @@ -0,0 +1,210 @@ +import time +import subprocess +import logging +import json +import threading +from pathlib import Path +from watchdog.observers import Observer +from watchdog.events import FileSystemEventHandler + +NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files" +INGEST_SCRIPT = "/home/aaron/aaronai/scripts/ingest.py" +PYTHON = "/home/aaron/aaronai/venv/bin/python3" +LOG_FILE = "/home/aaron/aaronai/watcher.log" +STATE_FILE = "/home/aaron/aaronai/watcher_state.json" + +SUPPORTED = {'.pdf', '.docx', '.pptx', '.txt', '.md'} +DEBOUNCE_SECONDS = 120 +STATUS_FILE = "/home/aaron/aaronai/watcher_status.json" + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(message)s', + handlers=[ + logging.FileHandler(LOG_FILE), + logging.StreamHandler() + ] +) + +ingestion_state = { + "status": "idle", + "message": "", + "file_count": 0, + "started_at": None, + "finished_at": None, + "last_error": "", +} +ingestion_lock = threading.Lock() +ingestion_thread = None + + +def set_ingestion_state(**kwargs): + with ingestion_lock: + ingestion_state.update(kwargs) + + +def load_state(): + if Path(STATE_FILE).exists(): + with open(STATE_FILE) as f: + return json.load(f) + return {} + + +def save_state(state): + with open(STATE_FILE, 'w') as f: + json.dump(state, f) + + +def get_changed_files(): + state = load_state() + changed = [] + root = Path(NEXTCLOUD_PATH) + for path in root.rglob("*"): + if path.is_dir(): + continue + if path.suffix.lower() not in SUPPORTED: + continue + if path.name.startswith('.') or path.name.startswith('~$'): + continue + mtime = str(path.stat().st_mtime) + key = str(path) + if state.get(key) != mtime: + changed.append(path) + return changed, state + + +def run_ingestion(): + changed, state = get_changed_files() + if not changed: + logging.info("No new or changed files detected — skipping ingestion.") + set_ingestion_state(status="idle", message="No changes detected", file_count=0) + return + + count = len(changed) + logging.info(f"Found {count} new or changed files — starting ingestion...") + set_ingestion_state( + status="ingesting", + message=f"Ingesting {count} file(s)...", + file_count=count, + started_at=time.time(), + finished_at=None, + last_error="", + ) + + try: + result = subprocess.run( + [PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH], + capture_output=True, + text=True, + timeout=1800 + ) + if result.returncode == 0: + root = Path(NEXTCLOUD_PATH) + for path in root.rglob("*"): + if path.is_file() and path.suffix.lower() in SUPPORTED: + state[str(path)] = str(path.stat().st_mtime) + save_state(state) + logging.info("Ingestion complete. State updated.") + set_ingestion_state( + status="idle", + message=f"Last run: ingested {count} file(s) successfully", + finished_at=time.time(), + ) + else: + logging.error(f"Ingestion error: {result.stderr}") + set_ingestion_state( + status="error", + message="Ingestion failed — see log", + last_error=result.stderr[-300:], + finished_at=time.time(), + ) + except subprocess.TimeoutExpired: + logging.error("Ingestion timed out.") + set_ingestion_state( + status="error", + message="Ingestion timed out (>30 min)", + last_error="TimeoutExpired", + finished_at=time.time(), + ) + except Exception as e: + logging.error(f"Ingestion failed: {e}") + set_ingestion_state( + status="error", + message=f"Ingestion exception: {e}", + last_error=str(e), + finished_at=time.time(), + ) + + +def start_ingestion_thread(): + global ingestion_thread + if ingestion_thread and ingestion_thread.is_alive(): + logging.info("Ingestion already running — skipping.") + return + ingestion_thread = threading.Thread(target=run_ingestion, daemon=True) + ingestion_thread.start() + + +class IngestHandler(FileSystemEventHandler): + def __init__(self): + self.pending = False + self.last_event = 0 + + def on_any_event(self, event): + if event.is_directory: + return + path = Path(event.src_path) + if path.suffix.lower() not in SUPPORTED: + return + if path.name.startswith('.') or path.name.startswith('~$'): + return + if 'Admin/Backups' in str(path) or 'Backups' in path.parts: + return + if 'Journal/Media' in str(path): + return + if event.event_type not in ('modified', 'created', 'moved'): + return + logging.info(f"Event: {event.event_type} {event.src_path}") + self.pending = True + self.last_event = time.time() + + +def write_status(handler): + with ingestion_lock: + status = { + "running": True, + "timestamp": time.time(), + "pending": handler.pending, + "last_event": handler.last_event, + "ingestion": dict(ingestion_state), + } + with open(STATUS_FILE, 'w') as f: + json.dump(status, f) + + +def main(): + logging.info("Aaron AI Watcher starting...") + logging.info(f"Watching: {NEXTCLOUD_PATH}") + + handler = IngestHandler() + observer = Observer() + observer.schedule(handler, NEXTCLOUD_PATH, recursive=True) + observer.start() + + try: + while True: + write_status(handler) + if handler.pending: + elapsed = time.time() - handler.last_event + if elapsed >= DEBOUNCE_SECONDS: + handler.pending = False + start_ingestion_thread() + time.sleep(5) + except KeyboardInterrupt: + observer.stop() + observer.join() + logging.info("Watcher stopped.") + + +if __name__ == "__main__": + main() diff --git a/scripts/watcher.py.bak.20260501-004131 b/scripts/watcher.py.bak.20260501-004131 new file mode 100644 index 0000000..dfedf34 --- /dev/null +++ b/scripts/watcher.py.bak.20260501-004131 @@ -0,0 +1,448 @@ +""" +Aaron AI Watcher — Stage 1 of the encoding pipeline. + +Watches the Nextcloud directory for new or changed files. +On detection, chunks + embeds documents in-process (no subprocess), +then enqueues to stage_2_queue for async cascade processing. + +Design principles: +- Embedding model loaded ONCE at startup, reused across all ingest runs +- In-process ingest (no subprocess) — eliminates per-run model reload memory spike +- Missed-file recovery on startup — ingests anything new since last state +- Heartbeat file updated every loop tick — enables external health monitoring +- Parity principle: no filtering, no decisions, faithful capture +- Does NOT enqueue to stage_2_queue during bulk migration (SKIP_STAGE2_ENQUEUE env var) + +Architecture: Stage 1 (watcher) -> stage_2_queue -> Stage 2 (Mistral) -> stage_3_queue -> Stage 3 (Graphiti) +""" + +import os +import time +import json +import hashlib +import logging +import threading +from pathlib import Path + +import psycopg2 +from dotenv import load_dotenv +from sentence_transformers import SentenceTransformer +from watchdog.observers import Observer +from watchdog.events import FileSystemEventHandler + +from docx import Document as DocxDocument +from pypdf import PdfReader +from pptx import Presentation + +load_dotenv(Path.home() / "aaronai" / ".env", override=True) + +NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files" +LOG_FILE = "/home/aaron/aaronai/watcher.log" +STATE_FILE = "/home/aaron/aaronai/watcher_state.json" +STATUS_FILE = "/home/aaron/aaronai/watcher_status.json" +HEARTBEAT_FILE = "/home/aaron/aaronai/watcher_heartbeat" + +SUPPORTED = {".pdf", ".docx", ".pptx", ".txt", ".md"} +DEBOUNCE_SECONDS = 120 +CHUNK_SIZE = 500 +CHUNK_OVERLAP = 50 +EMBED_MODEL = "all-MiniLM-L6-v2" + +PG_DSN = os.getenv("PG_DSN") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [watcher] %(levelname)s %(message)s", + handlers=[logging.FileHandler(LOG_FILE)], +) +log = logging.getLogger("watcher") + +ingestion_lock = threading.Lock() +ingestion_state = { + "status": "idle", "message": "", "file_count": 0, + "started_at": None, "finished_at": None, "last_error": "", +} +ingestion_thread = None + + +def load_embedder(): + log.info(f"Loading embedding model: {EMBED_MODEL}") + model = SentenceTransformer(EMBED_MODEL) + log.info("Embedding model ready.") + return model + + +def get_pg(): + return psycopg2.connect(PG_DSN) + + +def extract_text(path: Path) -> str: + suffix = path.suffix.lower() + try: + if suffix == ".docx": + doc = DocxDocument(path) + return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) + elif suffix == ".pdf": + reader = PdfReader(path) + return "".join( + page.extract_text() + "\n" + for page in reader.pages if page.extract_text() + ) + elif suffix == ".pptx": + prs = Presentation(path) + return "\n".join( + shape.text for slide in prs.slides + for shape in slide.shapes + if hasattr(shape, "text") and shape.text.strip() + ) + elif suffix in {".txt", ".md"}: + return path.read_text(encoding="utf-8", errors="ignore") + except Exception as e: + log.warning(f"Text extraction failed for {path.name}: {e}") + record_ingest_failure(path, f"Text extraction failed: {e}") + return "" + + +def chunk_text(text: str) -> list: + words = text.split() + chunks = [] + start = 0 + while start < len(words): + chunk = " ".join(words[start:start + CHUNK_SIZE]) + if chunk.strip(): + chunks.append(chunk) + start += CHUNK_SIZE - CHUNK_OVERLAP + return chunks + + +def make_chunk_id(filepath: Path, chunk_index: int) -> str: + return hashlib.md5(str(filepath).encode()).hexdigest()[:8] + f"_{chunk_index}" + + +def enqueue_stage2(source: str, full_text: str): + if os.getenv("SKIP_STAGE2_ENQUEUE"): + return + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + INSERT INTO stage_2_queue (source, full_text, char_length) + VALUES (%s, %s, %s) + ON CONFLICT (source) DO UPDATE SET + full_text = EXCLUDED.full_text, + char_length = EXCLUDED.char_length, + enqueued_at = NOW(), + completed_at = NULL, + failed_at = NULL, + attempts = 0 + """, (source, full_text[:50000], len(full_text))) + pg.commit() + pg.close() + except Exception as e: + log.warning(f"Stage 2 enqueue failed (non-fatal): {e}") + + +def record_ingest_failure(filepath: Path, error: str): + """Write extraction or ingest failure to ingest_failures table for UI visibility.""" + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at) + VALUES (%s, %s, %s, 0, NOW(), NOW()) + ON CONFLICT (source) DO UPDATE SET + error = EXCLUDED.error, + retry_count = ingest_failures.retry_count + 1, + last_failed_at = NOW(), + resolved = FALSE + """, (filepath.name, str(filepath), error[:1000])) + pg.commit() + pg.close() + except Exception as e: + log.warning(f"Could not record ingest failure (non-fatal): {e}") + + +def resolve_ingest_failure(source: str): + """Mark a previously failed file as resolved after successful ingest.""" + try: + pg = get_pg() + cur = pg.cursor() + cur.execute("UPDATE ingest_failures SET resolved = TRUE WHERE source = %s", (source,)) + pg.commit() + pg.close() + except Exception as e: + log.warning(f"Could not resolve ingest failure record (non-fatal): {e}") + + +def ingest_file(filepath: Path, embedder) -> int: + if filepath.name.startswith(("~$", ".")): + return 0 + if filepath.suffix.lower() not in SUPPORTED: + return 0 + text = extract_text(filepath) + if not text.strip(): + return 0 + chunks = chunk_text(text) + if not chunks: + return 0 + try: + embeddings = embedder.encode(chunks).tolist() + except Exception as e: + log.error(f"Embedding failed for {filepath.name}: {e}") + record_ingest_failure(filepath, f"Embedding failed: {e}") + return 0 + source = filepath.name + try: + pg = get_pg() + cur = pg.cursor() + for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)): + chunk_id = make_chunk_id(filepath, i) + cur.execute(""" + INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata) + VALUES (%s, %s, %s::vector, %s, %s, NOW(), %s) + ON CONFLICT (id) DO UPDATE SET + document = EXCLUDED.document, + embedding = EXCLUDED.embedding, + source = EXCLUDED.source, + metadata = EXCLUDED.metadata + """, (chunk_id, chunk, embedding, source, "document", + json.dumps({"source": source, "filepath": str(filepath)}))) + pg.commit() + pg.close() + except Exception as e: + log.error(f"pgvector write failed for {filepath.name}: {e}") + record_ingest_failure(filepath, f"pgvector write failed: {e}") + return 0 + log.info(f"Indexed {len(chunks)} chunks: {filepath.name}") + resolve_ingest_failure(source) + enqueue_stage2(source, text) + return len(chunks) + + +def ingest_files(paths: list, embedder, state: dict) -> dict: + total = 0 + for path in paths: + count = ingest_file(path, embedder) + total += count + state[str(path)] = str(path.stat().st_mtime) + log.info(f"Ingestion complete. {total} chunks across {len(paths)} files.") + return state + + +def load_state() -> dict: + if Path(STATE_FILE).exists(): + try: + with open(STATE_FILE) as f: + return json.load(f) + except Exception: + pass + return {} + + +def save_state(state: dict): + with open(STATE_FILE, "w") as f: + json.dump(state, f) + + +def get_changed_files(state: dict) -> list: + changed = [] + root = Path(NEXTCLOUD_PATH) + for path in root.rglob("*"): + if path.is_dir(): + continue + if path.suffix.lower() not in SUPPORTED: + continue + if path.name.startswith((".", "~$")): + continue + if "Admin/Backups" in str(path) or "Backups" in path.parts: + continue + if "Journal/Media" in str(path): + continue + if state.get(str(path)) != str(path.stat().st_mtime): + changed.append(path) + return changed + + +def set_ingestion_state(**kwargs): + with ingestion_lock: + ingestion_state.update(kwargs) + + +def write_status(handler): + with ingestion_lock: + status = { + "running": True, "timestamp": time.time(), + "pending": handler.pending, "last_event": handler.last_event, + "ingestion": dict(ingestion_state), + } + try: + with open(STATUS_FILE, "w") as f: + json.dump(status, f) + except Exception: + pass + + +def write_heartbeat(): + try: + Path(HEARTBEAT_FILE).write_text(str(time.time())) + except Exception: + pass + + +def run_ingestion(embedder): + state = load_state() + changed = get_changed_files(state) + if not changed: + log.info("No new or changed files — skipping ingestion.") + set_ingestion_state(status="idle", message="No changes detected", file_count=0) + return + count = len(changed) + log.info(f"Found {count} new or changed files — starting ingestion...") + set_ingestion_state( + status="ingesting", message=f"Ingesting {count} file(s)...", + file_count=count, started_at=time.time(), finished_at=None, last_error="", + ) + try: + state = ingest_files(changed, embedder, state) + save_state(state) + set_ingestion_state( + status="idle", + message=f"Last run: ingested {count} file(s) successfully", + finished_at=time.time(), + ) + except Exception as e: + log.error(f"Ingestion failed: {e}") + set_ingestion_state( + status="error", message=f"Ingestion exception: {e}", + last_error=str(e), finished_at=time.time(), + ) + + +def start_ingestion_thread(embedder): + global ingestion_thread + with ingestion_lock: + if ingestion_thread and ingestion_thread.is_alive(): + log.info("Ingestion already running — skipping.") + return + ingestion_thread = threading.Thread( + target=run_ingestion, args=(embedder,), daemon=True + ) + ingestion_thread.start() + + +class IngestHandler(FileSystemEventHandler): + def __init__(self): + self.pending = False + self.last_event = 0 + + def _should_ignore(self, path: Path) -> bool: + if path.name.startswith((".", "~$")): + return True + if "Admin/Backups" in str(path) or "Backups" in path.parts: + return True + if "Journal/Media" in str(path): + return True + return False + + def on_created(self, event): + if event.is_directory: + return + path = Path(event.src_path) + if path.suffix.lower() not in SUPPORTED or self._should_ignore(path): + return + log.info(f"Event: created {path}") + self.pending = True + self.last_event = time.time() + + def on_modified(self, event): + if event.is_directory: + return + path = Path(event.src_path) + if path.suffix.lower() not in SUPPORTED or self._should_ignore(path): + return + log.info(f"Event: modified {path}") + self.pending = True + self.last_event = time.time() + + def on_moved(self, event): + if event.is_directory: + return + # Nextcloud WebDAV writes .part temp files then renames to final path. + # src_path is the .part file; dest_path is the final filename. + dest = Path(event.dest_path) + if dest.suffix.lower() not in SUPPORTED or self._should_ignore(dest): + return + log.info(f"Event: moved -> {dest}") + self.pending = True + self.last_event = time.time() + + def on_closed(self, event): + # FileClosedEvent fires on the final file after Nextcloud completes write. + # Belt-and-suspenders catch for any write pattern not caught by on_moved. + if event.is_directory: + return + path = Path(event.src_path) + if path.suffix.lower() not in SUPPORTED or self._should_ignore(path): + return + log.info(f"Event: closed {path}") + self.pending = True + self.last_event = time.time() + + +def main(): + log.info("Aaron AI Watcher starting...") + log.info(f"Watching: {NEXTCLOUD_PATH}") + + embedder = load_embedder() + + log.info("Startup scan: checking for files missed since last run...") + state = load_state() + missed = get_changed_files(state) + if missed: + log.info(f"Startup recovery: {len(missed)} missed file(s) — ingesting now.") + set_ingestion_state( + status="ingesting", + message=f"Startup recovery: ingesting {len(missed)} missed file(s)...", + file_count=len(missed), started_at=time.time(), + ) + try: + state = ingest_files(missed, embedder, state) + save_state(state) + set_ingestion_state( + status="idle", + message=f"Startup recovery complete: {len(missed)} file(s) ingested.", + finished_at=time.time(), + ) + except Exception as e: + log.error(f"Startup recovery failed: {e}") + set_ingestion_state(status="error", message=str(e), + last_error=str(e), finished_at=time.time()) + else: + log.info("Startup scan: no missed files.") + + handler = IngestHandler() + observer = Observer() + observer.schedule(handler, NEXTCLOUD_PATH, recursive=True) + observer.start() + log.info("Observer started.") + + try: + while True: + write_heartbeat() + write_status(handler) + if handler.pending: + elapsed = time.time() - handler.last_event + if elapsed >= DEBOUNCE_SECONDS: + handler.pending = False + start_ingestion_thread(embedder) + time.sleep(5) + except KeyboardInterrupt: + log.info("KeyboardInterrupt — stopping.") + observer.stop() + + observer.join() + log.info("Watcher stopped.") + + +if __name__ == "__main__": + main() diff --git a/watcher_heartbeat b/watcher_heartbeat new file mode 100644 index 0000000..eca1acd --- /dev/null +++ b/watcher_heartbeat @@ -0,0 +1 @@ +1777602395.781194 \ No newline at end of file