Compare commits
11 Commits
7c7b649775
...
7b77794319
| Author | SHA1 | Date | |
|---|---|---|---|
| 7b77794319 | |||
| d985f9e91e | |||
| b9eea6cb62 | |||
| 93c0d89308 | |||
| f18fb64fe5 | |||
| 72e07afc03 | |||
| c3011c80a5 | |||
| 4204806c80 | |||
| c5fc517fef | |||
| b35d44ef58 | |||
| a27f22ceaf |
+36
-19
@@ -38,6 +38,19 @@ load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
MEMORY_PATH = Path.home() / "aaronai" / "memory.md"
|
||||
CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
|
||||
|
||||
def _connect(path):
|
||||
conn = sqlite3.connect(path, timeout=5.0)
|
||||
conn.execute("PRAGMA synchronous=NORMAL")
|
||||
conn.execute("PRAGMA foreign_keys=ON")
|
||||
return conn
|
||||
|
||||
def _connect_conversations():
|
||||
return _connect(CONVERSATIONS_DB)
|
||||
|
||||
def _connect_sessions():
|
||||
return _connect(SESSIONS_DB)
|
||||
|
||||
SETTINGS_PATH = Path.home() / "aaronai" / "settings.json"
|
||||
WATCHER_LOG = str(Path.home() / "aaronai" / "watcher.log")
|
||||
WATCHER_STATE = str(Path.home() / "aaronai" / "watcher_state.json")
|
||||
@@ -73,7 +86,7 @@ WHISPER_PROMPT = (
|
||||
whisper_model = None
|
||||
if HAS_WHISPER:
|
||||
try:
|
||||
whisper_model = WhisperModel("large-v3", device="cpu", compute_type="int8", cpu_threads=8)
|
||||
whisper_model = WhisperModel("distil-large-v3", device="cpu", compute_type="int8", cpu_threads=4)
|
||||
print("Whisper model loaded")
|
||||
except Exception as e:
|
||||
print(f"Whisper not available: {e}")
|
||||
@@ -122,8 +135,9 @@ SESSION_PASSWORD = os.getenv("AARON_AI_PASSWORD", "changeme")
|
||||
SESSIONS_DB = str(Path.home() / "aaronai" / "sessions.db")
|
||||
|
||||
def _init_sessions():
|
||||
conn = sqlite3.connect(SESSIONS_DB)
|
||||
conn = _connect_sessions()
|
||||
conn.execute("CREATE TABLE IF NOT EXISTS sessions (token TEXT PRIMARY KEY, created_at TEXT)")
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
@@ -136,19 +150,19 @@ def hash_password(password: str) -> str:
|
||||
return hashlib.sha256(password.encode()).hexdigest()
|
||||
|
||||
def save_session(token: str):
|
||||
conn = sqlite3.connect(SESSIONS_DB)
|
||||
conn = _connect_sessions()
|
||||
conn.execute("INSERT OR REPLACE INTO sessions VALUES (?, ?)", (token, datetime.now().isoformat()))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def delete_session(token: str):
|
||||
conn = sqlite3.connect(SESSIONS_DB)
|
||||
conn = _connect_sessions()
|
||||
conn.execute("DELETE FROM sessions WHERE token = ?", (token,))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def session_exists(token: str) -> bool:
|
||||
conn = sqlite3.connect(SESSIONS_DB)
|
||||
conn = _connect_sessions()
|
||||
row = conn.execute("SELECT 1 FROM sessions WHERE token = ?", (token,)).fetchone()
|
||||
conn.close()
|
||||
return row is not None
|
||||
@@ -163,7 +177,7 @@ def require_auth(request: Request):
|
||||
return token
|
||||
|
||||
def init_conversations_db():
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute('''CREATE TABLE IF NOT EXISTS conversations (
|
||||
id TEXT PRIMARY KEY,
|
||||
@@ -182,6 +196,8 @@ def init_conversations_db():
|
||||
timestamp TEXT NOT NULL,
|
||||
FOREIGN KEY (conversation_id) REFERENCES conversations(id)
|
||||
)''')
|
||||
c.execute("PRAGMA journal_mode=WAL")
|
||||
c.execute("CREATE INDEX IF NOT EXISTS idx_messages_conv_ts ON messages(conversation_id, timestamp DESC)")
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
@@ -250,7 +266,7 @@ def retrieve_context(query, n_results=8):
|
||||
return context_pieces, sources
|
||||
|
||||
def get_conversation_history(conversation_id, limit=20):
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute('''SELECT role, content FROM messages
|
||||
WHERE conversation_id = ?
|
||||
@@ -260,7 +276,7 @@ def get_conversation_history(conversation_id, limit=20):
|
||||
return [{"role": r[0], "content": r[1]} for r in reversed(rows)]
|
||||
|
||||
def save_message(conversation_id, role, content, sources=None):
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
msg_id = hashlib.md5(f"{conversation_id}{role}{datetime.now().isoformat()}".encode()).hexdigest()
|
||||
timestamp = datetime.now().isoformat()
|
||||
@@ -274,7 +290,7 @@ def save_message(conversation_id, role, content, sources=None):
|
||||
conn.close()
|
||||
|
||||
def create_conversation(title="New conversation"):
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
conv_id = hashlib.md5(f"{datetime.now().isoformat()}".encode()).hexdigest()[:16]
|
||||
now = datetime.now().isoformat()
|
||||
@@ -409,7 +425,7 @@ async def update_settings(request: Request, auth: str = Depends(require_auth)):
|
||||
|
||||
@app.get("/api/conversations")
|
||||
async def list_conversations(auth: str = Depends(require_auth)):
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute('''SELECT id, title, created_at, updated_at, message_count
|
||||
FROM conversations ORDER BY updated_at DESC LIMIT 100''')
|
||||
@@ -429,7 +445,7 @@ async def new_conversation(request: Request, auth: str = Depends(require_auth)):
|
||||
|
||||
@app.get("/api/conversations/{conv_id}/messages")
|
||||
async def get_messages(conv_id: str, auth: str = Depends(require_auth)):
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute('''SELECT role, content, sources, timestamp FROM messages
|
||||
WHERE conversation_id = ? ORDER BY timestamp ASC''', (conv_id,))
|
||||
@@ -446,7 +462,7 @@ async def rename_conversation(conv_id: str, request: Request, auth: str = Depend
|
||||
title = data.get("title", "")
|
||||
if not title:
|
||||
return JSONResponse({"error": "Title required"}, status_code=400)
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute("UPDATE conversations SET title = ? WHERE id = ?", (title, conv_id))
|
||||
conn.commit()
|
||||
@@ -455,7 +471,7 @@ async def rename_conversation(conv_id: str, request: Request, auth: str = Depend
|
||||
|
||||
@app.delete("/api/conversations/{conv_id}")
|
||||
async def delete_conversation(conv_id: str, auth: str = Depends(require_auth)):
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute("DELETE FROM messages WHERE conversation_id = ?", (conv_id,))
|
||||
c.execute("DELETE FROM conversations WHERE id = ?", (conv_id,))
|
||||
@@ -500,14 +516,14 @@ async def chat_endpoint(request: Request, auth: str = Depends(require_auth)):
|
||||
save_message(conversation_id, "user", user_message)
|
||||
|
||||
# Auto-title conversation from first message
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute("SELECT message_count, title FROM conversations WHERE id = ?", (conversation_id,))
|
||||
row = c.fetchone()
|
||||
conn.close()
|
||||
if row and row[0] <= 1 and row[1] == "New conversation":
|
||||
auto_title = user_message[:60] + ("..." if len(user_message) > 60 else "")
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute("UPDATE conversations SET title = ? WHERE id = ?", (auto_title, conversation_id))
|
||||
conn.commit()
|
||||
@@ -587,7 +603,7 @@ async def get_status(auth: str = Depends(require_auth)):
|
||||
pass
|
||||
|
||||
# Conversation count
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute("SELECT COUNT(*) FROM conversations")
|
||||
conv_count = c.fetchone()[0]
|
||||
@@ -623,6 +639,7 @@ async def transcribe_audio(request: Request, audio: UploadFile = File(...), auth
|
||||
tmp_path,
|
||||
language="en",
|
||||
vad_filter=True,
|
||||
beam_size=1,
|
||||
initial_prompt=WHISPER_PROMPT
|
||||
)
|
||||
transcript = " ".join(s.text.strip() for s in segments)
|
||||
@@ -674,7 +691,7 @@ def transcribe_and_save(tmp_path, timestamp, nextcloud_url, nextcloud_user, next
|
||||
nc_auth = (nextcloud_user, nextcloud_password)
|
||||
try:
|
||||
segments, _ = whisper_model.transcribe(
|
||||
tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
|
||||
tmp_path, language="en", vad_filter=True, beam_size=1, initial_prompt=WHISPER_PROMPT
|
||||
)
|
||||
transcript = " ".join(s.text.strip() for s in segments).strip()
|
||||
os.unlink(tmp_path)
|
||||
@@ -760,7 +777,7 @@ async def capture_endpoint(
|
||||
tmp.write(audio_bytes)
|
||||
tmp_audio_path = tmp.name
|
||||
segments, _ = whisper_model.transcribe(
|
||||
tmp_audio_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
|
||||
tmp_audio_path, language="en", vad_filter=True, beam_size=1, initial_prompt=WHISPER_PROMPT
|
||||
)
|
||||
voice_annotation = " ".join(s.text.strip() for s in segments).strip() or None
|
||||
os.unlink(tmp_audio_path)
|
||||
@@ -969,7 +986,7 @@ async def reindex_status(auth: str = Depends(require_auth)):
|
||||
|
||||
@app.delete("/api/conversations")
|
||||
async def clear_all_conversations(auth: str = Depends(require_auth)):
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
conn = _connect_conversations()
|
||||
c = conn.cursor()
|
||||
c.execute("DELETE FROM messages")
|
||||
c.execute("DELETE FROM conversations")
|
||||
|
||||
+1
-1
@@ -6,7 +6,7 @@ mkdir -p "$BACKUP_DIR"
|
||||
# Copy critical files
|
||||
cp ~/aaronai/memory.md "$BACKUP_DIR/memory-$DATE.md"
|
||||
cp ~/aaronai/settings.json "$BACKUP_DIR/settings-$DATE.json"
|
||||
cp ~/aaronai/conversations.db "$BACKUP_DIR/conversations-$DATE.db"
|
||||
python3 -c "import sqlite3, sys; src = sqlite3.connect('$HOME/aaronai/conversations.db'); dst = sqlite3.connect('$BACKUP_DIR/conversations-$DATE.db'); src.backup(dst); dst.close(); src.close()"
|
||||
|
||||
# Keep only last 7 days
|
||||
find "$BACKUP_DIR" -name "*.md" -mtime +7 -delete
|
||||
|
||||
+23
-5
@@ -16,6 +16,7 @@ import os
|
||||
import json
|
||||
import sqlite3
|
||||
import argparse
|
||||
from functools import lru_cache
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
@@ -283,6 +284,11 @@ def retrieve_graphiti(mode, task=None, n_results=8, excluded_sources=None):
|
||||
print(f"[Graphiti retrieval error: {e}] — falling back to empty.")
|
||||
return []
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def _get_embedder():
|
||||
from sentence_transformers import SentenceTransformer
|
||||
return SentenceTransformer("all-MiniLM-L6-v2")
|
||||
|
||||
def retrieve(mode, task=None, n_results=8, excluded_sources=None, type_filter=None):
|
||||
# E3 experiment: DREAMER_SUBSTRATE=graphiti routes retrieval to Graphiti /search
|
||||
# Default behavior: pgvector similarity search (unchanged)
|
||||
@@ -291,8 +297,7 @@ def retrieve(mode, task=None, n_results=8, excluded_sources=None, type_filter=No
|
||||
substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
|
||||
if substrate == "graphiti":
|
||||
return retrieve_graphiti(mode, task=task, n_results=n_results, excluded_sources=excluded_sources)
|
||||
from sentence_transformers import SentenceTransformer
|
||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
embedder = _get_embedder()
|
||||
low, high = MODE_RANGES[mode]
|
||||
|
||||
if task:
|
||||
@@ -480,10 +485,11 @@ def write_manifest(date_str, stage_data, corpus_data):
|
||||
auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD)
|
||||
url = f"{DREAMS_WEBDAV}/dream-manifest-{date_str}.json"
|
||||
try:
|
||||
requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30)
|
||||
response = requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30)
|
||||
response.raise_for_status()
|
||||
print(f"Manifest written: Journal/Dreams/dream-manifest-{date_str}.json")
|
||||
except Exception as e:
|
||||
print(f"Manifest write failed (non-critical): {e}")
|
||||
print(f"Manifest write failed — manifest not persisted: {e}")
|
||||
|
||||
|
||||
def dream_pipeline(type_filter=None):
|
||||
@@ -613,8 +619,20 @@ def dream_pipeline(type_filter=None):
|
||||
# Write manifest
|
||||
all_session_sources = list(session_retrieved)
|
||||
all_session_folders = list({extract_folder(s) for s in all_session_sources})
|
||||
total_chunks = 0
|
||||
pg = None
|
||||
try:
|
||||
pg = get_pg()
|
||||
cur = pg.cursor()
|
||||
cur.execute("SELECT COUNT(*) FROM embeddings")
|
||||
total_chunks = cur.fetchone()[0]
|
||||
except Exception as e:
|
||||
print(f"total_chunks query failed (non-critical): {e}")
|
||||
finally:
|
||||
if pg is not None:
|
||||
pg.close()
|
||||
corpus_data = {
|
||||
"total_chunks": delta.get("new_chunks", 0),
|
||||
"total_chunks": total_chunks,
|
||||
"new_chunks_since_last_dream": delta.get("new_chunks", 0),
|
||||
"days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
|
||||
"substrate": "pgvector",
|
||||
|
||||
+48
-6
@@ -25,6 +25,30 @@ DEFAULT_CHUNK_SIZE = 500
|
||||
DEFAULT_CHUNK_OVERLAP = 50
|
||||
|
||||
|
||||
def _docx_cell_paragraphs(cell):
|
||||
yield from (p for p in cell.paragraphs if p.text.strip())
|
||||
for nested in cell.tables:
|
||||
for row in nested.rows:
|
||||
for c in row.cells:
|
||||
yield from _docx_cell_paragraphs(c)
|
||||
|
||||
|
||||
def _pptx_shape_text(shape):
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||||
parts = []
|
||||
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
||||
for sub in shape.shapes:
|
||||
parts.extend(_pptx_shape_text(sub))
|
||||
return parts
|
||||
if hasattr(shape, "text") and shape.text.strip():
|
||||
parts.append(shape.text)
|
||||
if getattr(shape, "has_table", False):
|
||||
for cell in shape.table.iter_cells():
|
||||
if cell.text.strip():
|
||||
parts.append(cell.text)
|
||||
return parts
|
||||
|
||||
|
||||
def extract_text(filepath: Path) -> str:
|
||||
"""Return the text of a supported file. Returns "" on any failure or
|
||||
unsupported extension. Does not write to ingest_failures — caller decides."""
|
||||
@@ -32,7 +56,21 @@ def extract_text(filepath: Path) -> str:
|
||||
try:
|
||||
if suffix == ".docx":
|
||||
doc = DocxDocument(filepath)
|
||||
return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
|
||||
parts = [p.text for p in doc.paragraphs if p.text.strip()]
|
||||
for tbl in doc.tables:
|
||||
for row in tbl.rows:
|
||||
for cell in row.cells:
|
||||
parts.extend(p.text for p in _docx_cell_paragraphs(cell))
|
||||
for section in doc.sections:
|
||||
parts.extend(p.text for p in section.header.paragraphs if p.text.strip())
|
||||
parts.extend(p.text for p in section.footer.paragraphs if p.text.strip())
|
||||
from docx.oxml.ns import qn
|
||||
for txbx in doc.element.body.findall(".//" + qn("w:txbxContent")):
|
||||
for p in txbx.findall(".//" + qn("w:p")):
|
||||
text = "".join(t.text or "" for t in p.findall(".//" + qn("w:t")))
|
||||
if text.strip():
|
||||
parts.append(text)
|
||||
return "\n".join(parts)
|
||||
elif suffix == ".pdf":
|
||||
reader = PdfReader(filepath)
|
||||
return "".join(
|
||||
@@ -41,11 +79,15 @@ def extract_text(filepath: Path) -> str:
|
||||
)
|
||||
elif suffix == ".pptx":
|
||||
prs = Presentation(filepath)
|
||||
return "\n".join(
|
||||
shape.text for slide in prs.slides
|
||||
for shape in slide.shapes
|
||||
if hasattr(shape, "text") and shape.text.strip()
|
||||
)
|
||||
parts = []
|
||||
for slide in prs.slides:
|
||||
for shape in slide.shapes:
|
||||
parts.extend(_pptx_shape_text(shape))
|
||||
if slide.has_notes_slide:
|
||||
notes = slide.notes_slide.notes_text_frame.text
|
||||
if notes.strip():
|
||||
parts.append(notes)
|
||||
return "\n".join(parts)
|
||||
elif suffix in {".txt", ".md"}:
|
||||
return filepath.read_text(encoding="utf-8", errors="ignore")
|
||||
except Exception as e:
|
||||
|
||||
@@ -18,8 +18,14 @@ CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
|
||||
PG_DSN = os.getenv("PG_DSN")
|
||||
MIN_EXCHANGES = 3
|
||||
|
||||
_embedder = None
|
||||
|
||||
def get_embedder():
|
||||
global _embedder
|
||||
if _embedder is None:
|
||||
print("Loading embedding model...")
|
||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
_embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
return _embedder
|
||||
|
||||
def get_conversations():
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
@@ -123,7 +129,7 @@ def run():
|
||||
|
||||
# Embed and insert
|
||||
texts = [c[1] for c in new_chunks]
|
||||
embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
|
||||
embeddings = get_embedder().encode(texts, show_progress_bar=False).tolist()
|
||||
|
||||
for (chunk_id, chunk_text, meta), embedding in zip(new_chunks, embeddings):
|
||||
if not meta.get("type"):
|
||||
|
||||
+14
-3
@@ -124,7 +124,7 @@ def resolve_ingest_failure(source: str):
|
||||
|
||||
|
||||
def ingest_file(filepath: Path, embedder) -> int:
|
||||
if filepath.name.startswith(("~$", ".")):
|
||||
if filepath.name.startswith(("~$", "~", ".")):
|
||||
return 0
|
||||
if filepath.suffix.lower() not in SUPPORTED:
|
||||
return 0
|
||||
@@ -168,6 +168,7 @@ def ingest_files(paths: list, embedder, state: dict) -> dict:
|
||||
for path in paths:
|
||||
count = ingest_file(path, embedder)
|
||||
total += count
|
||||
if count > 0:
|
||||
state[str(path)] = str(path.stat().st_mtime)
|
||||
log.info(f"Ingestion complete. {total} chunks across {len(paths)} files.")
|
||||
return state
|
||||
@@ -196,12 +197,18 @@ def get_changed_files(state: dict) -> list:
|
||||
continue
|
||||
if path.suffix.lower() not in SUPPORTED:
|
||||
continue
|
||||
if path.name.startswith((".", "~$")):
|
||||
if path.name.startswith((".", "~$", "~")):
|
||||
continue
|
||||
if "Admin/Backups" in str(path) or "Backups" in path.parts:
|
||||
continue
|
||||
if "Journal/Media" in str(path):
|
||||
continue
|
||||
if "Generative Design" in path.parts and "Processing" in path.parts:
|
||||
continue
|
||||
if "Computational Design 2017" in path.parts and "Student Work" in path.parts:
|
||||
continue
|
||||
if path.stat().st_size == 0:
|
||||
continue
|
||||
if state.get(str(path)) != str(path.stat().st_mtime):
|
||||
changed.append(path)
|
||||
return changed
|
||||
@@ -280,12 +287,16 @@ class IngestHandler(FileSystemEventHandler):
|
||||
self.last_event = 0
|
||||
|
||||
def _should_ignore(self, path: Path) -> bool:
|
||||
if path.name.startswith((".", "~$")):
|
||||
if path.name.startswith((".", "~$", "~")):
|
||||
return True
|
||||
if "Admin/Backups" in str(path) or "Backups" in path.parts:
|
||||
return True
|
||||
if "Journal/Media" in str(path):
|
||||
return True
|
||||
if "Generative Design" in path.parts and "Processing" in path.parts:
|
||||
return True
|
||||
if "Computational Design 2017" in path.parts and "Student Work" in path.parts:
|
||||
return True
|
||||
return False
|
||||
|
||||
def on_created(self, event):
|
||||
|
||||
Reference in New Issue
Block a user