commit 22ef40bbaae18ca84aabea0299c03d8ce026e74f
Author: Aaron Nelson
Date:   Sat Apr 25 02:05:42 2026 +0000

    Initial commit - Aaron AI v1

diff --git a/scripts/api.py b/scripts/api.py
new file mode 100644
index 0000000..9af2251
--- /dev/null
+++ b/scripts/api.py
@@ -0,0 +1,490 @@
+import os
+import json
+import sqlite3
+import subprocess
+import hashlib
+from pathlib import Path
+from datetime import datetime
+from dotenv import load_dotenv
+import chromadb
+from sentence_transformers import SentenceTransformer
+import anthropic
+from fastapi import FastAPI, Request
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
+import uvicorn
+
+load_dotenv(Path.home() / "aaronai" / ".env")
+
+MEMORY_PATH = Path.home() / "aaronai" / "memory.md"
+DB_PATH = str(Path.home() / "aaronai" / "db")
+CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
+SETTINGS_PATH = Path.home() / "aaronai" / "settings.json"
+WATCHER_LOG = str(Path.home() / "aaronai" / "watcher.log")
+WATCHER_STATE = str(Path.home() / "aaronai" / "watcher_state.json")
+NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
+INGEST_SCRIPT = str(Path.home() / "aaronai" / "scripts" / "ingest.py")
+PYTHON = str(Path.home() / "aaronai" / "venv" / "bin" / "python3")
+
+DEFAULT_SETTINGS = {
+    "theme": "light",
+    "font_size": "medium",
+    "web_search": True,
+    "show_sources": True,
+}
+
+print("Loading Aaron AI...")
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+chroma_client = chromadb.PersistentClient(path=DB_PATH)
+collection = chroma_client.get_or_create_collection(
+    name="aaronai",
+    metadata={"hnsw:space": "cosine"}
+)
+anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+
+SYSTEM_PROMPT = """You are the personal AI assistant of Aaron Nelson — computational
+designer, fabrication researcher, program builder, and creative
+practitioner based in the Hudson Valley.
+
+Aaron's work sits at the intersection of computational geometry,
+additive manufacturing, and physical making. He resolves complex
+systems into physical reality — from Grasshopper definitions to
+large-scale steel structures, from archival photographs to 3D-printed
+architectural restorations, from product concepts to manufactured
+goods. This throughline — computation resolving into physical form —
+defines his practice across academic, consulting, and creative contexts.
+
+He built the Hudson Valley Additive Manufacturing Center (HVAMC) from
+nothing and has directed it since 2016, alongside the DDF academic
+program at SUNY New Paltz. He has the skills of a founder — equipment
+selection, policy creation, client development, grant writing,
+curriculum design — operating within an academic structure. He
+consults with IBM, Braskem, Selux, and others through FWN3D. He runs
+Mossygear as a product business. He makes large-scale fabricated art.
+
+His communication style is direct, precise, and intolerant of padding
+or overclaiming. He flags inaccuracies immediately and expects the
+same standard from you. When helping him write, match his voice —
+economical, specific, never performative. When answering questions,
+cite sources and acknowledge uncertainty rather than filling gaps with
+plausible-sounding content.
+
+You have access to his complete document corpus, conversation history,
+and a persistent memory file that carries his current context. Treat
+the memory file as ground truth for his present situation. Use web
+search automatically when current information is needed. Never
+re-brief on context that's already in memory or documents."""
+
+CV_SOURCES = ["Aaron Nelson CV 2024.pdf", "Aaron Nelson CV 2025.pdf", "Aaron Nelson - CV.docx"]
+
+def init_conversations_db():
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    c.execute('''CREATE TABLE IF NOT EXISTS conversations (
+        id TEXT PRIMARY KEY,
+        title TEXT NOT NULL,
+        created_at TEXT NOT NULL,
+        updated_at TEXT NOT NULL,
+        model TEXT DEFAULT 'claude-sonnet-4-6',
+        message_count INTEGER DEFAULT 0
+    )''')
+    c.execute('''CREATE TABLE IF NOT EXISTS messages (
+        id TEXT PRIMARY KEY,
+        conversation_id TEXT NOT NULL,
+        role TEXT NOT NULL,
+        content TEXT NOT NULL,
+        sources TEXT DEFAULT '[]',
+        timestamp TEXT NOT NULL,
+        FOREIGN KEY (conversation_id) REFERENCES conversations(id)
+    )''')
+    conn.commit()
+    conn.close()
+
+init_conversations_db()
+
+def load_settings():
+    if SETTINGS_PATH.exists():
+        try:
+            s = json.loads(SETTINGS_PATH.read_text())
+            return {**DEFAULT_SETTINGS, **s}
+        except (json.JSONDecodeError, OSError):
+            pass
+    return DEFAULT_SETTINGS.copy()
+
+def save_settings(settings):
+    SETTINGS_PATH.write_text(json.dumps(settings, indent=2))
+
+def load_memory():
+    if MEMORY_PATH.exists():
+        return MEMORY_PATH.read_text(encoding="utf-8")
+    return ""
+
+def save_memory(content):
+    MEMORY_PATH.write_text(content, encoding="utf-8")
+
+def add_to_memory(item):
+    memory = load_memory()
+    timestamp = datetime.now().strftime("%Y-%m-%d")
+    note = f"\n- [{timestamp}] {item}"
+    if "## Notes" not in memory:
+        memory += "\n\n## Notes"
+    memory += note
+    save_memory(memory)
+
+def remove_from_memory(item):
+    memory = load_memory()
+    lines = memory.split("\n")
+    filtered = [l for l in lines if item.lower() not in l.lower()]
+    save_memory("\n".join(filtered))
+    return len(lines) - len(filtered)
+
+def get_pinned_cv_context():
+    try:
+        results = collection.get(
+            where={"source": {"$in": CV_SOURCES}},
+            include=["documents", "metadatas"]
+        )
+        return results["documents"], results["metadatas"]
+    except Exception:
+        return [], []
+
+def is_professional_query(query):
+    keywords = ["grant", "publication", "exhibition", "award", "fellowship",
+                "experience", "position", "job", "career", "cv", "resume",
+                "research", "work history", "accomplishment", "teaching",
+                "course", "client", "consultation", "presentation", "workshop",
+                "education", "degree", "institution", "service", "committee"]
+    return any(k in query.lower() for k in keywords)
+
+def retrieve_context(query, n_results=8):
+    query_embedding = embedder.encode([query]).tolist()
+    results = collection.query(
+        query_embeddings=query_embedding,
+        n_results=n_results,
+        include=["documents", "metadatas", "distances"]
+    )
+    context_pieces = []
+    sources = []
+    if is_professional_query(query):
+        cv_docs, cv_metas = get_pinned_cv_context()
+        for doc, meta in zip(cv_docs, cv_metas):
+            context_pieces.append(f"[CV] {doc}")
+            sources.append(meta.get("source", "CV"))
+    # Chroma returns cosine distance here, so relevance = 1 - distance is
+    # cosine similarity; the 0.3 cutoff keeps only reasonably similar chunks.
+    for doc, meta, dist in zip(
+        results["documents"][0],
+        results["metadatas"][0],
+        results["distances"][0]
+    ):
+        relevance = 1 - dist
+        if relevance > 0.3 and meta.get("source") not in CV_SOURCES:
+            context_pieces.append(doc)
+            sources.append(meta.get("source", "unknown"))
+    return context_pieces, sources
+def get_conversation_history(conversation_id, limit=20):
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    c.execute('''SELECT role, content FROM messages
+                 WHERE conversation_id = ?
+                 ORDER BY timestamp DESC LIMIT ?''', (conversation_id, limit))
+    rows = c.fetchall()
+    conn.close()
+    return [{"role": r[0], "content": r[1]} for r in reversed(rows)]
+
+def save_message(conversation_id, role, content, sources=None):
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    msg_id = hashlib.md5(f"{conversation_id}{role}{datetime.now().isoformat()}".encode()).hexdigest()
+    timestamp = datetime.now().isoformat()
+    c.execute('''INSERT INTO messages (id, conversation_id, role, content, sources, timestamp)
+                 VALUES (?, ?, ?, ?, ?, ?)''',
+              (msg_id, conversation_id, role, content,
+               json.dumps(sources or []), timestamp))
+    c.execute('''UPDATE conversations SET updated_at = ?, message_count = message_count + 1
+                 WHERE id = ?''', (timestamp, conversation_id))
+    conn.commit()
+    conn.close()
+
+def create_conversation(title="New conversation"):
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    conv_id = hashlib.md5(f"{datetime.now().isoformat()}".encode()).hexdigest()[:16]
+    now = datetime.now().isoformat()
+    c.execute('''INSERT INTO conversations (id, title, created_at, updated_at)
+                 VALUES (?, ?, ?, ?)''', (conv_id, title, now, now))
+    conn.commit()
+    conn.close()
+    return conv_id
+
+def chat(user_message, conversation_id, settings):
+    memory = load_memory()
+    context_pieces, sources = retrieve_context(user_message)
+    history = get_conversation_history(conversation_id)
+
+    context_parts = []
+    if memory:
+        context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
+    if context_pieces:
+        context_str = "\n\n---\n\n".join(context_pieces)
+        unique_sources = list(set(sources))
+        context_parts.append(
+            f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
+        )
+    context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
+    full_message = context_block + user_message
+
+    messages = history + [{"role": "user", "content": full_message}]
+
+    tools = [{"type": "web_search_20250305", "name": "web_search"}] if settings.get("web_search", True) else []
+
+    while True:
+        kwargs = {
+            "model": "claude-sonnet-4-6",
+            "max_tokens": 2048,
+            "system": SYSTEM_PROMPT,
+            "messages": messages
+        }
+        if tools:
+            kwargs["tools"] = tools
+
+        response = anthropic_client.messages.create(**kwargs)
+
+        if response.stop_reason == "tool_use":
+            # web_search is a server-side tool, so searches normally run on
+            # Anthropic's side; this branch acknowledges any client-side
+            # tool_use stop with a stub result so the loop can continue.
+            messages.append({"role": "assistant", "content": response.content})
+            tool_results = []
+            for block in response.content:
+                if block.type == "tool_use":
+                    tool_results.append({
+                        "type": "tool_result",
+                        "tool_use_id": block.id,
+                        "content": "Search completed"
+                    })
+            messages.append({"role": "user", "content": tool_results})
+        else:
+            assistant_message = ""
+            for block in response.content:
+                if hasattr(block, "text"):
+                    assistant_message += block.text
+            return assistant_message, list(set(sources))
+
+app = FastAPI()
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
+
+@app.get("/", response_class=FileResponse)
+async def index():
+    return FileResponse(str(Path.home() / "aaronai" / "static" / "index.html"))
+
+@app.get("/api/settings")
+async def get_settings():
+    return JSONResponse(load_settings())
+
+@app.post("/api/settings")
+async def update_settings(request: Request):
+    data = await request.json()
+    settings = load_settings()
+    settings.update(data)
+    save_settings(settings)
+    return JSONResponse(settings)
+@app.get("/api/conversations")
+async def list_conversations():
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    c.execute('''SELECT id, title, created_at, updated_at, message_count
+                 FROM conversations ORDER BY updated_at DESC LIMIT 100''')
+    rows = c.fetchall()
+    conn.close()
+    return JSONResponse([{
+        "id": r[0], "title": r[1], "created_at": r[2],
+        "updated_at": r[3], "message_count": r[4]
+    } for r in rows])
+
+@app.post("/api/conversations")
+async def new_conversation(request: Request):
+    data = await request.json()
+    title = data.get("title", "New conversation")
+    conv_id = create_conversation(title)
+    return JSONResponse({"id": conv_id, "title": title})
+
+@app.get("/api/conversations/{conv_id}/messages")
+async def get_messages(conv_id: str):
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    c.execute('''SELECT role, content, sources, timestamp FROM messages
+                 WHERE conversation_id = ? ORDER BY timestamp ASC''', (conv_id,))
+    rows = c.fetchall()
+    conn.close()
+    return JSONResponse([{
+        "role": r[0], "content": r[1],
+        "sources": json.loads(r[2]), "timestamp": r[3]
+    } for r in rows])
+
+@app.patch("/api/conversations/{conv_id}")
+async def rename_conversation(conv_id: str, request: Request):
+    data = await request.json()
+    title = data.get("title", "")
+    if not title:
+        return JSONResponse({"error": "Title required"}, status_code=400)
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    c.execute("UPDATE conversations SET title = ? WHERE id = ?", (title, conv_id))
+    conn.commit()
+    conn.close()
+    return JSONResponse({"id": conv_id, "title": title})
+
+@app.delete("/api/conversations/{conv_id}")
+async def delete_conversation(conv_id: str):
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    c.execute("DELETE FROM messages WHERE conversation_id = ?", (conv_id,))
+    c.execute("DELETE FROM conversations WHERE id = ?", (conv_id,))
+    conn.commit()
+    conn.close()
+    return JSONResponse({"deleted": conv_id})
+@app.post("/api/chat")
+async def chat_endpoint(request: Request):
+    data = await request.json()
+    user_message = data.get("message", "").strip()
+    conversation_id = data.get("conversation_id", "")
+    settings = load_settings()
+
+    if not user_message:
+        return JSONResponse({"error": "Empty message"}, status_code=400)
+
+    if not conversation_id:
+        conversation_id = create_conversation("New conversation")
+
+    stripped = user_message.strip().lower()
+
+    if stripped == "show memory":
+        return JSONResponse({"response": load_memory(), "sources": [], "conversation_id": conversation_id})
+
+    if stripped.startswith("remember:"):
+        item = user_message[9:].strip()
+        add_to_memory(item)
+        save_message(conversation_id, "user", user_message)
+        save_message(conversation_id, "assistant", f"Saved to memory: '{item}'")
+        return JSONResponse({"response": f"Saved to memory: '{item}'", "sources": [], "conversation_id": conversation_id})
+
+    if stripped.startswith("forget:"):
+        item = user_message[7:].strip()
+        removed = remove_from_memory(item)
+        msg = f"Removed {removed} line(s) containing '{item}'" if removed else f"Nothing found containing '{item}'"
+        save_message(conversation_id, "user", user_message)
+        save_message(conversation_id, "assistant", msg)
+        return JSONResponse({"response": msg, "sources": [], "conversation_id": conversation_id})
+
+    save_message(conversation_id, "user", user_message)
+
+    # Auto-title conversation from first message
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    c.execute("SELECT message_count, title FROM conversations WHERE id = ?", (conversation_id,))
+    row = c.fetchone()
+    conn.close()
+    if row and row[0] <= 1 and row[1] == "New conversation":
+        auto_title = user_message[:60] + ("..." if len(user_message) > 60 else "")
+        conn = sqlite3.connect(CONVERSATIONS_DB)
+        c = conn.cursor()
+        c.execute("UPDATE conversations SET title = ? WHERE id = ?", (auto_title, conversation_id))
+        conn.commit()
+        conn.close()
+
+    response, sources = chat(user_message, conversation_id, settings)
+    save_message(conversation_id, "assistant", response, sources if settings.get("show_sources") else [])
+
+    return JSONResponse({
+        "response": response,
+        "sources": sources if settings.get("show_sources") else [],
+        "conversation_id": conversation_id
+    })
+
+@app.get("/api/memory")
+async def get_memory():
+    return JSONResponse({"content": load_memory()})
+
+@app.post("/api/memory")
+async def update_memory(request: Request):
+    data = await request.json()
+    content = data.get("content", "")
+    save_memory(content)
+    return JSONResponse({"saved": True})
+
+@app.get("/api/status")
+async def get_status():
+    chunk_count = collection.count()
+
+    # Watcher status
+    watcher_running = False
+    last_indexed = "Unknown"
+    try:
+        result = subprocess.run(
+            ["systemctl", "is-active", "aaronai-watcher"],
+            capture_output=True, text=True
+        )
+        watcher_running = result.stdout.strip() == "active"
+    except Exception:
+        pass
+
+    try:
+        log_path = Path(WATCHER_LOG)
+        if log_path.exists():
+            lines = log_path.read_text().strip().split("\n")
+            for line in reversed(lines):
+                if "Ingestion complete" in line:
+                    last_indexed = line.split(" - ")[0].strip()
+                    break
+    except Exception:
+        pass
+
+    # File count from watcher state
+    file_count = 0
+    try:
+        state_path = Path(WATCHER_STATE)
+        if state_path.exists():
+            state = json.loads(state_path.read_text())
+            file_count = len(state)
+    except Exception:
+        pass
+
+    # Conversation count
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    c.execute("SELECT COUNT(*) FROM conversations")
+    conv_count = c.fetchone()[0]
+    conn.close()
+
+    return JSONResponse({
+        "aaron_ai": "running",
+        "watcher": "running" if watcher_running else "stopped",
+        "chunk_count": chunk_count,
+        "file_count": file_count,
+        "last_indexed": last_indexed,
+        "conversation_count": conv_count,
+        "model": "claude-sonnet-4-6",
+        "nextcloud_path": NEXTCLOUD_PATH
+    })
+
+@app.post("/api/reindex")
+async def trigger_reindex():
+    try:
+        subprocess.Popen([PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH])
+        return JSONResponse({"started": True, "message": "Re-indexing started in background"})
+    except Exception as e:
+        return JSONResponse({"started": False, "error": str(e)})
+
+@app.delete("/api/conversations")
+async def clear_all_conversations():
+    conn = sqlite3.connect(CONVERSATIONS_DB)
+    c = conn.cursor()
+    c.execute("DELETE FROM messages")
+    c.execute("DELETE FROM conversations")
+    conn.commit()
+    conn.close()
+    return JSONResponse({"cleared": True})
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
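Note on the relevance cutoff in retrieve_context(): because the collection is built with hnsw:space set to cosine, Chroma's distance is 1 minus cosine similarity, so "relevance > 0.3" is a cosine-similarity floor of 0.3. A standalone sketch of that arithmetic (not part of the commit; the two example strings are arbitrary):

    import numpy as np
    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    a, b = embedder.encode(["metal 3D printing workflows", "PolyJet and FDM print settings"])

    # Cosine similarity by hand; Chroma's cosine distance is 1 minus this value.
    sim = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    dist = 1 - sim
    relevance = 1 - dist  # recovers sim, mirroring retrieve_context()
    print(f"similarity={sim:.3f}  distance={dist:.3f}  keep={relevance > 0.3}")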
diff --git a/scripts/chat.py b/scripts/chat.py
new file mode 100644
index 0000000..453eedc
--- /dev/null
+++ b/scripts/chat.py
@@ -0,0 +1,250 @@
+import os
+import json
+from pathlib import Path
+from dotenv import load_dotenv
+import chromadb
+from sentence_transformers import SentenceTransformer
+import anthropic
+from datetime import datetime
+
+load_dotenv(Path.home() / "aaronai" / ".env")
+
+memory_path = Path.home() / "aaronai" / "memory.md"
+db_path = str(Path.home() / "aaronai" / "db")
+
+print("Loading Aaron AI...")
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+chroma_client = chromadb.PersistentClient(path=db_path)
+collection = chroma_client.get_or_create_collection(
+    name="aaronai",
+    metadata={"hnsw:space": "cosine"}
+)
+anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+
+SYSTEM_PROMPT = """You are Aaron Nelson's personal AI assistant. Aaron is an Associate Professor
+of Digital Design & Fabrication and Program Director of the Hudson Valley Additive Manufacturing
+Center (HVAMC) at SUNY New Paltz. He is an expert in computational design, additive manufacturing,
+and digital fabrication with deep fluency in Rhino, Grasshopper, Stratasys FDM, PolyJet, and metal
+3D printing workflows. He runs a commercial venture called Mossygear and a consulting operation
+called FWN3D. He has a background in graffiti lettering and vector illustration.
+
+You have been provided with relevant excerpts from Aaron's own documents and his persistent memory.
+Use this context to give answers grounded in his actual work and history. When helping him write
+or create, match his voice and draw on his existing materials. Be direct and specific -
+Aaron values precision over padding. Always cite which documents you drew from when relevant.
+
+You have access to web search. Use it automatically when:
+- Questions require current data (salaries, job postings, prices, news)
+- Questions reference specific institutions, people, or organizations you need to verify
+- Aaron's documents and memory don't contain sufficient information to answer well
+Do not announce that you are searching. Just search and incorporate results naturally."""
+
+CV_SOURCES = ["Aaron Nelson CV 2024.pdf"]
+conversation_history = []
+
+TOOLS = [
+    {
+        "type": "web_search_20250305",
+        "name": "web_search"
+    }
+]
+
+def load_memory():
+    if memory_path.exists():
+        return memory_path.read_text(encoding="utf-8")
+    return ""
+
+def save_memory(content):
+    memory_path.write_text(content, encoding="utf-8")
+
+def add_to_memory(new_item):
+    memory = load_memory()
+    timestamp = datetime.now().strftime("%Y-%m-%d")
+    note = f"\n- [{timestamp}] {new_item}"
+    if "## Notes" not in memory:
+        memory += "\n\n## Notes"
+    memory += note
+    save_memory(memory)
+
+def remove_from_memory(item):
+    memory = load_memory()
+    lines = memory.split("\n")
+    filtered = [l for l in lines if item.lower() not in l.lower()]
+    save_memory("\n".join(filtered))
+    return len(lines) - len(filtered)
+
+def get_pinned_cv_context():
+    results = collection.get(
+        where={"source": "Aaron Nelson CV 2024.pdf"},
+        include=["documents", "metadatas"]
+    )
+    return results["documents"], results["metadatas"]
+
+def is_professional_query(query):
+    keywords = [
+        "grant", "publication", "exhibition", "award", "fellowship",
+        "experience", "position", "job", "career", "cv", "resume",
+        "research", "work history", "accomplishment", "teaching",
+        "course", "client", "consultation", "presentation", "workshop",
+        "education", "degree", "institution", "service", "committee"
+    ]
+    return any(keyword in query.lower() for keyword in keywords)
+
+def retrieve_context(query, n_results=8):
+    query_embedding = embedder.encode([query]).tolist()
+    results = collection.query(
+        query_embeddings=query_embedding,
+        n_results=n_results,
+        include=["documents", "metadatas", "distances"]
+    )
+
+    context_pieces = []
+    sources = []
+
+    if is_professional_query(query):
+        cv_docs, cv_metas = get_pinned_cv_context()
+        for doc, meta in zip(cv_docs, cv_metas):
+            context_pieces.append(f"[CV] {doc}")
+            sources.append(meta["source"])
+
+    for doc, meta, dist in zip(
+        results["documents"][0],
+        results["metadatas"][0],
+        results["distances"][0]
+    ):
+        relevance = 1 - dist
+        if relevance > 0.3 and meta["source"] not in CV_SOURCES:
+            context_pieces.append(doc)
+            sources.append(meta["source"])
+
+    return context_pieces, sources
+def handle_command(user_input):
+    stripped = user_input.strip().lower()
+
+    if stripped == "show memory":
+        memory = load_memory()
+        print(f"\nAaron AI: Current memory:\n\n{memory}")
+        return True
+
+    if stripped.startswith("remember:"):
+        item = user_input[9:].strip()
+        add_to_memory(item)
+        print(f"\nAaron AI: Saved to memory: '{item}'")
+        return True
+
+    if stripped.startswith("forget:"):
+        item = user_input[7:].strip()
+        removed = remove_from_memory(item)
+        if removed:
+            print(f"\nAaron AI: Removed {removed} line(s) containing '{item}' from memory.")
+        else:
+            print(f"\nAaron AI: Nothing found in memory containing '{item}'.")
+        return True
+
+    if stripped == "clear":
+        conversation_history.clear()
+        print("\nAaron AI: Conversation history cleared.")
+        return True
+
+    return False
+
+def chat(user_message):
+    memory = load_memory()
+    context_pieces, sources = retrieve_context(user_message)
+
+    context_parts = []
+    if memory:
+        context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
+    if context_pieces:
+        context_str = "\n\n---\n\n".join(context_pieces)
+        unique_sources = list(set(sources))
+        context_parts.append(
+            f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
+        )
+
+    context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
+    full_message = context_block + user_message
+
+    # Build messages for this turn
+    messages = conversation_history + [{"role": "user", "content": full_message}]
+
+    # Agentic loop to handle tool use
+    while True:
+        response = anthropic_client.messages.create(
+            model="claude-sonnet-4-6",
+            max_tokens=2048,
+            system=SYSTEM_PROMPT,
+            tools=TOOLS,
+            messages=messages
+        )
+
+        # Check if we need to handle tool calls
+        if response.stop_reason == "tool_use":
+            # Add assistant response to messages
+            messages.append({"role": "assistant", "content": response.content})
+
+            # Process each tool use block
+            tool_results = []
+            for block in response.content:
+                if block.type == "tool_use":
+                    tool_results.append({
+                        "type": "tool_result",
+                        "tool_use_id": block.id,
+                        "content": "Search completed"
+                    })
+
+            # Add tool results and continue
+            messages.append({"role": "user", "content": tool_results})
+
+        else:
+            # Final response - extract text
+            assistant_message = ""
+            for block in response.content:
+                if hasattr(block, "text"):
+                    assistant_message += block.text
+
+            # Persist this turn; the user side keeps the injected context block
+            # so follow-up turns can refer back to it
+            conversation_history.append({"role": "user", "content": full_message})
+            conversation_history.append({"role": "assistant", "content": assistant_message})
+
+            # Keep at most 20 messages by dropping the oldest user/assistant pair
+            if len(conversation_history) > 20:
+                conversation_history.pop(0)
+                conversation_history.pop(0)
+
+            return assistant_message, sources
+def main():
+    print("Aaron AI ready. Corpus, memory, and web search loaded.")
+    print("Commands: 'remember: [fact]' | 'forget: [text]' | 'show memory' | 'clear' | 'quit'")
+    print("=" * 60)
+
+    while True:
+        try:
+            user_input = input("\nYou: ").strip()
+
+            if not user_input:
+                continue
+
+            if user_input.strip().lower() == "quit":
+                print("Goodbye.")
+                break
+
+            if handle_command(user_input):
+                continue
+
+            response, sources = chat(user_input)
+            print(f"\nAaron AI: {response}")
+
+            if sources:
+                unique = list(set(sources))
+                print(f"\n[Sources: {', '.join(unique)}]")
+
+        except KeyboardInterrupt:
+            print("\nGoodbye.")
+            break
+        except Exception as e:
+            print(f"Error: {e}")
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/ingest.py b/scripts/ingest.py
new file mode 100644
index 0000000..8972d55
--- /dev/null
+++ b/scripts/ingest.py
@@ -0,0 +1,141 @@
+import os
+import sys
+import hashlib
+from pathlib import Path
+from dotenv import load_dotenv
+import chromadb
+from sentence_transformers import SentenceTransformer
+from docx import Document
+from pypdf import PdfReader
+from pptx import Presentation
+
+load_dotenv(Path.home() / "aaronai" / ".env")
+
+print("Loading embedding model...")
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+
+db_path = str(Path.home() / "aaronai" / "db")
+client = chromadb.PersistentClient(path=db_path)
+collection = client.get_or_create_collection(
+    name="aaronai",
+    metadata={"hnsw:space": "cosine"}
+)
+
+def extract_text_from_docx(path):
+    doc = Document(path)
+    return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
+
+def extract_text_from_pdf(path):
+    reader = PdfReader(path)
+    text = ""
+    for page in reader.pages:
+        extracted = page.extract_text()
+        if extracted:
+            text += extracted + "\n"
+    return text
+
+def extract_text_from_pptx(path):
+    prs = Presentation(path)
+    text = ""
+    for slide in prs.slides:
+        for shape in slide.shapes:
+            if hasattr(shape, "text") and shape.text.strip():
+                text += shape.text + "\n"
+    return text
+
+def extract_text_from_txt(path):
+    with open(path, "r", encoding="utf-8", errors="ignore") as f:
+        return f.read()
+
+def chunk_text(text, chunk_size=500, overlap=50):
+    words = text.split()
+    chunks = []
+    start = 0
+    while start < len(words):
+        end = start + chunk_size
+        chunk = " ".join(words[start:end])
+        if chunk.strip():
+            chunks.append(chunk)
+        start += chunk_size - overlap
+    return chunks
+
+def make_id(filepath, chunk_index):
+    path_hash = hashlib.md5(str(filepath).encode()).hexdigest()[:8]
+    return f"{path_hash}_{chunk_index}"
+
+def ingest_file(filepath):
+    path = Path(filepath)
+    suffix = path.suffix.lower()
+
+    # Skip temp files
+    if path.name.startswith("~$") or path.name.startswith("."):
+        return 0
+
+    try:
+        if suffix == ".docx":
+            text = extract_text_from_docx(path)
+        elif suffix == ".pdf":
+            text = extract_text_from_pdf(path)
+        elif suffix == ".pptx":
+            text = extract_text_from_pptx(path)
+        elif suffix in [".txt", ".md"]:
+            text = extract_text_from_txt(path)
+        else:
+            return 0
+
+        if not text.strip():
+            return 0
+
+        chunks = chunk_text(text)
+        if not chunks:
+            return 0
+
+        embeddings = embedder.encode(chunks).tolist()
+        ids = [make_id(path, i) for i in range(len(chunks))]
+        metadatas = [{
+            "source": path.name,
+            "filepath": str(path),
+            "folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent))
+        } for _ in chunks]
+
+        collection.upsert(
+            documents=chunks,
+            embeddings=embeddings,
+            ids=ids,
+            metadatas=metadatas
+        )
+        print(f"  Indexed {len(chunks)} chunks: {path.name}")
+        return len(chunks)
+
+    except Exception as e:
+        print(f"  Error: {path.name}: {e}")
+        return 0
+def ingest_folder(folder_path):
+    folder = Path(folder_path)
+    if not folder.exists():
+        print(f"Folder not found: {folder_path}")
+        sys.exit(1)
+
+    supported = [".docx", ".pdf", ".pptx", ".txt", ".md"]
+    files = [f for f in folder.rglob("*")
+             if f.suffix.lower() in supported
+             and not f.name.startswith("~$")
+             and not f.name.startswith(".")]
+
+    if not files:
+        print("No supported files found.")
+        sys.exit(1)
+
+    print(f"Found {len(files)} files to process\n")
+    total_chunks = 0
+    for f in files:
+        total_chunks += ingest_file(f)
+
+    print(f"\nDone. Total chunks indexed: {total_chunks}")
+    print(f"Database stored at: {db_path}")
+
+if __name__ == "__main__":
+    target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs")
+    print(f"Ingesting from: {target}\n")
+    ingest_folder(target)
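For intuition about the sliding window in chunk_text(): with chunk_size=500 and overlap=50, the start index advances 450 words per step, so consecutive chunks share their last and first 50 words. A standalone demo with small numbers (not part of the commit; same logic as above, reproduced so it runs on its own):

    def chunk_text(text, chunk_size=500, overlap=50):
        # Identical to scripts/ingest.py's version.
        words = text.split()
        chunks = []
        start = 0
        while start < len(words):
            chunk = " ".join(words[start:start + chunk_size])
            if chunk.strip():
                chunks.append(chunk)
            start += chunk_size - overlap
        return chunks

    demo = " ".join(str(i) for i in range(12))
    for c in chunk_text(demo, chunk_size=5, overlap=2):
        print(c)
    # 0 1 2 3 4
    # 3 4 5 6 7
    # 6 7 8 9 10
    # 9 10 11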
diff --git a/scripts/ingest_chatgpt.py b/scripts/ingest_chatgpt.py
new file mode 100644
index 0000000..4ace01b
--- /dev/null
+++ b/scripts/ingest_chatgpt.py
@@ -0,0 +1,150 @@
+import json
+import sys
+from pathlib import Path
+from datetime import datetime
+from sentence_transformers import SentenceTransformer
+import chromadb
+
+# Paths
+db_path = str(Path.home() / "aaronai" / "db")
+EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"
+
+print("Loading embedding model...")
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+client = chromadb.PersistentClient(path=db_path)
+collection = client.get_or_create_collection(
+    name="aaronai",
+    metadata={"hnsw:space": "cosine"}
+)
+
+def extract_messages(convo):
+    """Extract ordered user/assistant messages from a conversation."""
+    mapping = convo.get("mapping", {})
+    messages = []
+
+    for node in mapping.values():
+        msg = node.get("message")
+        if not msg:
+            continue
+
+        role = msg.get("author", {}).get("role")
+        if role not in ["user", "assistant"]:
+            continue
+
+        content = msg.get("content", {})
+        parts = content.get("parts", [])
+
+        # Extract text parts only
+        text = ""
+        for part in parts:
+            if isinstance(part, str):
+                text += part
+            elif isinstance(part, dict) and part.get("content_type") == "text":
+                text += part.get("text", "")
+
+        text = text.strip()
+        if not text:
+            continue
+
+        create_time = msg.get("create_time") or 0
+        messages.append((create_time, role, text))
+
+    # Sort by timestamp
+    messages.sort(key=lambda x: x[0])
+    return messages
+
+def chunk_conversation(title, messages, chunk_size=600, overlap=100):
+    """Convert a conversation into overlapping text chunks."""
+    # Build full conversation text
+    lines = [f"[Conversation: {title}]", ""]
+    for _, role, text in messages:
+        label = "Aaron" if role == "user" else "ChatGPT"
+        lines.append(f"{label}: {text}")
+        lines.append("")
+
+    full_text = "\n".join(lines)
+
+    # Split into word-level chunks with overlap
+    words = full_text.split()
+    chunks = []
+    start = 0
+    while start < len(words):
+        end = start + chunk_size
+        chunk = " ".join(words[start:end])
+        if chunk.strip():
+            chunks.append(chunk)
+        start += chunk_size - overlap
+
+    return chunks
+
+def ingest_file(json_path):
+    print(f"\nLoading {json_path.name}...")
+    with open(json_path, encoding="utf-8") as f:
+        data = json.load(f)
+    print(f"Found {len(data)} conversations")
+
+    total_chunks = 0
+    skipped = 0
+
+    for i, convo in enumerate(data):
+        title = convo.get("title", "Untitled")
+        convo_id = convo.get("id", f"convo_{i}")
+        create_time = convo.get("create_time", 0)
+
+        try:
+            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
+        except (OSError, OverflowError, TypeError, ValueError):
+            date_str = "unknown"
+
+        messages = extract_messages(convo)
+
+        if len(messages) < 2:
+            skipped += 1
+            continue
+
+        chunks = chunk_conversation(title, messages)
+        if not chunks:
+            skipped += 1
+            continue
+
+        # Embed and store
+        embeddings = embedder.encode(chunks).tolist()
+        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
+        metadatas = [{
+            "source": f"ChatGPT: {title}",
+            "filepath": str(json_path),
+            "date": date_str,
+            "type": "chatgpt_conversation"
+        } for _ in chunks]
+
+        collection.upsert(
+            documents=chunks,
+            embeddings=embeddings,
+            ids=ids,
+            metadatas=metadatas
+        )
+
+        total_chunks += len(chunks)
+        print(f"  [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")
+
+    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
+    return total_chunks
+
+def main():
+    export_dir = Path(EXPORT_DIR)
+    files = [
+        export_dir / "conversations-000.json",
+        export_dir / "conversations-001.json"
+    ]
+
+    grand_total = 0
+    for f in files:
+        if f.exists():
+            grand_total += ingest_file(f)
+        else:
+            print(f"Not found: {f}")
+
+    print(f"\nTotal chunks added to corpus: {grand_total}")
+    print(f"Database at: {db_path}")
+
+if __name__ == "__main__":
+    main()
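extract_messages() assumes the node-graph shape of a ChatGPT conversations.json export: a "mapping" of node id to an object holding a "message" with author.role, content.parts, and create_time. A sketch with a hypothetical two-turn conversation (field values invented for illustration; runnable in a REPL once extract_messages is defined as above):

    convo = {
        "title": "Print settings",
        "mapping": {
            "n0": {"message": None},  # root/system nodes carry no message and are skipped
            "n1": {"message": {
                "author": {"role": "user"},
                "content": {"parts": ["Best layer height for PolyJet?"]},
                "create_time": 1700000000,
            }},
            "n2": {"message": {
                "author": {"role": "assistant"},
                "content": {"parts": [{"content_type": "text",
                                       "text": "PolyJet resolution is fixed by the printer..."}]},
                "create_time": 1700000060,
            }},
        },
    }

    # Prints the two turns in timestamp order: string parts and
    # {"content_type": "text"} dict parts are both collected.
    for create_time, role, text in extract_messages(convo):
        print(create_time, role, text)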
diff --git a/scripts/watcher.py b/scripts/watcher.py
new file mode 100644
index 0000000..cece5cd
--- /dev/null
+++ b/scripts/watcher.py
@@ -0,0 +1,122 @@
+import time
+import subprocess
+import logging
+import json
+from pathlib import Path
+from watchdog.observers import Observer
+from watchdog.events import FileSystemEventHandler
+
+NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
+INGEST_SCRIPT = "/home/aaron/aaronai/scripts/ingest.py"
+PYTHON = "/home/aaron/aaronai/venv/bin/python3"
+LOG_FILE = "/home/aaron/aaronai/watcher.log"
+STATE_FILE = "/home/aaron/aaronai/watcher_state.json"
+
+SUPPORTED = {'.pdf', '.docx', '.pptx', '.txt', '.md'}
+DEBOUNCE_SECONDS = 120
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(message)s',
+    handlers=[
+        logging.FileHandler(LOG_FILE),
+        logging.StreamHandler()
+    ]
+)
+
+def load_state():
+    if Path(STATE_FILE).exists():
+        with open(STATE_FILE) as f:
+            return json.load(f)
+    return {}
+
+def save_state(state):
+    with open(STATE_FILE, 'w') as f:
+        json.dump(state, f)
+
+def get_changed_files():
+    state = load_state()
+    changed = []
+    root = Path(NEXTCLOUD_PATH)
+    for path in root.rglob("*"):
+        if path.is_dir():
+            continue
+        if path.suffix.lower() not in SUPPORTED:
+            continue
+        if path.name.startswith('.') or path.name.startswith('~$'):
+            continue
+        mtime = str(path.stat().st_mtime)
+        key = str(path)
+        if state.get(key) != mtime:
+            changed.append(path)
+    return changed, state
+
+def run_ingestion():
+    changed, state = get_changed_files()
+    if not changed:
+        logging.info("No new or changed files detected — skipping ingestion.")
+        return
+
+    logging.info(f"Found {len(changed)} new or changed files — starting ingestion...")
+    try:
+        result = subprocess.run(
+            [PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH],
+            capture_output=True,
+            text=True,
+            timeout=600
+        )
+        if result.returncode == 0:
+            # Update state with new mtimes
+            root = Path(NEXTCLOUD_PATH)
+            for path in root.rglob("*"):
+                if path.is_file() and path.suffix.lower() in SUPPORTED:
+                    state[str(path)] = str(path.stat().st_mtime)
+            save_state(state)
+            logging.info("Ingestion complete. State updated.")
+        else:
+            logging.error(f"Ingestion error: {result.stderr}")
+    except subprocess.TimeoutExpired:
+        logging.error("Ingestion timed out.")
+    except Exception as e:
+        logging.error(f"Ingestion failed: {e}")
+
+class IngestHandler(FileSystemEventHandler):
+    def __init__(self):
+        self.pending = False
+        self.last_event = 0
+
+    def on_any_event(self, event):
+        if event.is_directory:
+            return
+        path = Path(event.src_path)
+        if path.suffix.lower() not in SUPPORTED:
+            return
+        if path.name.startswith('.') or path.name.startswith('~$'):
+            return
+        self.pending = True
+        self.last_event = time.time()
+
+def main():
+    logging.info("Aaron AI Watcher starting...")
+    logging.info(f"Watching: {NEXTCLOUD_PATH}")
+
+    handler = IngestHandler()
+    observer = Observer()
+    observer.schedule(handler, NEXTCLOUD_PATH, recursive=True)
+    observer.start()
+
+    try:
+        while True:
+            if handler.pending:
+                elapsed = time.time() - handler.last_event
+                if elapsed >= DEBOUNCE_SECONDS:
+                    handler.pending = False
+                    run_ingestion()
+            time.sleep(5)
+    except KeyboardInterrupt:
+        observer.stop()
+        observer.join()
+        logging.info("Watcher stopped.")
+
+if __name__ == "__main__":
+    main()
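The watcher's change detection is just an mtime-keyed dict, the same shape save_state() writes to watcher_state.json. A standalone sketch of that contract on a temp directory (not part of the commit; pure stdlib):

    import tempfile
    import time
    from pathlib import Path

    state = {}  # mirrors watcher_state.json: {str(path): str(mtime)}
    root = Path(tempfile.mkdtemp())
    doc = root / "note.md"
    doc.write_text("first draft")

    def changed_files(root, state):
        # Same test get_changed_files() applies per file.
        return [p for p in root.rglob("*")
                if p.is_file()
                and p.suffix.lower() in {".pdf", ".docx", ".pptx", ".txt", ".md"}
                and state.get(str(p)) != str(p.stat().st_mtime)]

    print(changed_files(root, state))            # [.../note.md]: unseen file
    state[str(doc)] = str(doc.stat().st_mtime)   # what a successful ingestion records
    print(changed_files(root, state))            # []: mtime matches, nothing to do
    time.sleep(1.1)                              # generous pause for coarse-mtime filesystems
    doc.write_text("second draft")
    print(changed_files(root, state))            # [.../note.md] again: mtime moved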
diff --git a/static/index.html b/static/index.html
new file mode 100644
index 0000000..4d5dd55
--- /dev/null
+++ b/static/index.html
@@ -0,0 +1,518 @@
[518-line single-page chat UI; the markup did not survive this capture. Recoverable strings: page title "Aaron AI"; header "Aaron AI — personal knowledge assistant"; empty-state heading "What are you working on?" with the hint "Ask about your documents, projects, research, or anything else. Your entire corpus is available."; a "Settings" panel; and a "Loading..." status placeholder.]
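To exercise the whole stack once api.py is serving, a minimal smoke test (not part of the commit; assumes the server on localhost:8000 per uvicorn.run above and that the requests package is available):

    import requests

    BASE = "http://localhost:8000"

    # Status snapshot: chunk/file/conversation counts and watcher state.
    print(requests.get(f"{BASE}/api/status").json())

    # Start a chat; omitting conversation_id makes the server create one.
    r = requests.post(f"{BASE}/api/chat",
                      json={"message": "What grants have I won since 2020?"})
    reply = r.json()
    print(reply["response"])
    print(reply["sources"])  # populated when show_sources is enabled

    # Memory commands go through the same endpoint.
    requests.post(f"{BASE}/api/chat", json={
        "message": "remember: HVAMC open house is the first Friday in June",
        "conversation_id": reply["conversation_id"],
    })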