Initial commit - Aaron AI v1
@@ -0,0 +1,490 @@
import os
import json
import sqlite3
import subprocess
import hashlib
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
import chromadb
from sentence_transformers import SentenceTransformer
import anthropic
from fastapi import FastAPI, Request
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
import uvicorn

load_dotenv(Path.home() / "aaronai" / ".env")

MEMORY_PATH = Path.home() / "aaronai" / "memory.md"
DB_PATH = str(Path.home() / "aaronai" / "db")
CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
SETTINGS_PATH = Path.home() / "aaronai" / "settings.json"
WATCHER_LOG = str(Path.home() / "aaronai" / "watcher.log")
WATCHER_STATE = str(Path.home() / "aaronai" / "watcher_state.json")
NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
INGEST_SCRIPT = str(Path.home() / "aaronai" / "scripts" / "ingest.py")
PYTHON = str(Path.home() / "aaronai" / "venv" / "bin" / "python3")

DEFAULT_SETTINGS = {
    "theme": "light",
    "font_size": "medium",
    "web_search": True,
    "show_sources": True,
}
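# load_settings() merges saved values over these defaults, so keys added here
# pick up a sane fallback automatically.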

print("Loading Aaron AI...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
chroma_client = chromadb.PersistentClient(path=DB_PATH)
collection = chroma_client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"}
)
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
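# Retrieval stack: all-MiniLM-L6-v2 sentence embeddings stored in a persistent
# ChromaDB collection scored by cosine distance; generation goes through the
# Anthropic SDK.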

SYSTEM_PROMPT = """You are the personal AI assistant of Aaron Nelson — computational
designer, fabrication researcher, program builder, and creative
practitioner based in the Hudson Valley.

Aaron's work sits at the intersection of computational geometry,
additive manufacturing, and physical making. He resolves complex
systems into physical reality — from Grasshopper definitions to
large-scale steel structures, from archival photographs to 3D-printed
architectural restorations, from product concepts to manufactured
goods. This throughline — computation resolving into physical form —
defines his practice across academic, consulting, and creative contexts.

He built the Hudson Valley Additive Manufacturing Center (HVAMC) from
nothing and has directed it since 2016, alongside the DDF academic
program at SUNY New Paltz. He has the skills of a founder — equipment
selection, policy creation, client development, grant writing,
curriculum design — operating within an academic structure. He
consults with IBM, Braskem, Selux and others through FWN3D. He runs
Mossygear as a product business. He makes large-scale fabricated art.

His communication style is direct, precise, and intolerant of padding
or overclaiming. He flags inaccuracies immediately and expects the
same standard from you. When helping him write, match his voice —
economical, specific, never performative. When answering questions,
cite sources and acknowledge uncertainty rather than filling gaps with
plausible-sounding content.

You have access to his complete document corpus, conversation history,
and a persistent memory file that carries his current context. Treat
the memory file as ground truth for his present situation. Use web
search automatically when current information is needed. Never
re-brief on context that's already in memory or documents."""

CV_SOURCES = ["Aaron Nelson CV 2024.pdf", "Aaron Nelson CV 2025.pdf", "Aaron Nelson - CV.docx"]

def init_conversations_db():
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS conversations (
        id TEXT PRIMARY KEY,
        title TEXT NOT NULL,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL,
        model TEXT DEFAULT 'claude-sonnet-4-6',
        message_count INTEGER DEFAULT 0
    )''')
    c.execute('''CREATE TABLE IF NOT EXISTS messages (
        id TEXT PRIMARY KEY,
        conversation_id TEXT NOT NULL,
        role TEXT NOT NULL,
        content TEXT NOT NULL,
        sources TEXT DEFAULT '[]',
        timestamp TEXT NOT NULL,
        FOREIGN KEY (conversation_id) REFERENCES conversations(id)
    )''')
    conn.commit()
    conn.close()

init_conversations_db()

def load_settings():
    if SETTINGS_PATH.exists():
        try:
            s = json.loads(SETTINGS_PATH.read_text())
            return {**DEFAULT_SETTINGS, **s}
        except Exception:
            pass
    return DEFAULT_SETTINGS.copy()

def save_settings(settings):
    SETTINGS_PATH.write_text(json.dumps(settings, indent=2))

def load_memory():
    if MEMORY_PATH.exists():
        return MEMORY_PATH.read_text(encoding="utf-8")
    return ""

def save_memory(content):
    MEMORY_PATH.write_text(content, encoding="utf-8")

def add_to_memory(item):
    memory = load_memory()
    timestamp = datetime.now().strftime("%Y-%m-%d")
    note = f"\n- [{timestamp}] {item}"
    if "## Notes" not in memory:
        memory += "\n\n## Notes"
    memory += note
    save_memory(memory)

def remove_from_memory(item):
    memory = load_memory()
    lines = memory.split("\n")
    filtered = [l for l in lines if item.lower() not in l.lower()]
    save_memory("\n".join(filtered))
    return len(lines) - len(filtered)

def get_pinned_cv_context():
    try:
        results = collection.get(
            where={"source": {"$in": CV_SOURCES}},
            include=["documents", "metadatas"]
        )
        return results["documents"], results["metadatas"]
    except Exception:
        return [], []

def is_professional_query(query):
    keywords = ["grant", "publication", "exhibition", "award", "fellowship",
                "experience", "position", "job", "career", "cv", "resume",
                "research", "work history", "accomplishment", "teaching",
                "course", "client", "consultation", "presentation", "workshop",
                "education", "degree", "institution", "service", "committee"]
    return any(k in query.lower() for k in keywords)
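
# retrieve_context() pins CV chunks for professional-sounding queries, then adds
# semantic matches whose relevance (1 - cosine distance) clears 0.3, skipping the
# CV files that were already pinned.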
def retrieve_context(query, n_results=8):
    query_embedding = embedder.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )
    context_pieces = []
    sources = []
    if is_professional_query(query):
        cv_docs, cv_metas = get_pinned_cv_context()
        for doc, meta in zip(cv_docs, cv_metas):
            context_pieces.append(f"[CV] {doc}")
            sources.append(meta.get("source", "CV"))
    for doc, meta, dist in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0]
    ):
        relevance = 1 - dist
        if relevance > 0.3 and meta.get("source") not in CV_SOURCES:
            context_pieces.append(doc)
            sources.append(meta.get("source", "unknown"))
    return context_pieces, sources

def get_conversation_history(conversation_id, limit=20):
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    c.execute('''SELECT role, content FROM messages
                 WHERE conversation_id = ?
                 ORDER BY timestamp DESC LIMIT ?''', (conversation_id, limit))
    rows = c.fetchall()
    conn.close()
    return [{"role": r[0], "content": r[1]} for r in reversed(rows)]

def save_message(conversation_id, role, content, sources=None):
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    msg_id = hashlib.md5(f"{conversation_id}{role}{datetime.now().isoformat()}".encode()).hexdigest()
    timestamp = datetime.now().isoformat()
    c.execute('''INSERT INTO messages (id, conversation_id, role, content, sources, timestamp)
                 VALUES (?, ?, ?, ?, ?, ?)''',
              (msg_id, conversation_id, role, content,
               json.dumps(sources or []), timestamp))
    c.execute('''UPDATE conversations SET updated_at = ?, message_count = message_count + 1
                 WHERE id = ?''', (timestamp, conversation_id))
    conn.commit()
    conn.close()

def create_conversation(title="New conversation"):
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    conv_id = hashlib.md5(f"{datetime.now().isoformat()}".encode()).hexdigest()[:16]
    now = datetime.now().isoformat()
    c.execute('''INSERT INTO conversations (id, title, created_at, updated_at)
                 VALUES (?, ?, ?, ?)''', (conv_id, title, now, now))
    conn.commit()
    conn.close()
    return conv_id

def chat(user_message, conversation_id, settings):
    memory = load_memory()
    context_pieces, sources = retrieve_context(user_message)
    history = get_conversation_history(conversation_id)

    context_parts = []
    if memory:
        context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
    if context_pieces:
        context_str = "\n\n---\n\n".join(context_pieces)
        unique_sources = list(set(sources))
        context_parts.append(
            f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
        )
    context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
    full_message = context_block + user_message

    messages = history + [{"role": "user", "content": full_message}]

    tools = [{"type": "web_search_20250305", "name": "web_search"}] if settings.get("web_search", True) else []

    while True:
        kwargs = {
            "model": "claude-sonnet-4-6",
            "max_tokens": 2048,
            "system": SYSTEM_PROMPT,
            "messages": messages
        }
        if tools:
            kwargs["tools"] = tools

        response = anthropic_client.messages.create(**kwargs)

        if response.stop_reason == "tool_use":
            messages.append({"role": "assistant", "content": response.content})
            tool_results = []
            for block in response.content:
                if block.type == "tool_use":
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": "Search completed"
                    })
            messages.append({"role": "user", "content": tool_results})
        else:
            assistant_message = ""
            for block in response.content:
                if hasattr(block, "text"):
                    assistant_message += block.text
            return assistant_message, list(set(sources))

app = FastAPI()
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
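# CORS is left wide open (allow_origins=["*"]), which suits a single-user
# deployment on a private network.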

@app.get("/", response_class=FileResponse)
async def index():
    return FileResponse("/home/aaron/aaronai/static/index.html")

@app.get("/api/settings")
async def get_settings():
    return JSONResponse(load_settings())

@app.post("/api/settings")
async def update_settings(request: Request):
    data = await request.json()
    settings = load_settings()
    settings.update(data)
    save_settings(settings)
    return JSONResponse(settings)

@app.get("/api/conversations")
async def list_conversations():
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    c.execute('''SELECT id, title, created_at, updated_at, message_count
                 FROM conversations ORDER BY updated_at DESC LIMIT 100''')
    rows = c.fetchall()
    conn.close()
    return JSONResponse([{
        "id": r[0], "title": r[1], "created_at": r[2],
        "updated_at": r[3], "message_count": r[4]
    } for r in rows])

@app.post("/api/conversations")
async def new_conversation(request: Request):
    data = await request.json()
    title = data.get("title", "New conversation")
    conv_id = create_conversation(title)
    return JSONResponse({"id": conv_id, "title": title})

@app.get("/api/conversations/{conv_id}/messages")
async def get_messages(conv_id: str):
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    c.execute('''SELECT role, content, sources, timestamp FROM messages
                 WHERE conversation_id = ? ORDER BY timestamp ASC''', (conv_id,))
    rows = c.fetchall()
    conn.close()
    return JSONResponse([{
        "role": r[0], "content": r[1],
        "sources": json.loads(r[2]), "timestamp": r[3]
    } for r in rows])

@app.patch("/api/conversations/{conv_id}")
async def rename_conversation(conv_id: str, request: Request):
    data = await request.json()
    title = data.get("title", "")
    if not title:
        return JSONResponse({"error": "Title required"}, status_code=400)
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    c.execute("UPDATE conversations SET title = ? WHERE id = ?", (title, conv_id))
    conn.commit()
    conn.close()
    return JSONResponse({"id": conv_id, "title": title})

@app.delete("/api/conversations/{conv_id}")
async def delete_conversation(conv_id: str):
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    c.execute("DELETE FROM messages WHERE conversation_id = ?", (conv_id,))
    c.execute("DELETE FROM conversations WHERE id = ?", (conv_id,))
    conn.commit()
    conn.close()
    return JSONResponse({"deleted": conv_id})

@app.post("/api/chat")
async def chat_endpoint(request: Request):
    data = await request.json()
    user_message = data.get("message", "").strip()
    conversation_id = data.get("conversation_id", "")
    settings = load_settings()

    if not user_message:
        return JSONResponse({"error": "Empty message"})

    if not conversation_id:
        conversation_id = create_conversation("New conversation")

    stripped = user_message.strip().lower()

    if stripped == "show memory":
        return JSONResponse({"response": load_memory(), "sources": [], "conversation_id": conversation_id})

    if stripped.startswith("remember:"):
        item = user_message[9:].strip()
        add_to_memory(item)
        save_message(conversation_id, "user", user_message)
        save_message(conversation_id, "assistant", f"Saved to memory: '{item}'")
        return JSONResponse({"response": f"Saved to memory: '{item}'", "sources": [], "conversation_id": conversation_id})

    if stripped.startswith("forget:"):
        item = user_message[7:].strip()
        removed = remove_from_memory(item)
        msg = f"Removed {removed} line(s) containing '{item}'" if removed else f"Nothing found containing '{item}'"
        save_message(conversation_id, "user", user_message)
        save_message(conversation_id, "assistant", msg)
        return JSONResponse({"response": msg, "sources": [], "conversation_id": conversation_id})

    save_message(conversation_id, "user", user_message)

    # Auto-title conversation from first message
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    c.execute("SELECT message_count, title FROM conversations WHERE id = ?", (conversation_id,))
    row = c.fetchone()
    conn.close()
    if row and row[0] <= 1 and row[1] == "New conversation":
        auto_title = user_message[:60] + ("..." if len(user_message) > 60 else "")
        conn = sqlite3.connect(CONVERSATIONS_DB)
        c = conn.cursor()
        c.execute("UPDATE conversations SET title = ? WHERE id = ?", (auto_title, conversation_id))
        conn.commit()
        conn.close()

    response, sources = chat(user_message, conversation_id, settings)
    save_message(conversation_id, "assistant", response, sources if settings.get("show_sources") else [])

    return JSONResponse({
        "response": response,
        "sources": sources if settings.get("show_sources") else [],
        "conversation_id": conversation_id
    })

@app.get("/api/memory")
async def get_memory():
    return JSONResponse({"content": load_memory()})

@app.post("/api/memory")
async def update_memory(request: Request):
    data = await request.json()
    content = data.get("content", "")
    save_memory(content)
    return JSONResponse({"saved": True})

@app.get("/api/status")
async def get_status():
    chunk_count = collection.count()

    # Watcher status
    watcher_running = False
    last_indexed = "Unknown"
    try:
        result = subprocess.run(
            ["systemctl", "is-active", "aaronai-watcher"],
            capture_output=True, text=True
        )
        watcher_running = result.stdout.strip() == "active"
    except Exception:
        pass

    try:
        log_path = Path(WATCHER_LOG)
        if log_path.exists():
            lines = log_path.read_text().strip().split("\n")
            for line in reversed(lines):
                if "Ingestion complete" in line:
                    last_indexed = line.split(" - ")[0].strip()
                    break
    except Exception:
        pass

    # File count from watcher state
    file_count = 0
    try:
        state_path = Path(WATCHER_STATE)
        if state_path.exists():
            state = json.loads(state_path.read_text())
            file_count = len(state)
    except Exception:
        pass

    # Conversation count
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    c.execute("SELECT COUNT(*) FROM conversations")
    conv_count = c.fetchone()[0]
    conn.close()

    return JSONResponse({
        "aaron_ai": "running",
        "watcher": "running" if watcher_running else "stopped",
        "chunk_count": chunk_count,
        "file_count": file_count,
        "last_indexed": last_indexed,
        "conversation_count": conv_count,
        "model": "claude-sonnet-4-6",
        "nextcloud_path": NEXTCLOUD_PATH
    })

@app.post("/api/reindex")
async def trigger_reindex():
    try:
        subprocess.Popen([PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH])
        return JSONResponse({"started": True, "message": "Re-indexing started in background"})
    except Exception as e:
        return JSONResponse({"started": False, "error": str(e)})

@app.delete("/api/conversations")
async def clear_all_conversations():
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
    c.execute("DELETE FROM messages")
    c.execute("DELETE FROM conversations")
    conn.commit()
    conn.close()
    return JSONResponse({"cleared": True})


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
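
A minimal sketch of how a client could exercise the /api/chat endpoint above, assuming the server is running locally on port 8000 (the requests dependency, host, and example payload are assumptions for illustration):

    import requests  # assumed client-side dependency

    # Send one message; an empty conversation_id makes the server create a new conversation.
    reply = requests.post(
        "http://localhost:8000/api/chat",
        json={"message": "show memory", "conversation_id": ""},
        timeout=120,
    ).json()
    print(reply["response"], reply["sources"], reply["conversation_id"])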
@@ -0,0 +1,250 @@
import os
import json
from pathlib import Path
from dotenv import load_dotenv
import chromadb
from sentence_transformers import SentenceTransformer
import anthropic
from datetime import datetime

load_dotenv(Path.home() / "aaronai" / ".env")

memory_path = Path.home() / "aaronai" / "memory.md"
db_path = str(Path.home() / "aaronai" / "db")

print("Loading Aaron AI...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
chroma_client = chromadb.PersistentClient(path=db_path)
collection = chroma_client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"}
)
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

SYSTEM_PROMPT = """You are Aaron Nelson's personal AI assistant. Aaron is an Associate Professor
of Digital Design & Fabrication and Program Director of the Hudson Valley Additive Manufacturing
Center (HVAMC) at SUNY New Paltz. He is an expert in computational design, additive manufacturing,
and digital fabrication with deep fluency in Rhino, Grasshopper, Stratasys FDM, PolyJet, and metal
3D printing workflows. He runs a commercial venture called Mossygear and a consulting operation
called FWN3D. He has a background in graffiti lettering and vector illustration.

You have been provided with relevant excerpts from Aaron's own documents and his persistent memory.
Use this context to give answers grounded in his actual work and history. When helping him write
or create, match his voice and draw on his existing materials. Be direct and specific -
Aaron values precision over padding. Always cite which documents you drew from when relevant.

You have access to web search. Use it automatically when:
- Questions require current data (salaries, job postings, prices, news)
- Questions reference specific institutions, people, or organizations you need to verify
- Aaron's documents and memory don't contain sufficient information to answer well
Do not announce that you are searching. Just search and incorporate results naturally."""

CV_SOURCES = ["Aaron Nelson CV 2024.pdf"]
conversation_history = []

TOOLS = [
    {
        "type": "web_search_20250305",
        "name": "web_search"
    }
]
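# web_search_20250305 is the Anthropic-hosted web search tool type; listing it in
# `tools` lets the model search the web during a turn.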

def load_memory():
    if memory_path.exists():
        return memory_path.read_text(encoding="utf-8")
    return ""

def save_memory(content):
    memory_path.write_text(content, encoding="utf-8")

def add_to_memory(new_item):
    memory = load_memory()
    timestamp = datetime.now().strftime("%Y-%m-%d")
    note = f"\n- [{timestamp}] {new_item}"
    if "## Notes" not in memory:
        memory += "\n\n## Notes"
    memory += note
    save_memory(memory)

def remove_from_memory(item):
    memory = load_memory()
    lines = memory.split("\n")
    filtered = [l for l in lines if item.lower() not in l.lower()]
    save_memory("\n".join(filtered))
    return len(lines) - len(filtered)

def get_pinned_cv_context():
    results = collection.get(
        where={"source": "Aaron Nelson CV 2024.pdf"},
        include=["documents", "metadatas"]
    )
    return results["documents"], results["metadatas"]

def is_professional_query(query):
    keywords = [
        "grant", "publication", "exhibition", "award", "fellowship",
        "experience", "position", "job", "career", "cv", "resume",
        "research", "work history", "accomplishment", "teaching",
        "course", "client", "consultation", "presentation", "workshop",
        "education", "degree", "institution", "service", "committee"
    ]
    return any(keyword in query.lower() for keyword in keywords)

def retrieve_context(query, n_results=8):
    query_embedding = embedder.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )

    context_pieces = []
    sources = []

    if is_professional_query(query):
        cv_docs, cv_metas = get_pinned_cv_context()
        for doc, meta in zip(cv_docs, cv_metas):
            context_pieces.append(f"[CV] {doc}")
            sources.append(meta["source"])

    for doc, meta, dist in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0]
    ):
        relevance = 1 - dist
        if relevance > 0.3 and meta["source"] not in CV_SOURCES:
            context_pieces.append(doc)
            sources.append(meta["source"])

    return context_pieces, sources

def handle_command(user_input):
    stripped = user_input.strip().lower()

    if stripped == "show memory":
        memory = load_memory()
        print(f"\nAaron AI: Current memory:\n\n{memory}")
        return True

    if stripped.startswith("remember:"):
        item = user_input[9:].strip()
        add_to_memory(item)
        print(f"\nAaron AI: Saved to memory: '{item}'")
        return True

    if stripped.startswith("forget:"):
        item = user_input[7:].strip()
        removed = remove_from_memory(item)
        if removed:
            print(f"\nAaron AI: Removed {removed} line(s) containing '{item}' from memory.")
        else:
            print(f"\nAaron AI: Nothing found in memory containing '{item}'.")
        return True

    if stripped == "clear":
        conversation_history.clear()
        print("\nAaron AI: Conversation history cleared.")
        return True

    return False

def chat(user_message):
    memory = load_memory()
    context_pieces, sources = retrieve_context(user_message)

    context_parts = []
    if memory:
        context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
    if context_pieces:
        context_str = "\n\n---\n\n".join(context_pieces)
        unique_sources = list(set(sources))
        context_parts.append(
            f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
        )

    context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
    full_message = context_block + user_message

    # Build messages for this turn
    messages = conversation_history + [{"role": "user", "content": full_message}]

    # Agentic loop to handle tool use
    while True:
        response = anthropic_client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            system=SYSTEM_PROMPT,
            tools=TOOLS,
            messages=messages
        )

        # Check if we need to handle tool calls
        if response.stop_reason == "tool_use":
            # Add assistant response to messages
            messages.append({"role": "assistant", "content": response.content})

            # Process each tool use block
            tool_results = []
            for block in response.content:
                if block.type == "tool_use":
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": "Search completed"
                    })

            # Add tool results and continue
            messages.append({"role": "user", "content": tool_results})

        else:
            # Final response - extract text
            assistant_message = ""
            for block in response.content:
                if hasattr(block, "text"):
                    assistant_message += block.text

            # Update conversation history with clean versions
            conversation_history.append({"role": "user", "content": full_message})
            conversation_history.append({"role": "assistant", "content": assistant_message})

            if len(conversation_history) > 20:
                conversation_history.pop(0)
                conversation_history.pop(0)

            return assistant_message, sources

def main():
    print("Aaron AI ready. Corpus, memory, and web search loaded.")
    print("Commands: 'remember: [fact]' | 'forget: [text]' | 'show memory' | 'clear' | 'quit'")
    print("=" * 60)

    while True:
        try:
            user_input = input("\nYou: ").strip()

            if not user_input:
                continue

            if user_input.strip().lower() == "quit":
                print("Goodbye.")
                break

            if handle_command(user_input):
                continue

            response, sources = chat(user_input)
            print(f"\nAaron AI: {response}")

            if sources:
                unique = list(set(sources))
                print(f"\n[Sources: {', '.join(unique)}]")

        except KeyboardInterrupt:
            print("\nGoodbye.")
            break
        except Exception as e:
            print(f"Error: {e}")

if __name__ == "__main__":
    main()
@@ -0,0 +1,141 @@
import os
import sys
import hashlib
from pathlib import Path
from dotenv import load_dotenv
import chromadb
from sentence_transformers import SentenceTransformer
from docx import Document
from pypdf import PdfReader
from pptx import Presentation

load_dotenv(Path.home() / "aaronai" / ".env")

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

db_path = str(Path.home() / "aaronai" / "db")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"}
)

def extract_text_from_docx(path):
    doc = Document(path)
    return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])

def extract_text_from_pdf(path):
    reader = PdfReader(path)
    text = ""
    for page in reader.pages:
        extracted = page.extract_text()
        if extracted:
            text += extracted + "\n"
    return text

def extract_text_from_pptx(path):
    prs = Presentation(path)
    text = ""
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                text += shape.text + "\n"
    return text

def extract_text_from_txt(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        if chunk.strip():
            chunks.append(chunk)
        start += chunk_size - overlap
    return chunks
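# chunk_text above: with chunk_size=500 and overlap=50 the window advances 450 words
# per step, so a 1,200-word document yields chunks starting at words 0, 450, and 900.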

def make_id(filepath, chunk_index):
    path_hash = hashlib.md5(str(filepath).encode()).hexdigest()[:8]
    return f"{path_hash}_{chunk_index}"

def ingest_file(filepath):
    path = Path(filepath)
    suffix = path.suffix.lower()

    # Skip temp files
    if path.name.startswith("~$") or path.name.startswith("."):
        return 0

    try:
        if suffix == ".docx":
            text = extract_text_from_docx(path)
        elif suffix == ".pdf":
            text = extract_text_from_pdf(path)
        elif suffix == ".pptx":
            text = extract_text_from_pptx(path)
        elif suffix in [".txt", ".md"]:
            text = extract_text_from_txt(path)
        else:
            return 0

        if not text.strip():
            return 0

        chunks = chunk_text(text)
        if not chunks:
            return 0

        embeddings = embedder.encode(chunks).tolist()
        ids = [make_id(path, i) for i in range(len(chunks))]
        metadatas = [{
            "source": path.name,
            "filepath": str(path),
            "folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent))
        } for _ in chunks]

        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas
        )
        print(f"  Indexed {len(chunks)} chunks: {path.name}")
        return len(chunks)

    except Exception as e:
        print(f"  Error: {path.name}: {e}")
        return 0

def ingest_folder(folder_path):
    folder = Path(folder_path)
    if not folder.exists():
        print(f"Folder not found: {folder_path}")
        sys.exit(1)

    supported = [".docx", ".pdf", ".pptx", ".txt", ".md"]
    files = [f for f in folder.rglob("*")
             if f.suffix.lower() in supported
             and not f.name.startswith("~$")
             and not f.name.startswith(".")]

    if not files:
        print("No supported files found.")
        sys.exit(1)

    print(f"Found {len(files)} files to process\n")
    total_chunks = 0
    for f in files:
        total_chunks += ingest_file(f)

    print(f"\nDone. Total chunks indexed: {total_chunks}")
    print(f"Database stored at: {db_path}")

if __name__ == "__main__":
    target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs")
    print(f"Ingesting from: {target}\n")
    ingest_folder(target)
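
This is the script the server's /api/reindex endpoint and the watcher launch via subprocess; run by hand against the Nextcloud corpus it would look roughly like this (interpreter and paths taken from the constants in the other files):

    /home/aaron/aaronai/venv/bin/python3 /home/aaron/aaronai/scripts/ingest.py /home/aaron/nextcloud/data/data/aaron/files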
@@ -0,0 +1,150 @@
import json
import sys
from pathlib import Path
from datetime import datetime
from sentence_transformers import SentenceTransformer
import chromadb

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"}
)

def extract_messages(convo):
    """Extract ordered user/assistant messages from a conversation."""
    mapping = convo.get("mapping", {})
    messages = []

    for node in mapping.values():
        msg = node.get("message")
        if not msg:
            continue

        role = msg.get("author", {}).get("role")
        if role not in ["user", "assistant"]:
            continue

        content = msg.get("content", {})
        parts = content.get("parts", [])

        # Extract text parts only
        text = ""
        for part in parts:
            if isinstance(part, str):
                text += part
            elif isinstance(part, dict) and part.get("content_type") == "text":
                text += part.get("text", "")

        text = text.strip()
        if not text:
            continue

        create_time = msg.get("create_time") or 0
        messages.append((create_time, role, text))

    # Sort by timestamp
    messages.sort(key=lambda x: x[0])
    return messages

def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping text chunks."""
    # Build full conversation text
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")

    full_text = "\n".join(lines)

    # Split into word-level chunks with overlap
    words = full_text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        if chunk.strip():
            chunks.append(chunk)
        start += chunk_size - overlap

    return chunks
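# chunk_conversation above: consecutive chunks share overlap=100 words, so the
# window advances chunk_size - overlap = 500 words per step.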

def ingest_file(json_path):
    print(f"\nLoading {json_path.name}...")
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Found {len(data)} conversations")

    total_chunks = 0
    skipped = 0

    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)

        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except Exception:
            date_str = "unknown"

        messages = extract_messages(convo)

        if len(messages) < 2:
            skipped += 1
            continue

        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue

        # Embed and store
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation"
        } for _ in chunks]

        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas
        )

        total_chunks += len(chunks)
        print(f"  [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")

    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks

def main():
    export_dir = Path(EXPORT_DIR)
    files = [
        export_dir / "conversations-000.json",
        export_dir / "conversations-001.json"
    ]

    grand_total = 0
    for f in files:
        if f.exists():
            grand_total += ingest_file(f)
        else:
            print(f"Not found: {f}")

    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")

if __name__ == "__main__":
    main()
@@ -0,0 +1,122 @@
import time
import subprocess
import logging
import json
from pathlib import Path
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
INGEST_SCRIPT = "/home/aaron/aaronai/scripts/ingest.py"
PYTHON = "/home/aaron/aaronai/venv/bin/python3"
LOG_FILE = "/home/aaron/aaronai/watcher.log"
STATE_FILE = "/home/aaron/aaronai/watcher_state.json"

SUPPORTED = {'.pdf', '.docx', '.pptx', '.txt', '.md'}
DEBOUNCE_SECONDS = 120
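# File events only flag work as pending; run_ingestion() fires once the watched
# tree has been quiet for DEBOUNCE_SECONDS (see the polling loop in main()).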

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)

def load_state():
    if Path(STATE_FILE).exists():
        with open(STATE_FILE) as f:
            return json.load(f)
    return {}

def save_state(state):
    with open(STATE_FILE, 'w') as f:
        json.dump(state, f)

def get_changed_files():
    state = load_state()
    changed = []
    root = Path(NEXTCLOUD_PATH)
    for path in root.rglob("*"):
        if path.is_dir():
            continue
        if path.suffix.lower() not in SUPPORTED:
            continue
        if path.name.startswith('.') or path.name.startswith('~$'):
            continue
        mtime = str(path.stat().st_mtime)
        key = str(path)
        if state.get(key) != mtime:
            changed.append(path)
    return changed, state

def run_ingestion():
    changed, state = get_changed_files()
    if not changed:
        logging.info("No new or changed files detected — skipping ingestion.")
        return

    logging.info(f"Found {len(changed)} new or changed files — starting ingestion...")
    try:
        result = subprocess.run(
            [PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH],
            capture_output=True,
            text=True,
            timeout=600
        )
        if result.returncode == 0:
            # Update state with new mtimes
            root = Path(NEXTCLOUD_PATH)
            for path in root.rglob("*"):
                if path.is_file() and path.suffix.lower() in SUPPORTED:
                    state[str(path)] = str(path.stat().st_mtime)
            save_state(state)
            logging.info("Ingestion complete. State updated.")
        else:
            logging.error(f"Ingestion error: {result.stderr}")
    except subprocess.TimeoutExpired:
        logging.error("Ingestion timed out.")
    except Exception as e:
        logging.error(f"Ingestion failed: {e}")

class IngestHandler(FileSystemEventHandler):
    def __init__(self):
        self.pending = False
        self.last_event = 0

    def on_any_event(self, event):
        if event.is_directory:
            return
        path = Path(event.src_path)
        if path.suffix.lower() not in SUPPORTED:
            return
        if path.name.startswith('.') or path.name.startswith('~$'):
            return
        self.pending = True
        self.last_event = time.time()

def main():
    logging.info("Aaron AI Watcher starting...")
    logging.info(f"Watching: {NEXTCLOUD_PATH}")

    handler = IngestHandler()
    observer = Observer()
    observer.schedule(handler, NEXTCLOUD_PATH, recursive=True)
    observer.start()

    try:
        while True:
            if handler.pending:
                elapsed = time.time() - handler.last_event
                if elapsed >= DEBOUNCE_SECONDS:
                    handler.pending = False
                    run_ingestion()
            time.sleep(5)
    except KeyboardInterrupt:
        observer.stop()
        observer.join()
        logging.info("Watcher stopped.")

if __name__ == "__main__":
    main()