chore: archive deprecated chromadb and migration scripts
This commit is contained in:
@@ -565,6 +565,7 @@ async def get_status(auth: str = Depends(require_auth)):
|
||||
|
||||
# Watcher status
|
||||
watcher_running = False
|
||||
watcher_ingestion = {"status": "idle", "message": "", "file_count": 0}
|
||||
last_indexed = "Unknown"
|
||||
try:
|
||||
import time as _time, json as _json
|
||||
@@ -573,6 +574,7 @@ async def get_status(auth: str = Depends(require_auth)):
|
||||
_s = _json.loads(_sp.read_text())
|
||||
_age = _time.time() - _s.get("timestamp", 0)
|
||||
watcher_running = _s.get("running", False) and _age < 30
|
||||
watcher_ingestion = _s.get("ingestion", watcher_ingestion)
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -613,6 +615,7 @@ async def get_status(auth: str = Depends(require_auth)):
|
||||
return JSONResponse({
|
||||
"aaron_ai": "running",
|
||||
"watcher": "running" if watcher_running else "stopped",
|
||||
"watcher_ingestion": watcher_ingestion,
|
||||
"chunk_count": chunk_count,
|
||||
"file_count": file_count,
|
||||
"last_indexed": last_indexed,
|
||||
|
||||
-250
@@ -1,250 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
import chromadb
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import anthropic
|
||||
from datetime import datetime
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
memory_path = Path.home() / "aaronai" / "memory.md"
|
||||
db_path = str(Path.home() / "aaronai" / "db")
|
||||
|
||||
print("Loading Aaron AI...")
|
||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
chroma_client = chromadb.PersistentClient(path=db_path)
|
||||
collection = chroma_client.get_or_create_collection(
|
||||
name="aaronai",
|
||||
metadata={"hnsw:space": "cosine"}
|
||||
)
|
||||
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
|
||||
SYSTEM_PROMPT = """You are Aaron Nelson's personal AI assistant. Aaron is an Associate Professor
|
||||
of Digital Design & Fabrication and Program Director of the Hudson Valley Additive Manufacturing
|
||||
Center (HVAMC) at SUNY New Paltz. He is an expert in computational design, additive manufacturing,
|
||||
and digital fabrication with deep fluency in Rhino, Grasshopper, Stratasys FDM, PolyJet, and metal
|
||||
3D printing workflows. He runs a commercial venture called Mossygear and a consulting operation
|
||||
called FWN3D. He has a background in graffiti lettering and vector illustration.
|
||||
|
||||
You have been provided with relevant excerpts from Aaron's own documents and his persistent memory.
|
||||
Use this context to give answers grounded in his actual work and history. When helping him write
|
||||
or create, match his voice and draw on his existing materials. Be direct and specific -
|
||||
Aaron values precision over padding. Always cite which documents you drew from when relevant.
|
||||
|
||||
You have access to web search. Use it automatically when:
|
||||
- Questions require current data (salaries, job postings, prices, news)
|
||||
- Questions reference specific institutions, people, or organizations you need to verify
|
||||
- Aaron's documents and memory don't contain sufficient information to answer well
|
||||
Do not announce that you are searching. Just search and incorporate results naturally."""
|
||||
|
||||
CV_SOURCES = ["Aaron Nelson CV 2024.pdf"]
|
||||
conversation_history = []
|
||||
|
||||
TOOLS = [
|
||||
{
|
||||
"type": "web_search_20250305",
|
||||
"name": "web_search"
|
||||
}
|
||||
]
|
||||
|
||||
def load_memory():
    """Return the persistent memory file's text, or "" if it doesn't exist."""
    return memory_path.read_text(encoding="utf-8") if memory_path.exists() else ""
|
||||
|
||||
def save_memory(content):
    """Overwrite the persistent memory file with *content* (UTF-8)."""
    memory_path.write_text(content, encoding="utf-8")
|
||||
|
||||
def add_to_memory(new_item):
    """Append a dated bullet under the "## Notes" heading, creating it if absent."""
    current = load_memory()
    if "## Notes" not in current:
        current += "\n\n## Notes"
    today = datetime.now().strftime("%Y-%m-%d")
    current += f"\n- [{today}] {new_item}"
    save_memory(current)
|
||||
|
||||
def remove_from_memory(item):
    """Delete every memory line containing *item* (case-insensitive).

    Returns the number of lines removed.
    """
    needle = item.lower()
    original_lines = load_memory().split("\n")
    kept = [line for line in original_lines if needle not in line.lower()]
    save_memory("\n".join(kept))
    return len(original_lines) - len(kept)
|
||||
|
||||
def get_pinned_cv_context():
    """Fetch all chunks of the pinned CV document from the vector store.

    Returns the (documents, metadatas) pair straight from the collection.
    """
    cv_chunks = collection.get(
        where={"source": "Aaron Nelson CV 2024.pdf"},
        include=["documents", "metadatas"],
    )
    return cv_chunks["documents"], cv_chunks["metadatas"]
|
||||
|
||||
def is_professional_query(query):
    """Heuristic: does *query* mention career/CV-related vocabulary?"""
    PROFESSIONAL_TERMS = (
        "grant", "publication", "exhibition", "award", "fellowship",
        "experience", "position", "job", "career", "cv", "resume",
        "research", "work history", "accomplishment", "teaching",
        "course", "client", "consultation", "presentation", "workshop",
        "education", "degree", "institution", "service", "committee",
    )
    lowered = query.lower()
    for term in PROFESSIONAL_TERMS:
        if term in lowered:
            return True
    return False
|
||||
|
||||
def retrieve_context(query, n_results=8):
    """Gather context chunks for *query* from the vector store.

    CV chunks are pinned in first when the query sounds professional; then
    semantic-search hits above a relevance floor are appended, skipping CV
    chunks so they are never duplicated. Returns (context_pieces, sources).
    """
    query_vec = embedder.encode([query]).tolist()
    hits = collection.query(
        query_embeddings=query_vec,
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )

    pieces = []
    origins = []

    # Pin the CV whenever the query sounds career-related.
    if is_professional_query(query):
        cv_docs, cv_metas = get_pinned_cv_context()
        for cv_doc, cv_meta in zip(cv_docs, cv_metas):
            pieces.append(f"[CV] {cv_doc}")
            origins.append(cv_meta["source"])

    docs = hits["documents"][0]
    metas = hits["metadatas"][0]
    dists = hits["distances"][0]
    for doc, meta, dist in zip(docs, metas, dists):
        # Cosine distance -> similarity; drop weak matches and CV repeats.
        if (1 - dist) > 0.3 and meta["source"] not in CV_SOURCES:
            pieces.append(doc)
            origins.append(meta["source"])

    return pieces, origins
|
||||
|
||||
def handle_command(user_input):
    """Handle REPL meta-commands; return True if *user_input* was one.

    Recognized (case-insensitive): "show memory", "remember: <fact>",
    "forget: <text>", "clear". Anything else returns False so the caller
    treats it as a chat message.
    """
    text = user_input.strip()  # original case, so stored items keep their casing
    lowered = text.lower()

    if lowered == "show memory":
        memory = load_memory()
        print(f"\nAaron AI: Current memory:\n\n{memory}")
        return True

    if lowered.startswith("remember:"):
        # Slice the stripped text, not the raw input: the original sliced
        # user_input by a fixed offset, so any leading whitespace before the
        # command shifted the payload and corrupted the stored item.
        item = text[len("remember:"):].strip()
        add_to_memory(item)
        print(f"\nAaron AI: Saved to memory: '{item}'")
        return True

    if lowered.startswith("forget:"):
        item = text[len("forget:"):].strip()
        removed = remove_from_memory(item)
        if removed:
            print(f"\nAaron AI: Removed {removed} line(s) containing '{item}' from memory.")
        else:
            print(f"\nAaron AI: Nothing found in memory containing '{item}'.")
        return True

    if lowered == "clear":
        conversation_history.clear()
        print("\nAaron AI: Conversation history cleared.")
        return True

    return False
|
||||
|
||||
def chat(user_message):
    """Run one chat turn: assemble context, call Claude (looping through
    any web-search tool use), update conversation history, and return
    (assistant_message, sources).

    *sources* is the list of document names retrieve_context() drew on.
    """
    memory = load_memory()
    context_pieces, sources = retrieve_context(user_message)

    # Context preamble: persistent memory first, then retrieved excerpts
    # with their deduplicated source list.
    context_parts = []
    if memory:
        context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
    if context_pieces:
        context_str = "\n\n---\n\n".join(context_pieces)
        unique_sources = list(set(sources))
        context_parts.append(
            f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
        )

    # Context is prepended to the user text inside a single user message.
    context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
    full_message = context_block + user_message

    # Build messages for this turn (history + the new context-laden message)
    messages = conversation_history + [{"role": "user", "content": full_message}]

    # Agentic loop to handle tool use: keep calling the API until the model
    # stops asking for tools, echoing tool_result blocks back each round.
    while True:
        response = anthropic_client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            system=SYSTEM_PROMPT,
            tools=TOOLS,
            messages=messages
        )

        # Check if we need to handle tool calls
        if response.stop_reason == "tool_use":
            # Add assistant response to messages
            messages.append({"role": "assistant", "content": response.content})

            # Process each tool use block
            tool_results = []
            for block in response.content:
                if block.type == "tool_use":
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        # NOTE(review): placeholder result text — presumably the
                        # server-side web_search tool returns results itself;
                        # confirm this stub is intentional.
                        "content": "Search completed"
                    })

            # Add tool results and continue
            messages.append({"role": "user", "content": tool_results})

        else:
            # Final response - extract text
            assistant_message = ""
            for block in response.content:
                if hasattr(block, "text"):
                    assistant_message += block.text

            # Update conversation history with clean versions
            conversation_history.append({"role": "user", "content": full_message})
            conversation_history.append({"role": "assistant", "content": assistant_message})

            # Cap history at 20 entries (10 turns): drop the oldest pair.
            if len(conversation_history) > 20:
                conversation_history.pop(0)
                conversation_history.pop(0)

            return assistant_message, sources
|
||||
|
||||
def main():
    """Interactive REPL: meta-commands first, otherwise a retrieval-augmented chat turn."""
    print("Aaron AI ready. Corpus, memory, and web search loaded.")
    print("Commands: 'remember: [fact]' | 'forget: [text]' | 'show memory' | 'clear' | 'quit'")
    print("=" * 60)

    while True:
        try:
            user_input = input("\nYou: ").strip()
            if not user_input:
                continue
            if user_input.strip().lower() == "quit":
                print("Goodbye.")
                break
            if handle_command(user_input):
                continue

            reply, cited = chat(user_input)
            print(f"\nAaron AI: {reply}")
            if cited:
                unique = list(set(cited))
                print(f"\n[Sources: {', '.join(unique)}]")
        except KeyboardInterrupt:
            # Ctrl-C exits cleanly rather than falling into the generic handler.
            print("\nGoodbye.")
            break
        except Exception as e:
            print(f"Error: {e}")
|
||||
@@ -145,7 +145,6 @@ def ingest_folder(folder_path):
|
||||
total_chunks += ingest_file(f)
|
||||
|
||||
print(f"\nDone. Total chunks indexed: {total_chunks}")
|
||||
print(f"Database stored at: {db_path}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs")
|
||||
|
||||
@@ -1,152 +0,0 @@
|
||||
import json
import sys
from pathlib import Path
from datetime import datetime
from sentence_transformers import SentenceTransformer
import chromadb  # FIX: was missing — chromadb.PersistentClient below raised NameError at import
import psycopg2
import psycopg2.extras
import json as json_module

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"

print("Loading embedding model...")
# Same embedding model the chat app uses, so vectors are comparable.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
|
||||
|
||||
def extract_messages(convo):
    """Extract ordered (create_time, role, text) tuples from a ChatGPT
    export conversation, keeping only non-empty user/assistant messages."""
    collected = []
    for node in convo.get("mapping", {}).values():
        msg = node.get("message")
        if not msg:
            continue

        role = msg.get("author", {}).get("role")
        if role not in ("user", "assistant"):
            continue

        # Concatenate text parts only (plain strings or typed text blocks).
        fragments = []
        for part in msg.get("content", {}).get("parts", []):
            if isinstance(part, str):
                fragments.append(part)
            elif isinstance(part, dict) and part.get("content_type") == "text":
                fragments.append(part.get("text", ""))

        text = "".join(fragments).strip()
        if not text:
            continue

        collected.append((msg.get("create_time") or 0, role, text))

    # Chronological order by timestamp.
    collected.sort(key=lambda entry: entry[0])
    return collected
|
||||
|
||||
def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping word-level text chunks.

    Args:
        title: Conversation title, embedded in a header line of every chunk.
        messages: Iterable of (timestamp, role, text) tuples; role "user"
            is labelled "Aaron", everything else "ChatGPT".
        chunk_size: Words per chunk.
        overlap: Words shared between consecutive chunks.

    Returns:
        List of chunk strings (possibly empty).

    Raises:
        ValueError: if overlap >= chunk_size — the original looped forever
            in that case because the window could never advance.
    """
    if overlap >= chunk_size:
        raise ValueError(f"overlap ({overlap}) must be < chunk_size ({chunk_size})")

    # Flatten the dialogue into one labelled transcript.
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")
    full_text = "\n".join(lines)

    # Slide a chunk_size-word window forward by (chunk_size - overlap) words.
    words = full_text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
    return chunks
|
||||
|
||||
def ingest_file(json_path):
    """Ingest one ChatGPT conversations-*.json export into the collection.

    Chunks each conversation, embeds the chunks, and upserts them with
    metadata. Conversations with fewer than two usable messages (or that
    produce no chunks) are skipped. Returns the number of chunks indexed.
    """
    print(f"\nLoading {json_path.name}...")
    # Context manager so the handle is always closed — the original used
    # json.load(open(...)) and leaked the file object.
    with open(json_path, encoding="utf-8") as fh:
        data = json.load(fh)
    print(f"Found {len(data)} conversations")

    total_chunks = 0
    skipped = 0

    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)

        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError, OSError):
            # fromtimestamp rejects None / out-of-range values; the original
            # bare `except:` also swallowed KeyboardInterrupt and real bugs.
            date_str = "unknown"

        messages = extract_messages(convo)
        if len(messages) < 2:
            skipped += 1
            continue

        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue

        # Embed and store (upsert keys on deterministic per-chunk ids).
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation"
        } for _ in chunks]

        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas
        )

        total_chunks += len(chunks)
        print(f" [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")

    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks
|
||||
|
||||
def main():
    """Ingest both numbered export files, reporting any that are missing."""
    base = Path(EXPORT_DIR)
    grand_total = 0

    for filename in ("conversations-000.json", "conversations-001.json"):
        candidate = base / filename
        if not candidate.exists():
            print(f"Not found: {candidate}")
            continue
        grand_total += ingest_file(candidate)

    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")
|
||||
@@ -1,189 +0,0 @@
|
||||
import json
import sys
from pathlib import Path
from sentence_transformers import SentenceTransformer
import chromadb  # FIX: was missing — chromadb.PersistentClient below raised NameError at import
import psycopg2
import psycopg2.extras
import json as json_module

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"

print("Loading embedding model...")
# Same embedding model the chat app uses, so vectors are comparable.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
|
||||
|
||||
def extract_messages(convo):
    """Extract (created_at, role, text) tuples from a Claude export
    conversation, keeping only non-empty human/assistant messages."""
    extracted = []
    for msg in convo.get("chat_messages", []):
        role = msg.get("sender", "")
        if role not in ("human", "assistant"):
            continue

        # Content is either a plain string or a list of blocks.
        content = msg.get("content", [])
        fragments = []
        if isinstance(content, str):
            fragments.append(content)
        elif isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    fragments.append(block.get("text", ""))
                elif isinstance(block, str):
                    fragments.append(block)

        text = "".join(fragments).strip()
        if not text:
            continue

        extracted.append((msg.get("created_at", ""), role, text))
    return extracted
|
||||
|
||||
def chunk_conversation(convo):
    """Slice a Claude conversation into rolling chunks of up to 3 messages.

    Each chunk carries the last message of the previous chunk as overlap.
    Returns a list of (chunk_id, chunk_text, metadata) tuples.
    """
    title = convo.get("name", "Untitled conversation")
    convo_uuid = convo.get("uuid", "")
    created_at = convo.get("created_at", "")

    messages = extract_messages(convo)
    if not messages:
        return []

    results = []
    buffer = []
    last_index = len(messages) - 1
    for i, (_ts, role, text) in enumerate(messages):
        speaker = "You" if role == "human" else "Claude"
        buffer.append(f"{speaker}: {text}")
        if len(buffer) >= 3 or i == last_index:
            body = f"[Claude conversation: {title}]\n\n" + "\n\n".join(buffer)
            results.append((f"claude_{convo_uuid}_{i}", body, {
                "source": f"Claude: {title}",
                "type": "claude_conversation",
                "created_at": created_at,
            }))
            buffer = buffer[-1:]  # keep one message for overlap
    return results
|
||||
|
||||
def ingest_conversations(path):
    """Ingest a Claude conversations export (.json array or .jsonl).

    Chunks each conversation, skips chunk ids already present in the
    collection, embeds the remainder, and upserts them into PostgreSQL.
    Returns the number of chunks added.

    NOTE(review): duplicates are checked against the Chroma `collection`
    but writes go through `get_pg()` to PostgreSQL — a half-migrated
    dual-store path; confirm which store is authoritative.
    """
    print(f"\nIngesting conversations from {path.name}...")
    conversations = []

    # Handle both .json (array) and .jsonl (one per line)
    raw = path.read_text(encoding="utf-8").strip()
    if raw.startswith("["):
        conversations = json.loads(raw)
    else:
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                conversations.append(json.loads(line))
            except json.JSONDecodeError:
                # Skip malformed lines only; the original bare `except:`
                # also swallowed KeyboardInterrupt and real bugs.
                continue

    print(f"Found {len(conversations)} conversations")
    total = 0
    skipped = 0

    for convo in conversations:
        chunks = chunk_conversation(convo)
        if not chunks:
            skipped += 1
            continue

        ids = [c[0] for c in chunks]
        texts = [c[1] for c in chunks]
        metas = [c[2] for c in chunks]

        # Only insert ids not already in the collection.
        existing_ids = set(collection.get(ids=ids)["ids"])
        new = [(cid, txt, meta) for cid, txt, meta in zip(ids, texts, metas)
               if cid not in existing_ids]
        if not new:
            continue

        embeddings = embedder.encode([n[1] for n in new]).tolist()
        pg = get_pg()
        cur = pg.cursor()
        for (chunk_id, chunk_text, meta), embedding in zip(new, embeddings):
            cur.execute("""
                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
                VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
                ON CONFLICT (id) DO UPDATE SET
                    document = EXCLUDED.document,
                    embedding = EXCLUDED.embedding,
                    source = EXCLUDED.source,
                    type = EXCLUDED.type,
                    created_at = EXCLUDED.created_at,
                    metadata = EXCLUDED.metadata
            """, (
                chunk_id, chunk_text, embedding,
                meta.get('source'), meta.get('type'), meta.get('created_at'),
                json_module.dumps(meta)
            ))
        pg.commit()
        pg.close()
        total += len(new)

    print(f"Conversations: {total} chunks added, {skipped} skipped")
    return total
|
||||
|
||||
def ingest_memories(path):
    """Consolidate a Claude memories export into a single corpus chunk.

    Returns 1 if the chunk was (re)written, 0 if the file held no memories.
    """
    print(f"\nIngesting memories from {path.name}...")
    parsed = json.loads(path.read_text(encoding="utf-8"))

    # The export is either a bare list or an object with a "memories" key.
    items = parsed if isinstance(parsed, list) else parsed.get("memories", [])
    if not items:
        print("No memories found")
        return 0

    # One consolidated chunk — the memories are already distilled facts.
    bullets = []
    for item in items:
        value = item.get('content', item) if isinstance(item, dict) else item
        bullets.append(f"- {value}")
    memory_text = "\n".join(bullets)

    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
    chunk_id = "claude_memories_consolidated"

    # Replace any previous consolidated chunk before re-adding.
    if collection.get(ids=[chunk_id])["ids"]:
        collection.delete(ids=[chunk_id])

    collection.upsert(
        ids=[chunk_id],
        documents=[chunk_text],
        metadatas=[{
            "source": "Claude: Memory",
            "type": "claude_memory",
        }],
        embeddings=embedder.encode([chunk_text]).tolist(),
    )

    print(f"Memories: 1 chunk added ({len(items)} memory items)")
    return 1
|
||||
|
||||
# Run ingestion: process every conversations.* and memories.* file found
# in the export directory, then report the updated corpus size.
export_dir = Path(EXPORT_DIR)
total = 0

conv_files = list(export_dir.glob("conversations.*"))
for f in conv_files:
    total += ingest_conversations(f)

mem_files = list(export_dir.glob("memories.*"))
for f in mem_files:
    total += ingest_memories(f)

if total == 0:
    print("\nNo files found or no new chunks to add.")
else:
    print(f"\nTotal chunks added to corpus: {total}")

# Show updated corpus size
count = collection.count()
print(f"Corpus now contains {count} total chunks")
|
||||
@@ -1,125 +0,0 @@
|
||||
"""
|
||||
Migration: ChromaDB → pgvector
|
||||
Re-embeds all documents from ChromaDB SQLite into PostgreSQL with pgvector.
|
||||
Keeps ChromaDB intact as backup until migration is verified.
|
||||
"""
|
||||
import sqlite3
|
||||
import psycopg2
|
||||
import json
|
||||
from pathlib import Path
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
CHROMA_SQLITE = str(Path.home() / "aaronai" / "db" / "chroma.sqlite3")
|
||||
import os
|
||||
PG_DSN = os.getenv("PG_DSN")
|
||||
if not PG_DSN:
|
||||
raise RuntimeError("PG_DSN environment variable not set")
|
||||
|
||||
print("Loading embedding model...")
|
||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
|
||||
print("Connecting to databases...")
|
||||
chroma = sqlite3.connect(CHROMA_SQLITE)
|
||||
chroma.row_factory = sqlite3.Row
|
||||
c = chroma.cursor()
|
||||
|
||||
pg = psycopg2.connect(PG_DSN)
|
||||
pg_cur = pg.cursor()
|
||||
|
||||
# Get all documents with their metadata from ChromaDB
|
||||
print("Reading documents from ChromaDB...")
|
||||
c.execute("""
|
||||
SELECT
|
||||
e.id as row_id,
|
||||
e.embedding_id,
|
||||
MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document,
|
||||
MAX(CASE WHEN em.key = 'source' THEN em.string_value END) as source,
|
||||
MAX(CASE WHEN em.key = 'type' THEN em.string_value END) as type,
|
||||
MAX(CASE WHEN em.key = 'created_at' THEN em.string_value END) as created_at
|
||||
FROM embeddings e
|
||||
LEFT JOIN embedding_metadata em ON e.id = em.id
|
||||
GROUP BY e.id, e.embedding_id
|
||||
HAVING document IS NOT NULL
|
||||
ORDER BY e.id
|
||||
""")
|
||||
|
||||
rows = c.fetchall()
|
||||
print(f"Found {len(rows)} documents to migrate")
|
||||
|
||||
# Check existing in PostgreSQL
|
||||
pg_cur.execute("SELECT id FROM embeddings")
|
||||
existing_ids = set(r[0] for r in pg_cur.fetchall())
|
||||
print(f"Already in PostgreSQL: {len(existing_ids)}")
|
||||
|
||||
# Filter to only new ones
|
||||
to_migrate = [r for r in rows if r['embedding_id'] not in existing_ids]
|
||||
print(f"Need to migrate: {len(to_migrate)}")
|
||||
|
||||
if not to_migrate:
|
||||
print("Nothing to migrate — already complete")
|
||||
chroma.close()
|
||||
pg.close()
|
||||
exit(0)
|
||||
|
||||
# Migrate in batches
|
||||
batch_size = 200
|
||||
migrated = 0
|
||||
errors = 0
|
||||
|
||||
for i in range(0, len(to_migrate), batch_size):
|
||||
batch = to_migrate[i:i+batch_size]
|
||||
|
||||
# Generate embeddings
|
||||
texts = [r['document'] for r in batch]
|
||||
try:
|
||||
embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
|
||||
except Exception as e:
|
||||
print(f"Embedding error at batch {i}: {e}")
|
||||
errors += len(batch)
|
||||
continue
|
||||
|
||||
# Insert into PostgreSQL
|
||||
for row, embedding in zip(batch, embeddings):
|
||||
try:
|
||||
pg_cur.execute("""
|
||||
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
|
||||
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
|
||||
ON CONFLICT (id) DO UPDATE SET
|
||||
document = EXCLUDED.document,
|
||||
embedding = EXCLUDED.embedding,
|
||||
source = EXCLUDED.source,
|
||||
type = EXCLUDED.type,
|
||||
created_at = EXCLUDED.created_at,
|
||||
metadata = EXCLUDED.metadata
|
||||
""", (
|
||||
row['embedding_id'],
|
||||
row['document'],
|
||||
embedding,
|
||||
row['source'],
|
||||
row['type'],
|
||||
row['created_at'],
|
||||
json.dumps({
|
||||
'source': row['source'],
|
||||
'type': row['type'],
|
||||
'created_at': row['created_at'],
|
||||
})
|
||||
))
|
||||
migrated += 1
|
||||
except Exception as e:
|
||||
print(f"Insert error for {row['embedding_id']}: {e}")
|
||||
errors += 1
|
||||
|
||||
pg.commit()
|
||||
print(f"Progress: {min(i+batch_size, len(to_migrate))}/{len(to_migrate)} ({errors} errors)")
|
||||
|
||||
# Final count
|
||||
pg_cur.execute("SELECT COUNT(*) FROM embeddings")
|
||||
final_count = pg_cur.fetchone()[0]
|
||||
|
||||
chroma.close()
|
||||
pg.close()
|
||||
|
||||
print(f"\nMigration complete:")
|
||||
print(f" Migrated: {migrated}")
|
||||
print(f" Errors: {errors}")
|
||||
print(f" PostgreSQL total: {final_count}")
|
||||
+86
-10
@@ -2,6 +2,7 @@ import time
|
||||
import subprocess
|
||||
import logging
|
||||
import json
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
@@ -25,16 +26,35 @@ logging.basicConfig(
|
||||
]
|
||||
)
|
||||
|
||||
# Shared snapshot of the current ingestion run; mutated only under
# ingestion_lock (see set_ingestion_state) and serialized into the status
# file by write_status for external readers.
ingestion_state = {
    "status": "idle",
    "message": "",
    "file_count": 0,
    "started_at": None,
    "finished_at": None,
    "last_error": "",
}
ingestion_lock = threading.Lock()
# Background worker thread handle; managed by start_ingestion_thread().
ingestion_thread = None
||||
|
||||
|
||||
def set_ingestion_state(**kwargs):
    """Merge *kwargs* into the shared ingestion_state dict under the lock."""
    with ingestion_lock:
        ingestion_state.update(kwargs)
|
||||
|
||||
|
||||
def load_state():
    """Read the persisted file→mtime map; {} when no state file exists yet."""
    if not Path(STATE_FILE).exists():
        return {}
    with open(STATE_FILE) as fh:
        return json.load(fh)
|
||||
|
||||
|
||||
def save_state(state):
    """Persist the file→mtime map as JSON."""
    with open(STATE_FILE, 'w') as fh:
        json.dump(state, fh)
|
||||
|
||||
|
||||
def get_changed_files():
|
||||
state = load_state()
|
||||
changed = []
|
||||
@@ -52,13 +72,25 @@ def get_changed_files():
|
||||
changed.append(path)
|
||||
return changed, state
|
||||
|
||||
|
||||
def run_ingestion():
|
||||
changed, state = get_changed_files()
|
||||
if not changed:
|
||||
logging.info("No new or changed files detected — skipping ingestion.")
|
||||
set_ingestion_state(status="idle", message="No changes detected", file_count=0)
|
||||
return
|
||||
|
||||
logging.info(f"Found {len(changed)} new or changed files — starting ingestion...")
|
||||
count = len(changed)
|
||||
logging.info(f"Found {count} new or changed files — starting ingestion...")
|
||||
set_ingestion_state(
|
||||
status="ingesting",
|
||||
message=f"Ingesting {count} file(s)...",
|
||||
file_count=count,
|
||||
started_at=time.time(),
|
||||
finished_at=None,
|
||||
last_error="",
|
||||
)
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH],
|
||||
@@ -67,19 +99,51 @@ def run_ingestion():
|
||||
timeout=1800
|
||||
)
|
||||
if result.returncode == 0:
|
||||
# Update state with new mtimes
|
||||
root = Path(NEXTCLOUD_PATH)
|
||||
for path in root.rglob("*"):
|
||||
if path.is_file() and path.suffix.lower() in SUPPORTED:
|
||||
state[str(path)] = str(path.stat().st_mtime)
|
||||
save_state(state)
|
||||
logging.info("Ingestion complete. State updated.")
|
||||
set_ingestion_state(
|
||||
status="idle",
|
||||
message=f"Last run: ingested {count} file(s) successfully",
|
||||
finished_at=time.time(),
|
||||
)
|
||||
else:
|
||||
logging.error(f"Ingestion error: {result.stderr}")
|
||||
set_ingestion_state(
|
||||
status="error",
|
||||
message="Ingestion failed — see log",
|
||||
last_error=result.stderr[-300:],
|
||||
finished_at=time.time(),
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
logging.error("Ingestion timed out.")
|
||||
set_ingestion_state(
|
||||
status="error",
|
||||
message="Ingestion timed out (>30 min)",
|
||||
last_error="TimeoutExpired",
|
||||
finished_at=time.time(),
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error(f"Ingestion failed: {e}")
|
||||
set_ingestion_state(
|
||||
status="error",
|
||||
message=f"Ingestion exception: {e}",
|
||||
last_error=str(e),
|
||||
finished_at=time.time(),
|
||||
)
|
||||
|
||||
|
||||
def start_ingestion_thread():
    """Launch run_ingestion on a daemon thread unless one is still active."""
    global ingestion_thread
    already_running = ingestion_thread and ingestion_thread.is_alive()
    if already_running:
        logging.info("Ingestion already running — skipping.")
        return
    worker = threading.Thread(target=run_ingestion, daemon=True)
    ingestion_thread = worker
    worker.start()
|
||||
|
||||
|
||||
class IngestHandler(FileSystemEventHandler):
|
||||
def __init__(self):
|
||||
@@ -98,9 +162,26 @@ class IngestHandler(FileSystemEventHandler):
|
||||
return
|
||||
if 'Journal/Media' in str(path):
|
||||
return
|
||||
if event.event_type not in ('modified', 'created', 'moved'):
|
||||
return
|
||||
logging.info(f"Event: {event.event_type} {event.src_path}")
|
||||
self.pending = True
|
||||
self.last_event = time.time()
|
||||
|
||||
|
||||
def write_status(handler):
    """Write a heartbeat snapshot (watcher + ingestion state) to STATUS_FILE."""
    # Build the snapshot under the lock so ingestion_state is copied atomically.
    with ingestion_lock:
        snapshot = {
            "running": True,
            "timestamp": time.time(),
            "pending": handler.pending,
            "last_event": handler.last_event,
            "ingestion": dict(ingestion_state),
        }
    with open(STATUS_FILE, 'w') as fh:
        json.dump(snapshot, fh)
|
||||
|
||||
|
||||
def main():
|
||||
logging.info("Aaron AI Watcher starting...")
|
||||
logging.info(f"Watching: {NEXTCLOUD_PATH}")
|
||||
@@ -112,23 +193,18 @@ def main():
|
||||
|
||||
try:
|
||||
while True:
|
||||
import json as _json
|
||||
_json.dump({
|
||||
"running": True,
|
||||
"timestamp": time.time(),
|
||||
"pending": handler.pending,
|
||||
"last_event": handler.last_event
|
||||
}, open(STATUS_FILE, 'w'))
|
||||
write_status(handler)
|
||||
if handler.pending:
|
||||
elapsed = time.time() - handler.last_event
|
||||
if elapsed >= DEBOUNCE_SECONDS:
|
||||
handler.pending = False
|
||||
run_ingestion()
|
||||
start_ingestion_thread()
|
||||
time.sleep(5)
|
||||
except KeyboardInterrupt:
|
||||
observer.stop()
|
||||
observer.join()
|
||||
logging.info("Watcher stopped.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user