chore: archive deprecated chromadb and migration scripts

This commit is contained in:
2026-04-28 00:15:46 +00:00
parent d5b5c2ec14
commit 037d747573
10 changed files with 486 additions and 11 deletions
+3
View File
@@ -565,6 +565,7 @@ async def get_status(auth: str = Depends(require_auth)):
# Watcher status
watcher_running = False
watcher_ingestion = {"status": "idle", "message": "", "file_count": 0}
last_indexed = "Unknown"
try:
import time as _time, json as _json
@@ -573,6 +574,7 @@ async def get_status(auth: str = Depends(require_auth)):
_s = _json.loads(_sp.read_text())
_age = _time.time() - _s.get("timestamp", 0)
watcher_running = _s.get("running", False) and _age < 30
watcher_ingestion = _s.get("ingestion", watcher_ingestion)
except:
pass
@@ -613,6 +615,7 @@ async def get_status(auth: str = Depends(require_auth)):
return JSONResponse({
"aaron_ai": "running",
"watcher": "running" if watcher_running else "stopped",
"watcher_ingestion": watcher_ingestion,
"chunk_count": chunk_count,
"file_count": file_count,
"last_indexed": last_indexed,
-250
View File
@@ -1,250 +0,0 @@
# --- Module setup: env, vector store, embedder, and Anthropic client ---
import os
import json
from pathlib import Path
from dotenv import load_dotenv
import chromadb
from sentence_transformers import SentenceTransformer
import anthropic
from datetime import datetime

# API keys and config live in the user's aaronai env file.
load_dotenv(Path.home() / "aaronai" / ".env")

# Persistent memory (markdown notes) and the ChromaDB store location.
memory_path = Path.home() / "aaronai" / "memory.md"
db_path = str(Path.home() / "aaronai" / "db")

print("Loading Aaron AI...")
# Embedder used for retrieval queries (must match the ingestion model).
embedder = SentenceTransformer("all-MiniLM-L6-v2")
chroma_client = chromadb.PersistentClient(path=db_path)
# Cosine-space collection holding the document corpus.
collection = chroma_client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"}
)
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

SYSTEM_PROMPT = """You are Aaron Nelson's personal AI assistant. Aaron is an Associate Professor
of Digital Design & Fabrication and Program Director of the Hudson Valley Additive Manufacturing
Center (HVAMC) at SUNY New Paltz. He is an expert in computational design, additive manufacturing,
and digital fabrication with deep fluency in Rhino, Grasshopper, Stratasys FDM, PolyJet, and metal
3D printing workflows. He runs a commercial venture called Mossygear and a consulting operation
called FWN3D. He has a background in graffiti lettering and vector illustration.
You have been provided with relevant excerpts from Aaron's own documents and his persistent memory.
Use this context to give answers grounded in his actual work and history. When helping him write
or create, match his voice and draw on his existing materials. Be direct and specific -
Aaron values precision over padding. Always cite which documents you drew from when relevant.
You have access to web search. Use it automatically when:
- Questions require current data (salaries, job postings, prices, news)
- Questions reference specific institutions, people, or organizations you need to verify
- Aaron's documents and memory don't contain sufficient information to answer well
Do not announce that you are searching. Just search and incorporate results naturally."""

# Sources pinned into context for professional/CV-style queries.
CV_SOURCES = ["Aaron Nelson CV 2024.pdf"]
# Rolling chat history; chat() trims it to the last 20 entries (10 exchanges).
conversation_history = []
# Anthropic server-side web-search tool definition.
TOOLS = [
    {
        "type": "web_search_20250305",
        "name": "web_search"
    }
]
def load_memory():
    """Return the persistent memory file's text, or "" when absent."""
    if not memory_path.exists():
        return ""
    return memory_path.read_text(encoding="utf-8")
def save_memory(content):
    """Overwrite the persistent memory file with *content*."""
    memory_path.write_text(content, encoding="utf-8")
def add_to_memory(new_item):
    """Append a dated bullet for *new_item* under the "## Notes" heading.

    Creates the heading on first use, then persists the updated memory.
    """
    current = load_memory()
    stamp = datetime.now().strftime("%Y-%m-%d")
    if "## Notes" not in current:
        current += "\n\n## Notes"
    current += f"\n- [{stamp}] {new_item}"
    save_memory(current)
def remove_from_memory(item):
    """Drop every memory line containing *item* (case-insensitive).

    Persists the filtered memory and returns how many lines were removed.
    """
    needle = item.lower()
    original_lines = load_memory().split("\n")
    kept = [line for line in original_lines if needle not in line.lower()]
    save_memory("\n".join(kept))
    return len(original_lines) - len(kept)
def get_pinned_cv_context():
    """Fetch every chunk of the pinned CV document from the collection.

    Uses CV_SOURCES[0] instead of repeating the filename literal, so the
    pinned source is defined in exactly one place.

    Returns:
        (documents, metadatas) lists as returned by the collection.
    """
    results = collection.get(
        where={"source": CV_SOURCES[0]},
        include=["documents", "metadatas"]
    )
    return results["documents"], results["metadatas"]
def is_professional_query(query):
    """Heuristic: does *query* touch professional/CV territory?

    Case-insensitive substring match against a fixed keyword list.
    """
    lowered = query.lower()
    professional_terms = (
        "grant", "publication", "exhibition", "award", "fellowship",
        "experience", "position", "job", "career", "cv", "resume",
        "research", "work history", "accomplishment", "teaching",
        "course", "client", "consultation", "presentation", "workshop",
        "education", "degree", "institution", "service", "committee",
    )
    for term in professional_terms:
        if term in lowered:
            return True
    return False
def retrieve_context(query, n_results=8):
    """Pull relevant corpus chunks for *query*.

    Vector-searches the collection; professional-sounding queries get the
    pinned CV chunks prepended. Returns parallel (context_pieces, sources).
    """
    vector = embedder.encode([query]).tolist()
    hits = collection.query(
        query_embeddings=vector,
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )
    pieces = []
    sources = []
    # CV chunks are pinned, not ranked, for professional queries.
    if is_professional_query(query):
        cv_docs, cv_metas = get_pinned_cv_context()
        for cv_doc, cv_meta in zip(cv_docs, cv_metas):
            pieces.append(f"[CV] {cv_doc}")
            sources.append(cv_meta["source"])
    # Keep ranked hits above the relevance floor; CV sources are skipped
    # here since the pinned path above already covers them.
    ranked = zip(hits["documents"][0], hits["metadatas"][0], hits["distances"][0])
    for doc, meta, dist in ranked:
        if (1 - dist) > 0.3 and meta["source"] not in CV_SOURCES:
            pieces.append(doc)
            sources.append(meta["source"])
    return pieces, sources
def handle_command(user_input):
    """Intercept built-in REPL commands.

    Supported: "show memory", "remember: <fact>", "forget: <text>", "clear".
    Returns True when the input was a command (already handled), False when
    it should be passed on to the model.
    """
    # FIX: strip first, then match AND slice the same string. The original
    # matched against user_input.strip().lower() but sliced the raw
    # user_input, so leading whitespace shifted the extracted payload
    # (e.g. "  remember: x" saved "r: x").
    trimmed = user_input.strip()
    lowered = trimmed.lower()
    if lowered == "show memory":
        memory = load_memory()
        print(f"\nAaron AI: Current memory:\n\n{memory}")
        return True
    if lowered.startswith("remember:"):
        item = trimmed[len("remember:"):].strip()
        add_to_memory(item)
        print(f"\nAaron AI: Saved to memory: '{item}'")
        return True
    if lowered.startswith("forget:"):
        item = trimmed[len("forget:"):].strip()
        removed = remove_from_memory(item)
        if removed:
            print(f"\nAaron AI: Removed {removed} line(s) containing '{item}' from memory.")
        else:
            print(f"\nAaron AI: Nothing found in memory containing '{item}'.")
        return True
    if lowered == "clear":
        conversation_history.clear()
        print("\nAaron AI: Conversation history cleared.")
        return True
    return False
def chat(user_message):
    """Send one user turn to the model and return (reply_text, sources).

    Prepends persistent memory and retrieved corpus context to the user
    message, then runs an agentic loop so server-side tool use (web
    search) completes before the final text response is extracted.
    """
    memory = load_memory()
    context_pieces, sources = retrieve_context(user_message)
    context_parts = []
    if memory:
        context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
    if context_pieces:
        context_str = "\n\n---\n\n".join(context_pieces)
        unique_sources = list(set(sources))
        context_parts.append(
            f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
        )
    # Context is prefixed to the raw user text for this turn only.
    context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
    full_message = context_block + user_message
    # Build messages for this turn
    messages = conversation_history + [{"role": "user", "content": full_message}]
    # Agentic loop to handle tool use
    while True:
        response = anthropic_client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            system=SYSTEM_PROMPT,
            tools=TOOLS,
            messages=messages
        )
        # Check if we need to handle tool calls
        if response.stop_reason == "tool_use":
            # Add assistant response to messages
            messages.append({"role": "assistant", "content": response.content})
            # Process each tool use block
            tool_results = []
            for block in response.content:
                if block.type == "tool_use":
                    # NOTE(review): web_search executes server-side; this
                    # placeholder result just acknowledges the tool call.
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": "Search completed"
                    })
            # Add tool results and continue
            messages.append({"role": "user", "content": tool_results})
        else:
            # Final response - extract text
            assistant_message = ""
            for block in response.content:
                if hasattr(block, "text"):
                    assistant_message += block.text
            # Update conversation history with clean versions
            conversation_history.append({"role": "user", "content": full_message})
            conversation_history.append({"role": "assistant", "content": assistant_message})
            # Cap history at 20 entries, dropping the oldest exchange pair.
            if len(conversation_history) > 20:
                conversation_history.pop(0)
                conversation_history.pop(0)
            return assistant_message, sources
def main():
    """Run the interactive Aaron AI console loop until 'quit' or Ctrl-C."""
    print("Aaron AI ready. Corpus, memory, and web search loaded.")
    print("Commands: 'remember: [fact]' | 'forget: [text]' | 'show memory' | 'clear' | 'quit'")
    print("=" * 60)
    while True:
        try:
            line = input("\nYou: ").strip()
            if not line:
                continue
            if line.strip().lower() == "quit":
                print("Goodbye.")
                break
            if handle_command(line):
                continue
            reply, cited = chat(line)
            print(f"\nAaron AI: {reply}")
            if cited:
                unique_sources = list(set(cited))
                print(f"\n[Sources: {', '.join(unique_sources)}]")
        except KeyboardInterrupt:
            print("\nGoodbye.")
            break
        except Exception as err:
            print(f"Error: {err}")


if __name__ == "__main__":
    main()
-1
View File
@@ -145,7 +145,6 @@ def ingest_folder(folder_path):
total_chunks += ingest_file(f)
print(f"\nDone. Total chunks indexed: {total_chunks}")
print(f"Database stored at: {db_path}")
if __name__ == "__main__":
target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs")
-152
View File
@@ -1,152 +0,0 @@
# --- ChatGPT export ingestion: setup ---
import json
import sys
from pathlib import Path
from datetime import datetime
from sentence_transformers import SentenceTransformer
import chromadb  # FIX: chromadb.PersistentClient is used below but was never imported (NameError at startup)
import psycopg2
import psycopg2.extras
import json as json_module
# NOTE(review): psycopg2 / json_module appear unused in this file —
# presumably leftovers from the pgvector migration; confirm before removal.

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Persistent vector store shared with the main assistant.
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
def extract_messages(convo):
    """Extract ordered user/assistant messages from a conversation.

    Walks the export's "mapping" nodes, keeps only user/assistant text
    parts, and returns (create_time, role, text) tuples sorted by time.
    """
    collected = []
    for node in convo.get("mapping", {}).values():
        msg = node.get("message")
        if not msg:
            continue
        author_role = msg.get("author", {}).get("role")
        if author_role not in ("user", "assistant"):
            continue
        # Parts may be plain strings or {"content_type": "text", ...} dicts.
        fragments = []
        for part in msg.get("content", {}).get("parts", []):
            if isinstance(part, str):
                fragments.append(part)
            elif isinstance(part, dict) and part.get("content_type") == "text":
                fragments.append(part.get("text", ""))
        body = "".join(fragments).strip()
        if not body:
            continue
        collected.append((msg.get("create_time") or 0, author_role, body))
    collected.sort(key=lambda entry: entry[0])
    return collected
def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Flatten a conversation into overlapping word-window chunks.

    *messages* is a list of (timestamp, role, text) tuples. The transcript
    is rendered with speaker labels, then split into windows of
    *chunk_size* words that share *overlap* words with their neighbour.
    """
    transcript = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        speaker = "Aaron" if role == "user" else "ChatGPT"
        transcript.append(f"{speaker}: {text}")
        transcript.append("")
    words = "\n".join(transcript).split()
    chunks = []
    begin = 0
    while begin < len(words):
        piece = " ".join(words[begin:begin + chunk_size])
        if piece.strip():
            chunks.append(piece)
        begin += chunk_size - overlap
    return chunks
def ingest_file(json_path):
    """Ingest one ChatGPT export file into the vector collection.

    Loads the conversations array, chunks each conversation, embeds the
    chunks, and upserts them with per-conversation metadata.

    Returns:
        int: number of chunks indexed from this file.
    """
    print(f"\nLoading {json_path.name}...")
    # FIX: context manager — json.load(open(...)) leaked the file handle.
    with open(json_path, encoding="utf-8") as fh:
        data = json.load(fh)
    print(f"Found {len(data)} conversations")
    total_chunks = 0
    skipped = 0
    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)
        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OSError, OverflowError):
            # create_time may be None or garbage in exports; narrowed from
            # the original bare except so real bugs aren't swallowed.
            date_str = "unknown"
        messages = extract_messages(convo)
        # Conversations with fewer than two messages carry no dialogue.
        if len(messages) < 2:
            skipped += 1
            continue
        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue
        # Embed and store
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation"
        } for _ in chunks]
        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas
        )
        total_chunks += len(chunks)
        # FIX: separator added — title and count were printed jammed together.
        print(f"  [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")
    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks
def main():
    """Ingest both ChatGPT export shards and report a grand total."""
    export_dir = Path(EXPORT_DIR)
    shard_names = ["conversations-000.json", "conversations-001.json"]
    grand_total = 0
    for shard in (export_dir / name for name in shard_names):
        if shard.exists():
            grand_total += ingest_file(shard)
        else:
            print(f"Not found: {shard}")
    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")


if __name__ == "__main__":
    main()
-189
View File
@@ -1,189 +0,0 @@
# --- Claude export ingestion: setup ---
import json
import sys
from pathlib import Path
from sentence_transformers import SentenceTransformer
import chromadb  # FIX: chromadb.PersistentClient is used below but was never imported (NameError at startup)
import psycopg2
import psycopg2.extras
import json as json_module

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Persistent vector store shared with the main assistant.
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
def extract_messages(convo):
    """Return (created_at, role, text) for each human/assistant message.

    Content may be a plain string or a list of text blocks; messages that
    reduce to empty text are dropped. Order is preserved as stored.
    """
    extracted = []
    for msg in convo.get("chat_messages", []):
        sender = msg.get("sender", "")
        if sender not in ("human", "assistant"):
            continue
        content = msg.get("content", [])
        if isinstance(content, str):
            body = content
        else:
            pieces = []
            if isinstance(content, list):
                for block in content:
                    if isinstance(block, dict) and block.get("type") == "text":
                        pieces.append(block.get("text", ""))
                    elif isinstance(block, str):
                        pieces.append(block)
            body = "".join(pieces)
        body = body.strip()
        if body:
            extracted.append((msg.get("created_at", ""), sender, body))
    return extracted
def chunk_conversation(convo):
    """Chunk one Claude conversation into overlapping message windows.

    Windows hold up to 3 labelled messages and overlap by one message.
    Returns a list of (chunk_id, chunk_text, metadata) triples; empty
    when the conversation has no usable messages.
    """
    title = convo.get("name", "Untitled conversation")
    uuid = convo.get("uuid", "")
    created_at = convo.get("created_at", "")
    messages = extract_messages(convo)
    chunks = []
    if not messages:
        return chunks
    last_index = len(messages) - 1
    window = []
    for i, (ts, role, text) in enumerate(messages):
        speaker = "You" if role == "human" else "Claude"
        window.append(f"{speaker}: {text}")
        # Flush the window every 3 messages, and always at the end.
        if len(window) < 3 and i != last_index:
            continue
        body = f"[Claude conversation: {title}]\n\n" + "\n\n".join(window)
        chunks.append((f"claude_{uuid}_{i}", body, {
            "source": f"Claude: {title}",
            "type": "claude_conversation",
            "created_at": created_at,
        }))
        # Carry the last message into the next window for context overlap.
        window = window[-1:]
    return chunks
def ingest_conversations(path):
    """Ingest a Claude conversations export (.json array or .jsonl).

    Dedupes against ids already present in the Chroma collection, embeds
    only the new chunks, and writes them to PostgreSQL.

    NOTE(review): get_pg() is not defined anywhere in this file, and the
    dedupe check reads from the Chroma `collection` while inserts go to
    Postgres — this looks like mid-migration code; verify before reuse.

    Returns:
        int: number of chunks inserted.
    """
    print(f"\nIngesting conversations from {path.name}...")
    conversations = []
    # Handle both .json (array) and .jsonl (one per line)
    raw = path.read_text(encoding="utf-8").strip()
    if raw.startswith("["):
        conversations = json.loads(raw)
    else:
        for line in raw.splitlines():
            line = line.strip()
            if line:
                try:
                    conversations.append(json.loads(line))
                except:  # noqa: E722 — best-effort: skip unparseable lines
                    continue
    print(f"Found {len(conversations)} conversations")
    total = 0
    skipped = 0
    for convo in conversations:
        chunks = chunk_conversation(convo)
        if not chunks:
            skipped += 1
            continue
        ids = [c[0] for c in chunks]
        texts = [c[1] for c in chunks]
        metas = [c[2] for c in chunks]
        # Skip chunk ids already stored in the collection.
        existing = collection.get(ids=ids)
        existing_ids = set(existing["ids"])
        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas)
               if id not in existing_ids]
        if not new:
            continue
        embeddings = embedder.encode([n[1] for n in new]).tolist()
        # A fresh Postgres connection per conversation (opened/closed below).
        pg = get_pg()
        cur = pg.cursor()
        for (chunk_id, chunk_text, meta), embedding in zip(new, embeddings):
            # Upsert keyed on id so re-runs are idempotent.
            cur.execute("""
                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
                VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
                ON CONFLICT (id) DO UPDATE SET
                    document = EXCLUDED.document,
                    embedding = EXCLUDED.embedding,
                    source = EXCLUDED.source,
                    type = EXCLUDED.type,
                    created_at = EXCLUDED.created_at,
                    metadata = EXCLUDED.metadata
            """, (
                chunk_id, chunk_text, embedding,
                meta.get('source'), meta.get('type'), meta.get('created_at'),
                json_module.dumps(meta)
            ))
        pg.commit()
        pg.close()
        total += len(new)
    print(f"Conversations: {total} chunks added, {skipped} skipped")
    return total
def ingest_memories(path):
    """Ingest Claude's exported memories as one consolidated chunk.

    Memories are already distilled, so they are joined into a single
    bullet list and upserted under a fixed id.

    Returns:
        int: 1 when a chunk was written, 0 when the export held no memories.
    """
    print(f"\nIngesting memories from {path.name}...")
    raw = json.loads(path.read_text(encoding="utf-8"))
    # Export may be a bare list or a {"memories": [...]} wrapper.
    memories = raw if isinstance(raw, list) else raw.get("memories", [])
    if not memories:
        print("No memories found")
        return 0
    # Combine all memories into one chunk — they're already distilled
    memory_text = "\n".join([
        f"- {m.get('content', m) if isinstance(m, dict) else m}"
        for m in memories
    ])
    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
    chunk_id = "claude_memories_consolidated"
    # FIX: upsert() already replaces an existing record by id, so the
    # original get()/delete() round-trip before it was redundant work.
    embedding = embedder.encode([chunk_text]).tolist()
    collection.upsert(
        ids=[chunk_id],
        documents=[chunk_text],
        metadatas=[{
            "source": "Claude: Memory",
            "type": "claude_memory",
        }],
        embeddings=embedding,
    )
    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
    return 1
# Run ingestion
export_dir = Path(EXPORT_DIR)
total = 0
# Conversation exports may be .json or .jsonl; glob catches both.
conv_files = list(export_dir.glob("conversations.*"))
for f in conv_files:
    total += ingest_conversations(f)
mem_files = list(export_dir.glob("memories.*"))
for f in mem_files:
    total += ingest_memories(f)
if total == 0:
    print("\nNo files found or no new chunks to add.")
else:
    print(f"\nTotal chunks added to corpus: {total}")
# Show updated corpus size
count = collection.count()
print(f"Corpus now contains {count} total chunks")
-125
View File
@@ -1,125 +0,0 @@
"""
Migration: ChromaDB → pgvector
Re-embeds all documents from ChromaDB SQLite into PostgreSQL with pgvector.
Keeps ChromaDB intact as backup until migration is verified.
"""
import sqlite3
import psycopg2
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer
CHROMA_SQLITE = str(Path.home() / "aaronai" / "db" / "chroma.sqlite3")
import os
PG_DSN = os.getenv("PG_DSN")
if not PG_DSN:
raise RuntimeError("PG_DSN environment variable not set")
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("Connecting to databases...")
chroma = sqlite3.connect(CHROMA_SQLITE)
chroma.row_factory = sqlite3.Row
c = chroma.cursor()
pg = psycopg2.connect(PG_DSN)
pg_cur = pg.cursor()
# Get all documents with their metadata from ChromaDB
print("Reading documents from ChromaDB...")
c.execute("""
SELECT
e.id as row_id,
e.embedding_id,
MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document,
MAX(CASE WHEN em.key = 'source' THEN em.string_value END) as source,
MAX(CASE WHEN em.key = 'type' THEN em.string_value END) as type,
MAX(CASE WHEN em.key = 'created_at' THEN em.string_value END) as created_at
FROM embeddings e
LEFT JOIN embedding_metadata em ON e.id = em.id
GROUP BY e.id, e.embedding_id
HAVING document IS NOT NULL
ORDER BY e.id
""")
rows = c.fetchall()
print(f"Found {len(rows)} documents to migrate")
# Check existing in PostgreSQL
pg_cur.execute("SELECT id FROM embeddings")
existing_ids = set(r[0] for r in pg_cur.fetchall())
print(f"Already in PostgreSQL: {len(existing_ids)}")
# Filter to only new ones
to_migrate = [r for r in rows if r['embedding_id'] not in existing_ids]
print(f"Need to migrate: {len(to_migrate)}")
if not to_migrate:
print("Nothing to migrate — already complete")
chroma.close()
pg.close()
exit(0)
# Migrate in batches
batch_size = 200
migrated = 0
errors = 0
for i in range(0, len(to_migrate), batch_size):
batch = to_migrate[i:i+batch_size]
# Generate embeddings
texts = [r['document'] for r in batch]
try:
embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
except Exception as e:
print(f"Embedding error at batch {i}: {e}")
errors += len(batch)
continue
# Insert into PostgreSQL
for row, embedding in zip(batch, embeddings):
try:
pg_cur.execute("""
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
ON CONFLICT (id) DO UPDATE SET
document = EXCLUDED.document,
embedding = EXCLUDED.embedding,
source = EXCLUDED.source,
type = EXCLUDED.type,
created_at = EXCLUDED.created_at,
metadata = EXCLUDED.metadata
""", (
row['embedding_id'],
row['document'],
embedding,
row['source'],
row['type'],
row['created_at'],
json.dumps({
'source': row['source'],
'type': row['type'],
'created_at': row['created_at'],
})
))
migrated += 1
except Exception as e:
print(f"Insert error for {row['embedding_id']}: {e}")
errors += 1
pg.commit()
print(f"Progress: {min(i+batch_size, len(to_migrate))}/{len(to_migrate)} ({errors} errors)")
# Final count
pg_cur.execute("SELECT COUNT(*) FROM embeddings")
final_count = pg_cur.fetchone()[0]
chroma.close()
pg.close()
print(f"\nMigration complete:")
print(f" Migrated: {migrated}")
print(f" Errors: {errors}")
print(f" PostgreSQL total: {final_count}")
+86 -10
View File
@@ -2,6 +2,7 @@ import time
import subprocess
import logging
import json
import threading
from pathlib import Path
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
@@ -25,16 +26,35 @@ logging.basicConfig(
]
)
# Shared ingestion status, snapshotted into the watcher status file by
# write_status(). Mutate only through set_ingestion_state() (holds the lock).
ingestion_state = {
    "status": "idle",        # "idle" | "ingesting" | "error"
    "message": "",           # human-readable summary of the last/current run
    "file_count": 0,         # files in the current/last ingestion batch
    "started_at": None,      # epoch seconds, set when a run begins
    "finished_at": None,     # epoch seconds, set when a run ends
    "last_error": "",        # tail of stderr / exception text on failure
}
ingestion_lock = threading.Lock()
ingestion_thread = None  # background ingestion worker, if any
def set_ingestion_state(**kwargs):
    """Thread-safely merge *kwargs* into the shared ingestion_state dict."""
    with ingestion_lock:
        for key, value in kwargs.items():
            ingestion_state[key] = value
def load_state():
    """Read the persisted mtime-state JSON; {} when the file is absent."""
    state_path = Path(STATE_FILE)
    if not state_path.exists():
        return {}
    with open(state_path) as fh:
        return json.load(fh)
def save_state(state):
    """Persist the mtime-state dict as JSON, overwriting STATE_FILE."""
    with open(STATE_FILE, 'w') as f:
        json.dump(state, f)
def get_changed_files():
state = load_state()
changed = []
@@ -52,13 +72,25 @@ def get_changed_files():
changed.append(path)
return changed, state
def run_ingestion():
changed, state = get_changed_files()
if not changed:
logging.info("No new or changed files detected — skipping ingestion.")
set_ingestion_state(status="idle", message="No changes detected", file_count=0)
return
logging.info(f"Found {len(changed)} new or changed files — starting ingestion...")
count = len(changed)
logging.info(f"Found {count} new or changed files — starting ingestion...")
set_ingestion_state(
status="ingesting",
message=f"Ingesting {count} file(s)...",
file_count=count,
started_at=time.time(),
finished_at=None,
last_error="",
)
try:
result = subprocess.run(
[PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH],
@@ -67,19 +99,51 @@ def run_ingestion():
timeout=1800
)
if result.returncode == 0:
# Update state with new mtimes
root = Path(NEXTCLOUD_PATH)
for path in root.rglob("*"):
if path.is_file() and path.suffix.lower() in SUPPORTED:
state[str(path)] = str(path.stat().st_mtime)
save_state(state)
logging.info("Ingestion complete. State updated.")
set_ingestion_state(
status="idle",
message=f"Last run: ingested {count} file(s) successfully",
finished_at=time.time(),
)
else:
logging.error(f"Ingestion error: {result.stderr}")
set_ingestion_state(
status="error",
message="Ingestion failed — see log",
last_error=result.stderr[-300:],
finished_at=time.time(),
)
except subprocess.TimeoutExpired:
logging.error("Ingestion timed out.")
set_ingestion_state(
status="error",
message="Ingestion timed out (>30 min)",
last_error="TimeoutExpired",
finished_at=time.time(),
)
except Exception as e:
logging.error(f"Ingestion failed: {e}")
set_ingestion_state(
status="error",
message=f"Ingestion exception: {e}",
last_error=str(e),
finished_at=time.time(),
)
def start_ingestion_thread():
    """Run run_ingestion on a daemon thread, unless one is already active."""
    global ingestion_thread
    already_running = ingestion_thread and ingestion_thread.is_alive()
    if already_running:
        logging.info("Ingestion already running — skipping.")
        return
    worker = threading.Thread(target=run_ingestion, daemon=True)
    ingestion_thread = worker
    worker.start()
class IngestHandler(FileSystemEventHandler):
def __init__(self):
@@ -98,9 +162,26 @@ class IngestHandler(FileSystemEventHandler):
return
if 'Journal/Media' in str(path):
return
if event.event_type not in ('modified', 'created', 'moved'):
return
logging.info(f"Event: {event.event_type} {event.src_path}")
self.pending = True
self.last_event = time.time()
def write_status(handler):
    """Snapshot watcher + ingestion status to STATUS_FILE as JSON.

    Holds ingestion_lock while building (and writing) the snapshot so the
    copied ingestion_state is internally consistent.
    """
    with ingestion_lock:
        snapshot = {
            "running": True,
            "timestamp": time.time(),
            "pending": handler.pending,
            "last_event": handler.last_event,
            "ingestion": dict(ingestion_state),
        }
        with open(STATUS_FILE, 'w') as status_file:
            json.dump(snapshot, status_file)
def main():
logging.info("Aaron AI Watcher starting...")
logging.info(f"Watching: {NEXTCLOUD_PATH}")
@@ -112,23 +193,18 @@ def main():
try:
while True:
import json as _json
_json.dump({
"running": True,
"timestamp": time.time(),
"pending": handler.pending,
"last_event": handler.last_event
}, open(STATUS_FILE, 'w'))
write_status(handler)
if handler.pending:
elapsed = time.time() - handler.last_event
if elapsed >= DEBOUNCE_SECONDS:
handler.pending = False
run_ingestion()
start_ingestion_thread()
time.sleep(5)
except KeyboardInterrupt:
observer.stop()
observer.join()
logging.info("Watcher stopped.")
if __name__ == "__main__":
main()