Initial commit - Aaron AI v1
@@ -0,0 +1,150 @@
import json
from pathlib import Path
from datetime import datetime

from sentence_transformers import SentenceTransformer
import chromadb

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"},
)
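
# Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings, and
# "hnsw:space": "cosine" tells Chroma's HNSW index to rank by cosine distance,
# the measure these embeddings are typically compared with.
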
def extract_messages(convo):
    """Extract ordered user/assistant messages from a conversation."""
    mapping = convo.get("mapping", {})
    messages = []

    for node in mapping.values():
        msg = node.get("message")
        if not msg:
            continue

        role = msg.get("author", {}).get("role")
        if role not in ["user", "assistant"]:
            continue

        content = msg.get("content", {})
        # "parts" may be missing or None for non-text content types
        parts = content.get("parts") or []

        # Keep text parts only; skip images, tool calls, and other payloads
        text = ""
        for part in parts:
            if isinstance(part, str):
                text += part
            elif isinstance(part, dict) and part.get("content_type") == "text":
                text += part.get("text", "")

        text = text.strip()
        if not text:
            continue

        create_time = msg.get("create_time") or 0
        messages.append((create_time, role, text))

    # Sort by timestamp so the transcript reads chronologically
    messages.sort(key=lambda x: x[0])
    return messages
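
# For reference, a node in the export's "mapping" dict looks roughly like
# (format details vary between export versions, hence the defensive .get() calls):
#   {"message": {"author": {"role": "user"},
#                "content": {"content_type": "text", "parts": ["Hello!"]},
#                "create_time": 1700000000.0},
#    "parent": "<node-id>", "children": ["<node-id>", ...]}
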
def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping text chunks."""
    # Build the full conversation transcript
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")

    full_text = "\n".join(lines)

    # Split into word-level chunks with overlap
    words = full_text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        if chunk.strip():
            chunks.append(chunk)
        start += chunk_size - overlap

    return chunks
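
# With the defaults, the window advances chunk_size - overlap = 500 words per
# step, so consecutive chunks share 100 words of context. For example, a
# 1,200-word transcript yields three chunks: words 0-599, 500-1099, 1000-1199.
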
def ingest_file(json_path):
    print(f"\nLoading {json_path.name}...")
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Found {len(data)} conversations")

    total_chunks = 0
    skipped = 0

    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)

        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OSError):
            date_str = "unknown"

        messages = extract_messages(convo)
        if len(messages) < 2:
            skipped += 1
            continue

        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue

        # Embed and store
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation",
        } for _ in chunks]

        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas,
        )

        total_chunks += len(chunks)
        print(f" [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")

    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks
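
# Stable, deterministic IDs (chatgpt_<convo_id>_<chunk index>) combined with
# upsert make re-running the ingest over the same export idempotent: existing
# chunks are overwritten in place rather than duplicated.
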
def main():
    export_dir = Path(EXPORT_DIR)
    files = [
        export_dir / "conversations-000.json",
        export_dir / "conversations-001.json",
    ]

    grand_total = 0
    for f in files:
        if f.exists():
            grand_total += ingest_file(f)
        else:
            print(f"Not found: {f}")

    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")


if __name__ == "__main__":
    main()
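
# A minimal sketch of querying the corpus this script builds (assumed usage,
# not part of this script): reopen the same collection, embed the question
# with the same model, and let Chroma rank chunks by cosine distance.
#
#   results = collection.query(
#       query_embeddings=embedder.encode(["what did we discuss about backups?"]).tolist(),
#       n_results=5,
#   )
#   for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
#       print(meta["source"], meta["date"], "->", doc[:80])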