# aaronAI/deprecated/ingest_chatgpt.py
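"""Ingest ChatGPT conversation exports into a Chroma vector store.

Reads conversations-*.json files from a ChatGPT data export, flattens each
conversation into overlapping word-level chunks, embeds the chunks with a
SentenceTransformer model, and upserts them into a persistent chromadb
collection.
"""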
import json
from datetime import datetime
from pathlib import Path

import chromadb
from sentence_transformers import SentenceTransformer

# Paths
DB_PATH = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

client = chromadb.PersistentClient(path=DB_PATH)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True},
)
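# "hnsw:space": "cosine" tells Chroma to rank neighbours by cosine distance;
# get_or_create_collection is idempotent, so re-running the script reuses the
# same collection rather than creating a duplicate.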

def extract_messages(convo):
    """Extract ordered user/assistant messages from a conversation."""
    mapping = convo.get("mapping", {})
    messages = []
    for node in mapping.values():
        msg = node.get("message")
        if not msg:
            continue
        role = msg.get("author", {}).get("role")
        if role not in ("user", "assistant"):
            continue
        content = msg.get("content", {})
        parts = content.get("parts", [])
        # Extract text parts only
        text = ""
        for part in parts:
            if isinstance(part, str):
                text += part
            elif isinstance(part, dict) and part.get("content_type") == "text":
                text += part.get("text", "")
        text = text.strip()
        if not text:
            continue
        create_time = msg.get("create_time") or 0
        messages.append((create_time, role, text))
    # Sort by timestamp
    messages.sort(key=lambda x: x[0])
    return messages
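# Illustrative shape of the export entries walked above; the field names come
# straight from the lookups in extract_messages(), the values are made up:
#
#   {"mapping": {"<node_id>": {"message": {
#       "author": {"role": "user"},
#       "content": {"content_type": "text", "parts": ["hello"]},
#       "create_time": 1700000000.0}}}}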

def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping text chunks."""
    # Build full conversation text
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")
    full_text = "\n".join(lines)
    # Split into word-level chunks with overlap
    words = full_text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        if chunk.strip():
            chunks.append(chunk)
        start += chunk_size - overlap
    return chunks
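# Worked example of the chunk arithmetic above: with chunk_size=600 and
# overlap=100 the window advances 500 words per step, so a 1,200-word
# conversation yields chunks covering words 0-599, 500-1099, and 1000-1199,
# each sharing 100 words with its neighbour.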

def ingest_file(json_path):
    print(f"\nLoading {json_path.name}...")
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Found {len(data)} conversations")
    total_chunks = 0
    skipped = 0
    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time") or 0
        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OSError):
            date_str = "unknown"
        messages = extract_messages(convo)
        if len(messages) < 2:
            skipped += 1
            continue
        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue
        # Embed and store
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation",
        } for _ in chunks]
        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas,
        )
        total_chunks += len(chunks)
        print(f"  [{i+1}/{len(data)}] {title[:60]} -> {len(chunks)} chunks ({date_str})")
    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks

def main():
    export_dir = Path(EXPORT_DIR)
    files = [
        export_dir / "conversations-000.json",
        export_dir / "conversations-001.json",
    ]
    grand_total = 0
    for f in files:
        if f.exists():
            grand_total += ingest_file(f)
        else:
            print(f"Not found: {f}")
    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {DB_PATH}")


if __name__ == "__main__":
    main()
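
# A minimal retrieval sketch against the collection built above, assuming the
# same embedding model ("my query" is a placeholder, not part of this script):
#
#   results = collection.query(
#       query_embeddings=embedder.encode(["my query"]).tolist(),
#       n_results=5,
#   )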