# Ingest ChatGPT conversation exports into a local ChromaDB collection:
# conversations are flattened, chunked with overlap, embedded with a
# sentence-transformers model, and upserted for retrieval.
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from sentence_transformers import SentenceTransformer
|
|
import chromadb
|
|
|
|
# Paths

# On-disk location of the persistent Chroma database (~/aaronai/db).
db_path = str(Path.home() / "aaronai" / "db")

# Directory holding the ChatGPT data-export JSON files to ingest.
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"
|
|
|
|
print("Loading embedding model...")

# Sentence-transformer used to embed every chunk before storage.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Persistent Chroma client; the collection uses cosine distance for its
# HNSW index so similarity search matches the normalized embeddings.
client = chromadb.PersistentClient(path=db_path)

collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"}
)
|
|
|
|
def extract_messages(convo):
    """Extract ordered user/assistant messages from a conversation.

    Walks the conversation's ``mapping`` nodes, keeps only non-empty
    textual user/assistant messages, and returns them sorted by their
    ``create_time`` as ``(create_time, role, text)`` tuples.
    """
    results = []

    for node in convo.get("mapping", {}).values():
        message = node.get("message")
        if not message:
            continue

        author_role = message.get("author", {}).get("role")
        if author_role not in ["user", "assistant"]:
            continue

        parts = message.get("content", {}).get("parts", [])

        # Keep only textual parts: plain strings, or dicts tagged as text.
        pieces = []
        for part in parts:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and part.get("content_type") == "text":
                pieces.append(part.get("text", ""))

        body = "".join(pieces).strip()
        if not body:
            continue

        timestamp = message.get("create_time") or 0
        results.append((timestamp, author_role, body))

    # Mapping nodes carry no guaranteed order; sort chronologically.
    return sorted(results, key=lambda item: item[0])
|
|
|
|
def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping word-level text chunks.

    Args:
        title: Conversation title, embedded in the transcript header.
        messages: Sequence of ``(create_time, role, text)`` tuples, as
            produced by ``extract_messages``.
        chunk_size: Maximum words per chunk; must exceed ``overlap``.
        overlap: Words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty only when the transcript has no words).

    Raises:
        ValueError: If ``overlap`` is negative or ``chunk_size <= overlap``
            (the window would never advance, looping forever).
    """
    # Guard: with chunk_size <= overlap the step below is <= 0 and the
    # original loop never terminated. Fail fast instead.
    if overlap < 0 or chunk_size <= overlap:
        raise ValueError("chunk_size must be positive and greater than overlap")

    # Build full conversation transcript with speaker labels.
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")

    full_text = "\n".join(lines)

    # Slide a chunk_size-word window forward by (chunk_size - overlap)
    # so consecutive chunks share `overlap` words of context.
    words = full_text.split()
    chunks = []
    start = 0
    step = chunk_size - overlap
    while start < len(words):
        chunk = " ".join(words[start:start + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
        start += step

    return chunks
|
|
|
|
def ingest_file(json_path):
    """Ingest one ChatGPT export JSON file into the Chroma collection.

    Args:
        json_path: ``Path`` to a ``conversations-*.json`` export file
            (a JSON array of conversation objects).

    Returns:
        Number of chunks embedded and upserted from this file.
    """
    print(f"\nLoading {json_path.name}...")
    # Context manager closes the handle; the original bare open() leaked it.
    with open(json_path, encoding="utf-8") as fh:
        data = json.load(fh)
    print(f"Found {len(data)} conversations")

    total_chunks = 0
    skipped = 0

    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)

        # Exports may carry None or out-of-range timestamps; narrow the
        # original bare except to what fromtimestamp actually raises.
        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OSError, OverflowError):
            date_str = "unknown"

        messages = extract_messages(convo)

        # Require at least one exchange to be worth indexing.
        if len(messages) < 2:
            skipped += 1
            continue

        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue

        # Embed and store
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation"
        } for _ in chunks]

        # Upsert (not add) so re-running the script is idempotent per id.
        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas
        )

        total_chunks += len(chunks)
        print(f" [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")

    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks
|
|
|
|
def main():
    """Ingest every known export file and report the grand total."""
    export_root = Path(EXPORT_DIR)
    candidates = [
        export_root / "conversations-000.json",
        export_root / "conversations-001.json"
    ]

    grand_total = 0
    for candidate in candidates:
        # Skip missing export files rather than crashing mid-run.
        if not candidate.exists():
            print(f"Not found: {candidate}")
            continue
        grand_total += ingest_file(candidate)

    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")
|
|
|
|
# Run the ingest only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|