# aaronAI/scripts/orientation_indexer.py

"""
Orientation Indexer — feeds Stage 2's document-level orientations into pgvector
so they're searchable alongside chunk text by the retrieve_documents tool.

Each completed row in stage_3_queue has an `orientation` string (active_frames
+ frame_relationships + extraction_orientation + one_sentence_summary) that
describes the document at a conceptual level. Indexing it as its own row in
the embeddings table gives the cross-encoder a second surface to rank against
— "what is this document about" rather than just "what does this chunk say."

This worker is part of the "read-only Graphiti + orientation-into-pgvector"
plan B that replaced the Stage 3 → Graphiti write path. The graph layer is
queried directly via the search_facts chat tool; orientations land here.

State tracking: a row is considered indexed if the embeddings table already
holds a row with source=<source> and metadata->>'kind'='orientation'. The
worker is idempotent — restart-safe, resumable.

Runs as systemd: aaronai-orientation-indexer.service
"""

import logging
import os
import sys
import time
from pathlib import Path

from dotenv import load_dotenv
import psycopg2
from sentence_transformers import SentenceTransformer

load_dotenv(Path.home() / "aaronai" / ".env", override=True)

sys.path.insert(0, str(Path(__file__).parent))
from encoding import write_embeddings_batch

# Postgres connection string read from the environment; None when the
# PG_DSN variable is not set.
PG_DSN = os.getenv("PG_DSN")
# Model name handed to SentenceTransformer() when the worker starts.
EMBED_MODEL = "all-MiniLM-L6-v2"
# Maximum number of stage_3_queue rows fetched and embedded per loop iteration.
BATCH_SIZE = 25
# Seconds to sleep when nothing is pending or after a failed iteration.
POLL_INTERVAL_SECS = 30
# File this worker's log records are appended to.
LOG_FILE = "/var/log/aaronai/orientation-indexer.log"
# File overwritten with the current Unix timestamp at the top of every loop
# iteration.
HEARTBEAT_FILE = "/var/log/aaronai/orientation-indexer-heartbeat"

# Route all log records to LOG_FILE (append mode); no console handler is
# installed. FileHandler opens the file when it is constructed, i.e. at import
# time, so the log directory must already exist and be writable.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [orientation-indexer] %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE, mode="a")],
)
# Module-wide logger used by every function below.
log = logging.getLogger("orientation-indexer")


def get_pg():
    """Open and return a new psycopg2 connection to the database at PG_DSN.

    The caller owns the returned connection and is responsible for closing it.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection


def fetch_unindexed(cur, limit):
    """Return up to ``limit`` (source, orientation) rows still awaiting indexing.

    A stage_3_queue row is pending when its orientation is non-null and the
    embeddings table holds no row with the same source whose metadata 'kind'
    is 'orientation'. Rows are returned in enqueued_at order.

    Args:
        cur: an open database cursor exposing ``execute`` and ``fetchall``.
        limit: maximum number of rows to return.

    Returns:
        Whatever ``cur.fetchall()`` yields for the query.
    """
    pending_sql = """
        SELECT s.source, s.orientation
        FROM stage_3_queue s
        WHERE s.orientation IS NOT NULL
          AND NOT EXISTS (
              SELECT 1 FROM embeddings e
              WHERE e.source = s.source
                AND e.metadata->>'kind' = 'orientation'
          )
        ORDER BY s.enqueued_at
        LIMIT %s
        """
    params = (limit,)
    cur.execute(pending_sql, params)
    pending_rows = cur.fetchall()
    return pending_rows


def _row_for(source: str, orientation: str, embedding) -> dict:
    """Build an embeddings row for the orientation. id is deterministic so
    re-runs don't create duplicates if the unique check above ever races."""
    import hashlib
    chunk_id = hashlib.md5(f"orientation:{source}".encode()).hexdigest()[:8] + "_orient"
    return {
        "id": chunk_id,
        "document": orientation,
        "embedding": embedding,
        "source": source,
        "type": "document",
        "metadata": {
            "source": source,
            "kind": "orientation",
        },
    }


def write_heartbeat():
    """Overwrite HEARTBEAT_FILE with the current Unix timestamp (best effort).

    A failed write must never stop the indexing loop, so filesystem errors are
    caught here. They are logged rather than silently discarded so that a
    stale heartbeat can be traced back to its cause (missing directory,
    permissions, full disk, ...).
    """
    try:
        Path(HEARTBEAT_FILE).write_text(str(time.time()))
    except OSError as exc:
        log.warning("Could not write heartbeat file %s: %s", HEARTBEAT_FILE, exc)


def _index_next_batch(embedder) -> int:
    """Embed and store one batch of pending orientations.

    Opens a fresh connection, fetches up to BATCH_SIZE unindexed
    (source, orientation) rows, encodes the orientation texts with
    ``embedder`` and hands the resulting rows to write_embeddings_batch.
    The connection is closed exactly once, whether or not an error occurs.

    Args:
        embedder: object whose ``encode(list_of_str)`` result has ``.tolist()``.

    Returns:
        Number of orientation rows passed to write_embeddings_batch; 0 when
        nothing was pending.
    """
    pg = get_pg()
    try:
        # The cursor is needed only for the read; release it as soon as the
        # rows have been fetched.
        with pg.cursor() as cur:
            rows = fetch_unindexed(cur, BATCH_SIZE)
        if not rows:
            return 0

        orientations = [orientation for _, orientation in rows]
        embeddings = embedder.encode(orientations).tolist()
        batch = [
            _row_for(source, orientation, embedding)
            for (source, orientation), embedding in zip(rows, embeddings)
        ]
        # NOTE(review): nothing here commits; this presumably relies on
        # write_embeddings_batch committing on the connection — confirm.
        write_embeddings_batch(pg, batch)
        return len(batch)
    finally:
        pg.close()


def main():
    """Run the orientation indexer loop forever.

    Loads the embedding model once, then repeatedly writes a heartbeat and
    indexes one batch. The loop continues immediately while batches are being
    written; it sleeps POLL_INTERVAL_SECS when nothing is pending or when an
    iteration fails. Failures are logged with their traceback and never
    terminate the loop.
    """
    log.info("Orientation indexer starting...")
    log.info(f"Loading embedding model: {EMBED_MODEL}")
    embedder = SentenceTransformer(EMBED_MODEL)
    log.info("Embedding model ready.")

    while True:
        write_heartbeat()
        try:
            indexed = _index_next_batch(embedder)
        except Exception as e:
            # Top-level boundary of the worker: record the full traceback and
            # back off before retrying instead of crashing the service.
            log.exception("Indexing loop iteration failed: %s", e)
            time.sleep(POLL_INTERVAL_SECS)
            continue

        if indexed:
            log.info(f"Indexed {indexed} orientation(s)")
        else:
            # Queue is drained; wait before polling again.
            time.sleep(POLL_INTERVAL_SECS)


if __name__ == "__main__":
    main()