# aaronAI/scripts/encoding.py

"""
Aaron AI Stage 1 encoding helpers — single canonical implementation of:
  - extract_text(filepath) — text extraction for .docx, .pdf, .pptx, .txt, .md
  - chunk_text(text, chunk_size, overlap) — word-based chunking
  - chunk_and_embed(text, source, embedder, filepath, folder) — produce ready-to-write rows
  - write_embeddings_batch(conn, batch) — server-side NOW() canonical INSERT

Used by watcher.py, ingest.py, corpus_integrity.py, and api.py /api/corpus/retry.
Replaces four separate extract reimplementations and two extract-chunk-embed paths.
"""

import hashlib
import json
import logging
import re
from pathlib import Path

from docx import Document as DocxDocument
from pypdf import PdfReader
from pptx import Presentation

# Module-level logger; extract_text() reports extraction failures through it.
log = logging.getLogger("encoding")

# File extensions that extract_text() has a branch for. Not referenced
# elsewhere in this module — presumably consumed by callers to filter files.
SUPPORTED = {".docx", ".pdf", ".pptx", ".txt", ".md"}
# Defaults for chunk_text(), both counted in whitespace-delimited words.
DEFAULT_CHUNK_SIZE = 500
DEFAULT_CHUNK_OVERLAP = 50

# Matches a line that starts with a bold `**key:**` marker, where the key is
# made of word characters, spaces, "+", "/" or "-". Used by
# _strip_md_frontmatter() to recognize capture-style frontmatter lines.
_BOLD_KV_RE = re.compile(r"^\*\*[\w +/-]+?:\*\*")


def _strip_md_frontmatter(text: str) -> str:
    """Remove a leading frontmatter block from markdown text, if one exists.

    Two layouts are recognized:
      - YAML-style: the first non-blank line is `---` and a later line is
        `---` as well. Only considered when no H1 heading comes first, so a
        `---` horizontal rule under a title is left alone.
      - Capture-style: an optional `# ` heading, then a run of
        `**key:** value` lines (blank lines allowed) closed by `---`. The
        heading is kept; the key/value run and the `---` are dropped.

    Anything else — including an unterminated block or a leading block that
    contains some other kind of line — returns `text` unchanged.
    """
    rows = text.splitlines()
    total = len(rows)

    def skip_blank(pos: int) -> int:
        # Advance past whitespace-only lines starting at `pos`.
        while pos < total and not rows[pos].strip():
            pos += 1
        return pos

    cursor = skip_blank(0)
    title = None
    if cursor < total and rows[cursor].startswith("# "):
        title = rows[cursor]
        cursor = skip_blank(cursor + 1)
    if cursor >= total:
        return text

    opener = rows[cursor].strip()
    if title is None and opener == "---":
        # YAML-style: look for the closing delimiter after the opening one.
        closing = next(
            (k for k in range(cursor + 1, total) if rows[k].strip() == "---"),
            None,
        )
        if closing is None:
            return text
    elif _BOLD_KV_RE.match(opener):
        # Capture-style: only blanks and **key:** lines may precede the `---`.
        closing = None
        for k in range(cursor, total):
            stripped = rows[k].strip()
            if stripped == "---":
                closing = k
                break
            if stripped and not _BOLD_KV_RE.match(stripped):
                return text
        if closing is None:
            return text
    else:
        return text

    remainder = "\n".join(rows[closing + 1:]).lstrip("\n")
    if title:
        return f"{title}\n\n{remainder}"
    return remainder


def _docx_cell_paragraphs(cell):
    """Yield the non-blank paragraphs of a table cell, then those of every
    cell in its nested tables (recursively, in document order)."""
    for para in cell.paragraphs:
        if para.text.strip():
            yield para
    for inner_table in cell.tables:
        for inner_row in inner_table.rows:
            for inner_cell in inner_row.cells:
                yield from _docx_cell_paragraphs(inner_cell)


def _pptx_shape_text(shape):
    """Return the text strings found in a slide shape as a list.

    Group shapes are expanded recursively into their member shapes. For any
    other shape, its own non-blank `text` (when it has that attribute) comes
    first, followed by the non-blank text of each table cell when the shape
    reports `has_table`.
    """
    from pptx.enum.shapes import MSO_SHAPE_TYPE

    collected = []
    if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        for child in shape.shapes:
            collected += _pptx_shape_text(child)
        return collected

    if hasattr(shape, "text"):
        own_text = shape.text
        if own_text.strip():
            collected.append(own_text)
    if getattr(shape, "has_table", False):
        collected.extend(
            cell.text for cell in shape.table.iter_cells() if cell.text.strip()
        )
    return collected


def _extract_docx(filepath: Path) -> str:
    """Return newline-joined text from a .docx: body paragraphs, table cells
    (including nested tables), section headers/footers, and text boxes."""
    from docx.oxml.ns import qn

    doc = DocxDocument(filepath)
    parts = [p.text for p in doc.paragraphs if p.text.strip()]
    for tbl in doc.tables:
        for row in tbl.rows:
            for cell in row.cells:
                parts.extend(p.text for p in _docx_cell_paragraphs(cell))
    for section in doc.sections:
        parts.extend(p.text for p in section.header.paragraphs if p.text.strip())
        parts.extend(p.text for p in section.footer.paragraphs if p.text.strip())
    # Text-box content is collected by walking the body XML for
    # w:txbxContent elements and concatenating the w:t runs of each paragraph.
    txbx_tag, para_tag, run_text_tag = qn("w:txbxContent"), qn("w:p"), qn("w:t")
    for txbx in doc.element.body.findall(".//" + txbx_tag):
        for p in txbx.findall(".//" + para_tag):
            text = "".join(t.text or "" for t in p.findall(".//" + run_text_tag))
            if text.strip():
                parts.append(text)
    return "\n".join(parts)


def _extract_pdf(filepath: Path) -> str:
    """Return the text of every PDF page that yields any, each followed by a newline."""
    reader = PdfReader(filepath)
    # Run the per-page extraction once and reuse the result for both the
    # emptiness check and the output.
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)


def _extract_pptx(filepath: Path) -> str:
    """Return newline-joined text from slide shapes and speaker notes of a .pptx."""
    prs = Presentation(filepath)
    parts = []
    for slide in prs.slides:
        for shape in slide.shapes:
            parts.extend(_pptx_shape_text(shape))
        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; treat that as "no notes" instead of letting an
            # AttributeError discard the whole presentation.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes.strip():
                parts.append(notes)
    return "\n".join(parts)


def extract_text(filepath: Path) -> str:
    """Return the text of a supported file. Returns "" on any failure or
    unsupported extension. Does not write to ingest_failures — caller decides.

    Args:
        filepath: Path to the file (a plain string path is accepted too). The
            format is chosen from the lower-cased suffix: .docx, .pdf, .pptx,
            .txt or .md.

    Returns:
        The extracted text. Markdown has a leading frontmatter block removed.
        Text files are decoded as UTF-8 with undecodable bytes dropped and a
        leading byte-order mark stripped.
    """
    filepath = Path(filepath)
    suffix = filepath.suffix.lower()
    try:
        if suffix == ".docx":
            return _extract_docx(filepath)
        if suffix == ".pdf":
            return _extract_pdf(filepath)
        if suffix == ".pptx":
            return _extract_pptx(filepath)
        if suffix in {".txt", ".md"}:
            # utf-8-sig decodes exactly like utf-8 but drops a leading BOM,
            # which would otherwise stick to the first word and hide a
            # frontmatter `---` on the first line from the markdown stripper.
            text = filepath.read_text(encoding="utf-8-sig", errors="ignore")
            if suffix == ".md":
                return _strip_md_frontmatter(text)
            return text
    except Exception as e:
        # Deliberate best-effort: any extraction error is logged and mapped to "".
        log.warning(f"Text extraction failed for {filepath.name}: {e}")
    return ""


def chunk_text(text: str,
               chunk_size: int = DEFAULT_CHUNK_SIZE,
               overlap: int = DEFAULT_CHUNK_OVERLAP) -> list[str]:
    """Word-based chunking with overlap.

    The text is split on whitespace and cut into windows of up to
    `chunk_size` words; each window starts `chunk_size - overlap` words after
    the previous one. Words inside a chunk are joined with single spaces.

    Note: the last window can lie entirely within the previous chunk's tail
    (e.g. exactly 500 words with the defaults yields a second chunk holding
    only the final 50 words). That behavior is unchanged here.

    Args:
        text: Text to chunk. Empty or whitespace-only text yields [].
        chunk_size: Maximum words per chunk; must be at least 1.
        overlap: Words shared between consecutive chunks; must satisfy
            0 <= overlap < chunk_size.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range. Without this
            check an overlap >= chunk_size gives a non-positive stride and the
            windowing would never terminate.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )
    words = text.split()
    stride = chunk_size - overlap
    # Every window starts inside `words`, so each chunk holds at least one
    # word and no empty-chunk filtering is needed.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), stride)
    ]


def _chunk_id(filepath, source: str, index: int) -> str:
    """Build a chunk id of the form `<8 hex chars>_<index>`.

    The hex prefix is the start of the MD5 digest of `str(filepath)` when
    `filepath` is truthy, otherwise of `source`.
    """
    if filepath:
        basis = str(filepath)
    else:
        basis = source
    prefix = hashlib.md5(basis.encode()).hexdigest()[:8]
    return f"{prefix}_{index}"


def chunk_and_embed(text: str,
                    source: str,
                    embedder,
                    filepath=None,
                    folder=None) -> list[dict]:
    """Chunk `text`, embed all chunks in one `embedder.encode` call, and
    return one row dict per chunk, ready for write_embeddings_batch.

    Returns [] when the text produces no chunks (the embedder is not called).
    Each row carries id, document, embedding, source, type ("document") and a
    metadata dict with source, filepath (falls back to `source` when
    `filepath` is falsy) and folder.
    """
    pieces = chunk_text(text)
    if not pieces:
        return []
    # assumes encode() returns an array-like exposing .tolist() — TODO confirm
    vectors = embedder.encode(pieces).tolist()
    origin = str(filepath) if filepath else source
    return [
        {
            "id": _chunk_id(filepath, source, position),
            "document": piece,
            "embedding": vector,
            "source": source,
            "type": "document",
            "metadata": {
                "source": source,
                "filepath": origin,
                "folder": folder,
            },
        }
        for position, (piece, vector) in enumerate(zip(pieces, vectors))
    ]


def write_embeddings_batch(conn, batch: list[dict]) -> int:
    """Single canonical INSERT. Sets created_at = NOW() server-side. Commits.

    Every row dict must supply 'type'. created_at is SQL-supplied (NOW()), so
    callers do not need to provide it. The application-layer assertion is the
    primary enforcement point for type — the column lacks NOT NULL because
    historical NULLs were resolved by the Improvement #2 backfill, and a
    Python-level raise gives a faster, more debuggable failure than a
    Postgres constraint error.

    Args:
        conn: DB-API connection; must provide cursor(), commit() and rollback().
        batch: Row dicts with keys id, document, embedding, source, type and
            metadata (metadata is serialized with json.dumps).

    Returns:
        Number of rows written (len(batch)); 0 for an empty batch, in which
        case the connection is not touched.

    Raises:
        ValueError: If any row has a missing or empty 'type'. Raised before
            any statement is executed, so nothing from the batch is pending.
        Exception: Any database error is re-raised after the transaction has
            been rolled back.
    """
    if not batch:
        return 0
    # Validate the whole batch up front: checking inside the write loop would
    # leave earlier rows executed-but-uncommitted on the connection when a
    # later row turns out to be invalid.
    for row in batch:
        if not row.get("type"):
            raise ValueError(
                f"row {row.get('id')!r} missing 'type'; writers must supply it "
                f"(see Improvement #2 in docs/birdai-component-inventory)"
            )
    # On conflict the existing created_at is kept (COALESCE) so re-ingesting
    # a chunk does not reset its original timestamp.
    sql = """
            INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
            VALUES (%s, %s, %s::vector, %s, %s, NOW(), %s)
            ON CONFLICT (id) DO UPDATE SET
                document   = EXCLUDED.document,
                embedding  = EXCLUDED.embedding,
                source     = EXCLUDED.source,
                type       = EXCLUDED.type,
                created_at = COALESCE(embeddings.created_at, EXCLUDED.created_at),
                metadata   = EXCLUDED.metadata
        """
    cur = conn.cursor()
    try:
        for row in batch:
            cur.execute(sql, (row["id"], row["document"], row["embedding"],
                              row["source"], row["type"], json.dumps(row["metadata"])))
        conn.commit()
    except Exception:
        # Leave the connection usable for the caller's next operation instead
        # of stuck in a failed / half-written transaction.
        conn.rollback()
        raise
    finally:
        cur.close()
    return len(batch)