""" Aaron AI Stage 1 encoding helpers — single canonical implementation of: - extract_blocks(filepath) — section-aware extraction (docx heading-bounded sections, pptx per-slide, pdf/txt/md single-block) - extract_text(filepath) — back-compat string concatenation over blocks - chunk_text(text, chunk_size, overlap) — word-based blind chunking - chunk_and_embed(text_or_blocks, source, embedder, filepath, folder) — produce ready-to-write rows. Accepts str (blind) or list[dict] (section-aware). - write_embeddings_batch(conn, batch) — server-side NOW() canonical INSERT Used by watcher.py, ingest.py, corpus_integrity.py, and api.py /api/corpus/retry. """ import hashlib import json import logging import re from pathlib import Path from docx import Document as DocxDocument from pypdf import PdfReader from pptx import Presentation log = logging.getLogger("encoding") SUPPORTED = {".docx", ".pdf", ".pptx", ".txt", ".md"} DEFAULT_CHUNK_SIZE = 500 DEFAULT_CHUNK_OVERLAP = 50 _BOLD_KV_RE = re.compile(r"^\*\*[\w +/-]+?:\*\*") def _strip_md_frontmatter(text: str) -> str: """Strip a leading frontmatter block from markdown, if present. Recognizes two formats: - YAML-style: file's first non-empty line is `---`, terminated by `---`. Only triggered when no heading precedes — guards against `---` horizontal rules that follow an H1. - Capture-style: optional H1 heading, then one or more `**key:** value` lines (and blanks), terminated by `---`. The H1 is preserved; the key/value block + separator are removed. Body `---` rules and body `**bold:**` lines are never touched — the scan aborts as soon as a non-frontmatter line appears in the leading block. """ lines = text.splitlines() n = len(lines) i = 0 while i < n and not lines[i].strip(): i += 1 heading = None if i < n and lines[i].startswith("# "): heading = lines[i] i += 1 while i < n and not lines[i].strip(): i += 1 if i >= n: return text first = lines[i].strip() if heading is None and first == "---": j = i + 1 while j < n and lines[j].strip() != "---": j += 1 if j >= n: return text body_start = j + 1 elif _BOLD_KV_RE.match(first): j = i while j < n: s = lines[j].strip() if not s or _BOLD_KV_RE.match(s): j += 1 continue if s == "---": body_start = j + 1 break return text else: return text else: return text body = "\n".join(lines[body_start:]).lstrip("\n") return f"{heading}\n\n{body}" if heading else body def _docx_cell_paragraphs(cell): yield from (p for p in cell.paragraphs if p.text.strip()) for nested in cell.tables: for row in nested.rows: for c in row.cells: yield from _docx_cell_paragraphs(c) def _pptx_shape_text(shape): from pptx.enum.shapes import MSO_SHAPE_TYPE parts = [] if shape.shape_type == MSO_SHAPE_TYPE.GROUP: for sub in shape.shapes: parts.extend(_pptx_shape_text(sub)) return parts if hasattr(shape, "text") and shape.text.strip(): parts.append(shape.text) if getattr(shape, "has_table", False): for cell in shape.table.iter_cells(): if cell.text.strip(): parts.append(cell.text) return parts def _extract_docx_blocks(filepath: Path) -> list[dict]: """Return docx content as a single block. Earlier attempt at section-aware chunking via Heading styles was rolled back: the user's docs are mostly Normal-styled with bold-as-heading, and tying chunk boundaries to formatting choices locks future-them into preserving those choices forever. Lexical + cross-encoder retrieval already finds the right substrings within a blind-chunked CV, so the section structure isn't load-bearing for retrieval.""" from docx.oxml.ns import qn doc = DocxDocument(filepath) parts = [p.text for p in doc.paragraphs if p.text.strip()] for tbl in doc.tables: for row in tbl.rows: for cell in row.cells: parts.extend(p.text for p in _docx_cell_paragraphs(cell)) for section in doc.sections: parts.extend(p.text for p in section.header.paragraphs if p.text.strip()) parts.extend(p.text for p in section.footer.paragraphs if p.text.strip()) for txbx in doc.element.body.findall(".//" + qn("w:txbxContent")): for p in txbx.findall(".//" + qn("w:p")): text = "".join(t.text or "" for t in p.findall(".//" + qn("w:t"))) if text.strip(): parts.append(text) text = "\n".join(parts) return [{"heading": None, "text": text, "kind": "doc"}] if text.strip() else [] def _extract_pptx_blocks(filepath: Path) -> list[dict]: """One block per slide. Heading = slide title (or 'Slide N' fallback). Body = non-title shape text + speaker notes.""" prs = Presentation(filepath) blocks = [] for i, slide in enumerate(prs.slides, 1): title_shape = None try: title_shape = slide.shapes.title except (AttributeError, KeyError): pass title = None body_parts = [] for shape in slide.shapes: if title_shape is not None and shape == title_shape and shape.has_text_frame: title = shape.text_frame.text.strip() or None continue body_parts.extend(_pptx_shape_text(shape)) if slide.has_notes_slide: notes = slide.notes_slide.notes_text_frame.text if notes.strip(): body_parts.append(f"[Notes] {notes}") if title or body_parts: blocks.append({ "heading": title or f"Slide {i}", "text": "\n".join(body_parts), "kind": "slide", }) return blocks def extract_blocks(filepath: Path) -> list[dict]: """Structured extraction. Returns list of {heading, text, kind} blocks. - docx: section-aware via Heading-style paragraphs (kind='section'). - pptx: one block per slide (kind='slide'). - pdf/txt/md: single block, no heading (kind='doc'). Empty list on any failure or unsupported extension.""" suffix = filepath.suffix.lower() try: if suffix == ".docx": return _extract_docx_blocks(filepath) if suffix == ".pptx": return _extract_pptx_blocks(filepath) if suffix == ".pdf": reader = PdfReader(filepath) text = "".join( page.extract_text() + "\n" for page in reader.pages if page.extract_text() ) return [{"heading": None, "text": text, "kind": "doc"}] if text.strip() else [] if suffix in {".txt", ".md"}: text = filepath.read_text(encoding="utf-8", errors="ignore") if suffix == ".md": text = _strip_md_frontmatter(text) return [{"heading": None, "text": text, "kind": "doc"}] if text.strip() else [] except Exception as e: log.warning(f"Extraction failed for {filepath.name}: {e}") return [] def extract_text(filepath: Path) -> str: """Back-compat wrapper: concatenate extract_blocks() output. Section structure is lost; use extract_blocks() directly for chunking.""" blocks = extract_blocks(filepath) parts = [] for b in blocks: if b.get("heading"): parts.append(b["heading"]) if b.get("text"): parts.append(b["text"]) return "\n".join(parts) def chunk_text(text: str, chunk_size: int = DEFAULT_CHUNK_SIZE, overlap: int = DEFAULT_CHUNK_OVERLAP) -> list[str]: """Word-based chunking. Empty chunks filtered.""" words = text.split() chunks = [] start = 0 while start < len(words): chunk = " ".join(words[start:start + chunk_size]) if chunk.strip(): chunks.append(chunk) start += chunk_size - overlap return chunks def _chunk_id(filepath, source: str, index: int) -> str: basis = str(filepath) if filepath else source return f"{hashlib.md5(basis.encode()).hexdigest()[:8]}_{index}" def chunk_and_embed(text_or_blocks, source: str, embedder, filepath=None, folder=None) -> list[dict]: """Chunk + embed for write_embeddings_batch. Accepts either: - str: blind chunking with 500-word windows (pdf/txt/md legacy path). - list[dict]: section-aware path (docx Heading-bounded sections, pptx slides). Each block emits one chunk if its text fits within DEFAULT_CHUNK_SIZE words, otherwise is blind-split with overlap. The block heading is prepended to the chunk text (so retrieval sees the section context) and stored in metadata as heading/kind.""" if isinstance(text_or_blocks, str): blocks = [{"heading": None, "text": text_or_blocks, "kind": "doc"}] else: blocks = text_or_blocks chunks = [] for block in blocks: body = block.get("text") or "" heading = block.get("heading") kind = block.get("kind", "doc") if not body.strip() and not (heading and heading.strip()): continue if heading and body.strip(): contextualized = f"{heading}\n\n{body}" elif heading: contextualized = heading else: contextualized = body if len(contextualized.split()) <= DEFAULT_CHUNK_SIZE: chunks.append((contextualized, heading, kind)) else: for sub in chunk_text(contextualized): chunks.append((sub, heading, kind)) if not chunks: return [] embeddings = embedder.encode([c[0] for c in chunks]).tolist() rows = [] for i, ((chunk, heading, kind), emb) in enumerate(zip(chunks, embeddings)): rows.append({ "id": _chunk_id(filepath, source, i), "document": chunk, "embedding": emb, "source": source, "type": "document", "metadata": { "source": source, "filepath": str(filepath) if filepath else source, "folder": folder, "heading": heading, "kind": kind, }, }) return rows def write_embeddings_batch(conn, batch: list[dict], commit: bool = True) -> int: """Single canonical INSERT. Sets created_at = NOW() server-side. Every row dict must supply 'type'. created_at is SQL-supplied (NOW()), so callers do not need to provide it. The application-layer assertion is the primary enforcement point for type — the column lacks NOT NULL because historical NULLs were resolved by the Improvement #2 backfill, and a Python-level raise gives a faster, more debuggable failure than a Postgres constraint error. When commit=True (default), this function commits the connection itself. When commit=False, the caller is responsible for committing. Use commit=False when composing this write with other writes that must land atomically in the same transaction. """ if not batch: return 0 cur = conn.cursor() for row in batch: if not row.get("type"): raise ValueError( f"row {row.get('id')!r} missing 'type'; writers must supply it " f"(see Improvement #2 in docs/birdai-component-inventory)" ) cur.execute(""" INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata) VALUES (%s, %s, %s::vector, %s, %s, NOW(), %s) ON CONFLICT (id) DO UPDATE SET document = EXCLUDED.document, embedding = EXCLUDED.embedding, source = EXCLUDED.source, type = EXCLUDED.type, created_at = COALESCE(embeddings.created_at, EXCLUDED.created_at), metadata = EXCLUDED.metadata """, (row["id"], row["document"], row["embedding"], row["source"], row["type"], json.dumps(row["metadata"]))) if commit: conn.commit() return len(batch)