encoding: per-slide pptx chunking + extract_blocks API; api: recency tiebreak

extract_blocks(filepath) is the new structured-extraction entry point, returning list[{heading, text, kind}]. chunk_and_embed accepts either str (blind-chunk back-compat) or list[dict] (one chunk per block, blind-split if oversize, heading prepended for retrieval context and stored in metadata).

- pptx: one block per slide. Slide title becomes the block heading; speaker notes fold into the body. Image-only decks with title-only slides now produce heading-only chunks instead of being recorded as extraction failures.
- docx: deliberately single-block (back-compat). Heading-style section detection was implemented and rolled back: hand-formatted CVs are Normal-styled with bold-as-heading, and tying chunk boundaries to formatting choices would lock the future user into preserving those choices forever. Lexical + cross-encoder retrieval already handles substring matching inside blind-chunked CVs.
- pdf/txt/md: unchanged (single block, blind chunking).

Recency tiebreak in retrieve_context: pull created_at into the SELECT and use it as a secondary sort key in _rerank, so memory/journal snapshots prefer the latest copy among near-duplicate content.

reindex_docx_pptx.py now accepts --ext=pptx,docx... so re-ingest can target a subset; the previous hardcoded delete regex would have wiped both even with a single-ext target.
2026-05-19 21:58:25 +00:00
parent 50b97e2998
commit 9955c7e383
5 changed files with 187 additions and 69 deletions
@@ -1,12 +1,14 @@
 """
 Aaron AI Stage 1 encoding helpers — single canonical implementation of:
-  - extract_text(filepath) — four-extension text extraction
-  - chunk_text(text, chunk_size, overlap) — word-based chunking
-  - chunk_and_embed(text, source, embedder, filepath, folder) — produce ready-to-write rows
+  - extract_blocks(filepath) — structured extraction (pptx per-slide;
+    docx/pdf/txt/md single-block)
+  - extract_text(filepath) — back-compat string concatenation over blocks
+  - chunk_text(text, chunk_size, overlap) — word-based blind chunking
+  - chunk_and_embed(text_or_blocks, source, embedder, filepath, folder) —
+    produce ready-to-write rows. Accepts str (blind) or list[dict] (section-aware).
  - write_embeddings_batch(conn, batch) — server-side NOW() canonical INSERT

 Used by watcher.py, ingest.py, corpus_integrity.py, and api.py /api/corpus/retry.
-Replaces four separate extract reimplementations and two extract-chunk-embed paths.
 """

 import hashlib
@@ -106,53 +108,106 @@ def _pptx_shape_text(shape):
    return parts


-def extract_text(filepath: Path) -> str:
-    """Return the text of a supported file. Returns "" on any failure or
-    unsupported extension. Does not write to ingest_failures — caller decides."""
+def _extract_docx_blocks(filepath: Path) -> list[dict]:
+    """Return docx content as a single block. Earlier attempt at section-aware
+    chunking via Heading styles was rolled back: the user's docs are mostly
+    Normal-styled with bold-as-heading, and tying chunk boundaries to formatting
+    choices locks future-them into preserving those choices forever. Lexical
+    + cross-encoder retrieval already finds the right substrings within a
+    blind-chunked CV, so the section structure isn't load-bearing for retrieval."""
+    from docx.oxml.ns import qn
+
+    doc = DocxDocument(filepath)
+    parts = [p.text for p in doc.paragraphs if p.text.strip()]
+    for tbl in doc.tables:
+        for row in tbl.rows:
+            for cell in row.cells:
+                parts.extend(p.text for p in _docx_cell_paragraphs(cell))
+    for section in doc.sections:
+        parts.extend(p.text for p in section.header.paragraphs if p.text.strip())
+        parts.extend(p.text for p in section.footer.paragraphs if p.text.strip())
+    for txbx in doc.element.body.findall(".//" + qn("w:txbxContent")):
+        for p in txbx.findall(".//" + qn("w:p")):
+            text = "".join(t.text or "" for t in p.findall(".//" + qn("w:t")))
+            if text.strip():
+                parts.append(text)
+    text = "\n".join(parts)
+    return [{"heading": None, "text": text, "kind": "doc"}] if text.strip() else []
+
+
+def _extract_pptx_blocks(filepath: Path) -> list[dict]:
+    """One block per slide. Heading = slide title (or 'Slide N' fallback).
+    Body = non-title shape text + speaker notes."""
+    prs = Presentation(filepath)
+    blocks = []
+    for i, slide in enumerate(prs.slides, 1):
+        title_shape = None
+        try:
+            title_shape = slide.shapes.title
+        except (AttributeError, KeyError):
+            pass
+        title = None
+        body_parts = []
+        for shape in slide.shapes:
+            if title_shape is not None and shape == title_shape and shape.has_text_frame:
+                title = shape.text_frame.text.strip() or None
+                continue
+            body_parts.extend(_pptx_shape_text(shape))
+        if slide.has_notes_slide:
+            notes = slide.notes_slide.notes_text_frame.text
+            if notes.strip():
+                body_parts.append(f"[Notes] {notes}")
+        if title or body_parts:
+            blocks.append({
+                "heading": title or f"Slide {i}",
+                "text": "\n".join(body_parts),
+                "kind": "slide",
+            })
+    return blocks
+
+
+def extract_blocks(filepath: Path) -> list[dict]:
+    """Structured extraction. Returns list of {heading, text, kind} blocks.
+
+    - docx: single block, no heading (kind='doc'); Heading-style sectioning was rolled back.
+    - pptx: one block per slide (kind='slide').
+    - pdf/txt/md: single block, no heading (kind='doc').
+
+    Empty list on any failure or unsupported extension."""
    suffix = filepath.suffix.lower()
    try:
        if suffix == ".docx":
-            doc = DocxDocument(filepath)
-            parts = [p.text for p in doc.paragraphs if p.text.strip()]
-            for tbl in doc.tables:
-                for row in tbl.rows:
-                    for cell in row.cells:
-                        parts.extend(p.text for p in _docx_cell_paragraphs(cell))
-            for section in doc.sections:
-                parts.extend(p.text for p in section.header.paragraphs if p.text.strip())
-                parts.extend(p.text for p in section.footer.paragraphs if p.text.strip())
-            from docx.oxml.ns import qn
-            for txbx in doc.element.body.findall(".//" + qn("w:txbxContent")):
-                for p in txbx.findall(".//" + qn("w:p")):
-                    text = "".join(t.text or "" for t in p.findall(".//" + qn("w:t")))
-                    if text.strip():
-                        parts.append(text)
-            return "\n".join(parts)
-        elif suffix == ".pdf":
+            return _extract_docx_blocks(filepath)
+        if suffix == ".pptx":
+            return _extract_pptx_blocks(filepath)
+        if suffix == ".pdf":
            reader = PdfReader(filepath)
-            return "".join(
+            text = "".join(
                page.extract_text() + "\n"
                for page in reader.pages if page.extract_text()
            )
-        elif suffix == ".pptx":
-            prs = Presentation(filepath)
-            parts = []
-            for slide in prs.slides:
-                for shape in slide.shapes:
-                    parts.extend(_pptx_shape_text(shape))
-                if slide.has_notes_slide:
-                    notes = slide.notes_slide.notes_text_frame.text
-                    if notes.strip():
-                        parts.append(notes)
-            return "\n".join(parts)
-        elif suffix in {".txt", ".md"}:
+            return [{"heading": None, "text": text, "kind": "doc"}] if text.strip() else []
+        if suffix in {".txt", ".md"}:
            text = filepath.read_text(encoding="utf-8", errors="ignore")
            if suffix == ".md":
-                return _strip_md_frontmatter(text)
-            return text
+                text = _strip_md_frontmatter(text)
+            return [{"heading": None, "text": text, "kind": "doc"}] if text.strip() else []
    except Exception as e:
-        log.warning(f"Text extraction failed for {filepath.name}: {e}")
-    return ""
+        log.warning(f"Extraction failed for {filepath.name}: {e}")
+    return []
+
+
+def extract_text(filepath: Path) -> str:
+    """Back-compat wrapper: concatenate extract_blocks() output. Section
+    structure is lost; use extract_blocks() directly for chunking."""
+    blocks = extract_blocks(filepath)
+    parts = []
+    for b in blocks:
+        if b.get("heading"):
+            parts.append(b["heading"])
+        if b.get("text"):
+            parts.append(b["text"])
+    return "\n".join(parts)


 def chunk_text(text: str,
@@ -175,18 +230,49 @@ def _chunk_id(filepath, source: str, index: int) -> str:
    return f"{hashlib.md5(basis.encode()).hexdigest()[:8]}_{index}"


-def chunk_and_embed(text: str,
+def chunk_and_embed(text_or_blocks,
                    source: str,
                    embedder,
                    filepath=None,
                    folder=None) -> list[dict]:
-    """Chunk text, embed each chunk, return rows ready for write_embeddings_batch."""
-    chunks = chunk_text(text)
+    """Chunk + embed for write_embeddings_batch. Accepts either:
+
+      - str: blind chunking with 500-word windows (pdf/txt/md legacy path).
+      - list[dict]: blocks from extract_blocks (one per pptx slide; a single
+        block for docx). Each block emits one chunk if heading + text fits in
+        DEFAULT_CHUNK_SIZE words, otherwise is blind-split with overlap.
+
+    The block heading is prepended to the chunk text (so retrieval sees the
+    section context) and stored in metadata as heading/kind."""
+    if isinstance(text_or_blocks, str):
+        blocks = [{"heading": None, "text": text_or_blocks, "kind": "doc"}]
+    else:
+        blocks = text_or_blocks
+
+    chunks = []
+    for block in blocks:
+        body = block.get("text") or ""
+        heading = block.get("heading")
+        kind = block.get("kind", "doc")
+        if not body.strip() and not (heading and heading.strip()):
+            continue
+        if heading and body.strip():
+            contextualized = f"{heading}\n\n{body}"
+        elif heading:
+            contextualized = heading
+        else:
+            contextualized = body
+        if len(contextualized.split()) <= DEFAULT_CHUNK_SIZE:
+            chunks.append((contextualized, heading, kind))
+        else:
+            for sub in chunk_text(contextualized):
+                chunks.append((sub, heading, kind))
+
    if not chunks:
        return []
-    embeddings = embedder.encode(chunks).tolist()
+    embeddings = embedder.encode([c[0] for c in chunks]).tolist()
    rows = []
-    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
+    for i, ((chunk, heading, kind), emb) in enumerate(zip(chunks, embeddings)):
        rows.append({
            "id": _chunk_id(filepath, source, i),
            "document": chunk,
@@ -197,6 +283,8 @@ def chunk_and_embed(text: str,
                "source": source,
                "filepath": str(filepath) if filepath else source,
                "folder": folder,
+                "heading": heading,
+                "kind": kind,
            },
        })
    return rows