From 93c0d8930852a79af31a936c635ad8161f85d055 Mon Sep 17 00:00:00 2001
From: Aaron Nelson <aaron@aaronnelson.studio>
Date: Mon, 4 May 2026 16:12:56 +0000
Subject: [PATCH] encoding.py: extend docx and pptx extractors to walk tables,
 headers/footers, text-boxes, group shapes, and notes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous extractors walked only top-level body paragraphs (docx) and
top-level shape.text (pptx). A diagnostic pass over the 17 non-PDF
"no_text" ingest failures revealed that 13 docx files in the failure
cohort have 100% of their content in tables (paras_with_text=0,
table_cells=6-108). These are syllabi, rosters, rubrics, and homework
worksheets structured as a single document-wide table — high-value
academic content the corpus was silently missing.

docx walker now covers:
- body paragraphs (existing)
- tables, including nested tables in cells (recursive helper)
- header and footer paragraphs per section (paragraphs only; tables
  inside headers/footers are not walked)
- text-box content in the document body via an element-tree search for
  w:txbxContent (no first-class API in python-docx; future-proofing —
  none of the current failure cohort has text-boxes)

pptx walker now covers:
- top-level shape text (existing)
- recursive descent into group shapes
- table cell text via shape.has_table / shape.table.iter_cells()
- speaker notes via slide.notes_slide.notes_text_frame.text

Out of scope: SmartArt diagrams, chart titles/labels, OLE objects,
content controls. None of the current failure cohort has these.

Recovery: 13 of 17 failures now ingest successfully. The 4 remaining are
image-only pptx files (Renders.pptx, Ribbon Cutting Slideshow.pptx, two
GH Slicer Notes variants — all PICTURE-shape decks with no text in any
walkable structure). They stay in ingest_failures unresolved, awaiting
OCR or path exclusion.

Side effect worth noting: the regression check on 4 known-good files
that were already producing embeddings showed all four gained content
under the new walker — a Mod03 pptx grew from 23,993 to 57,462 chars
(+33,469), Braskem Report docx grew 33,050 to 38,977 (+5,927), DDF MA
program docx grew 37,210 to 47,603 (+10,393), SUNY PIF GRANT pptx grew
22,259 to 23,546 (+1,287). These files have been in the corpus all
along with table or notes content silently dropped. They will surface
the additional content on next re-ingest, improving retrieval quality
for any future query that touches them.

Cleanup: ingest_file already calls resolve_ingest_failure on successful
ingest, so the 13 recovered files were marked resolved=TRUE during the
retry pass. No separate cleanup SQL was needed.
---
 scripts/encoding.py | 54 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/scripts/encoding.py b/scripts/encoding.py
index 1c0c7ff..3e7092c 100644
--- a/scripts/encoding.py
+++ b/scripts/encoding.py
@@ -25,6 +25,30 @@ DEFAULT_CHUNK_SIZE = 500
 DEFAULT_CHUNK_OVERLAP = 50
 
 
+def _docx_cell_paragraphs(cell):
+    """Yield the cell's non-blank paragraphs, then recurse into nested tables."""
+    yield from filter(lambda para: para.text.strip(), cell.paragraphs)
+    inner_cells = (c for tbl in cell.tables for r in tbl.rows for c in r.cells)
+    for inner in inner_cells:
+        yield from _docx_cell_paragraphs(inner)
+
+
+def _pptx_shape_text(shape):
+    """Return non-blank text from `shape`: own text, table cells, group members."""
+    # Groups are recognised by their `.shapes` collection instead of
+    # `shape.shape_type`: python-pptx raises NotImplementedError from that
+    # property for shapes it cannot classify, which would fail the whole file.
+    members = getattr(shape, "shapes", None)
+    if members is not None:
+        return [text for sub in members for text in _pptx_shape_text(sub)]
+    parts = []
+    if hasattr(shape, "text") and shape.text.strip():
+        parts.append(shape.text)
+    if getattr(shape, "has_table", False):
+        parts.extend(c.text for c in shape.table.iter_cells() if c.text.strip())
+    return parts
+
+
 def extract_text(filepath: Path) -> str:
     """Return the text of a supported file. Returns "" on any failure or
     unsupported extension. Does not write to ingest_failures — caller decides."""
@@ -32,7 +56,21 @@ def extract_text(filepath: Path) -> str:
     try:
         if suffix == ".docx":
             doc = DocxDocument(filepath)
-            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+            parts = [p.text for p in doc.paragraphs if p.text.strip()]
+            for tbl in doc.tables:
+                for row in tbl.rows:
+                    for cell in row.cells:
+                        parts.extend(p.text for p in _docx_cell_paragraphs(cell))
+            for section in doc.sections:
+                parts.extend(p.text for p in section.header.paragraphs if p.text.strip())
+                parts.extend(p.text for p in section.footer.paragraphs if p.text.strip())
+            from docx.oxml.ns import qn
+            for txbx in doc.element.body.findall(".//" + qn("w:txbxContent")):
+                for p in txbx.findall(".//" + qn("w:p")):
+                    text = "".join(t.text or "" for t in p.findall(".//" + qn("w:t")))
+                    if text.strip():
+                        parts.append(text)
+            return "\n".join(parts)
         elif suffix == ".pdf":
             reader = PdfReader(filepath)
             return "".join(
@@ -41,11 +79,15 @@ def extract_text(filepath: Path) -> str:
             )
         elif suffix == ".pptx":
             prs = Presentation(filepath)
-            return "\n".join(
-                shape.text for slide in prs.slides
-                for shape in slide.shapes
-                if hasattr(shape, "text") and shape.text.strip()
-            )
+            parts = []
+            for slide in prs.slides:
+                for shape in slide.shapes:
+                    parts.extend(_pptx_shape_text(shape))
+                if slide.has_notes_slide:
+                    notes = slide.notes_slide.notes_text_frame.text
+                    if notes.strip():
+                        parts.append(notes)
+            return "\n".join(parts)
         elif suffix in {".txt", ".md"}:
             return filepath.read_text(encoding="utf-8", errors="ignore")
     except Exception as e: