From 93c0d8930852a79af31a936c635ad8161f85d055 Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Mon, 4 May 2026 16:12:56 +0000 Subject: [PATCH] encoding.py: extend docx and pptx extractors to walk tables, headers/footers, text-boxes, group shapes, and notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous extractors walked only top-level body paragraphs (docx) and top-level shape.text (pptx). A diagnostic on the 17 non-PDF "no_text" ingest failures revealed that 13 docx files in the failure cohort have 100% of their content in tables (paras_with_text=0, table_cells=6-108). These are syllabi, rosters, rubrics, and homework worksheets structured as a single document-wide table — high-value academic content the corpus was silently missing. docx walker now covers: - body paragraphs (existing) - tables, including nested tables in cells (recursive helper) - header and footer paragraphs per section - text-box content in the document body via a findall() search for w:txbxContent elements (no first-class API in python-docx; future-proofing — none of the current failure cohort has text-boxes) pptx walker now covers: - top-level shape text (existing) - recursive descent into group shapes - table cell text via shape.has_table / shape.table.iter_cells() - speaker notes via slide.notes_slide.notes_text_frame.text Out of scope: SmartArt diagrams, chart titles/labels, OLE objects, content controls. None of the current failure cohort has these. Recovery: 13 of 17 failures now ingest successfully. The 4 remaining are image-only pptx files (Renders.pptx, Ribbon Cutting Slideshow.pptx, two GH Slicer Notes variants — all PICTURE-shape decks with no text in any walkable structure). They stay in ingest_failures unresolved, awaiting OCR or path exclusion. 
Side effect worth noting: the regression check on 4 known-good files that were already producing embeddings showed all four gained content under the new walker — a Mod03 pptx grew from 23,993 to 57,462 chars (+33,469), Braskem Report docx grew 33,050 to 38,977 (+5,927), DDF MA program docx grew 37,210 to 47,603 (+10,393), SUNY PIF GRANT pptx grew 22,259 to 23,546 (+1,287). These files have been in the corpus all along with table or notes content silently dropped. They will surface the additional content on next re-ingest, improving retrieval quality for any future query that touches them. Cleanup: ingest_file already calls resolve_ingest_failure on successful ingest, so the 13 recovered files were marked resolved=TRUE during the retry pass. No separate cleanup SQL was needed. --- scripts/encoding.py | 54 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/scripts/encoding.py b/scripts/encoding.py index 1c0c7ff..3e7092c 100644 --- a/scripts/encoding.py +++ b/scripts/encoding.py @@ -25,6 +25,30 @@ DEFAULT_CHUNK_SIZE = 500 DEFAULT_CHUNK_OVERLAP = 50 +def _docx_cell_paragraphs(cell): + yield from (p for p in cell.paragraphs if p.text.strip()) + for nested in cell.tables: + for row in nested.rows: + for c in row.cells: + yield from _docx_cell_paragraphs(c) + + +def _pptx_shape_text(shape): + from pptx.enum.shapes import MSO_SHAPE_TYPE + parts = [] + if shape.shape_type == MSO_SHAPE_TYPE.GROUP: + for sub in shape.shapes: + parts.extend(_pptx_shape_text(sub)) + return parts + if hasattr(shape, "text") and shape.text.strip(): + parts.append(shape.text) + if getattr(shape, "has_table", False): + for cell in shape.table.iter_cells(): + if cell.text.strip(): + parts.append(cell.text) + return parts + + def extract_text(filepath: Path) -> str: """Return the text of a supported file. Returns "" on any failure or unsupported extension. 
Does not write to ingest_failures — caller decides.""" @@ -32,7 +56,21 @@ def extract_text(filepath: Path) -> str: try: if suffix == ".docx": doc = DocxDocument(filepath) - return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) + parts = [p.text for p in doc.paragraphs if p.text.strip()] + for tbl in doc.tables: + for row in tbl.rows: + for cell in row.cells: + parts.extend(p.text for p in _docx_cell_paragraphs(cell)) + for section in doc.sections: + parts.extend(p.text for p in section.header.paragraphs if p.text.strip()) + parts.extend(p.text for p in section.footer.paragraphs if p.text.strip()) + from docx.oxml.ns import qn + for txbx in doc.element.body.findall(".//" + qn("w:txbxContent")): + for p in txbx.findall(".//" + qn("w:p")): + text = "".join(t.text or "" for t in p.findall(".//" + qn("w:t"))) + if text.strip(): + parts.append(text) + return "\n".join(parts) elif suffix == ".pdf": reader = PdfReader(filepath) return "".join( @@ -41,11 +79,15 @@ def extract_text(filepath: Path) -> str: ) elif suffix == ".pptx": prs = Presentation(filepath) - return "\n".join( - shape.text for slide in prs.slides - for shape in slide.shapes - if hasattr(shape, "text") and shape.text.strip() - ) + parts = [] + for slide in prs.slides: + for shape in slide.shapes: + parts.extend(_pptx_shape_text(shape)) + if slide.has_notes_slide: + notes = slide.notes_slide.notes_text_frame.text + if notes.strip(): + parts.append(notes) + return "\n".join(parts) elif suffix in {".txt", ".md"}: return filepath.read_text(encoding="utf-8", errors="ignore") except Exception as e: