diff --git a/scripts/encoding.py b/scripts/encoding.py
index 1c0c7ff..3e7092c 100644
--- a/scripts/encoding.py
+++ b/scripts/encoding.py
@@ -25,6 +25,58 @@ DEFAULT_CHUNK_SIZE = 500
 DEFAULT_CHUNK_OVERLAP = 50
 
 
+# Clark-notation tag of the markup-compatibility <mc:Fallback> element.
+_MC_FALLBACK = "{http://schemas.openxmlformats.org/markup-compatibility/2006}Fallback"
+
+
+def _docx_unique_cells(table):
+    """Yield every distinct cell of ``table`` exactly once.
+
+    python-docx reports a merged cell once per grid position it spans, so
+    walking ``row.cells`` directly would repeat the cell's text.
+    """
+    seen = set()
+    for row in table.rows:
+        for cell in row.cells:
+            tc = cell._tc  # <w:tc> element shared by every position of a merge
+            if tc not in seen:
+                seen.add(tc)
+                yield cell
+
+
+def _docx_cell_paragraphs(cell):
+    """Yield the non-blank paragraphs of ``cell``, then (depth-first) those of
+    every table nested inside it."""
+    yield from (p for p in cell.paragraphs if p.text.strip())
+    for nested in cell.tables:
+        for inner in _docx_unique_cells(nested):
+            yield from _docx_cell_paragraphs(inner)
+
+
+def _pptx_shape_text(shape):
+    """Return the non-blank text fragments of ``shape`` as a list.
+
+    Group shapes are flattened recursively; a table frame contributes one
+    fragment per non-blank cell.
+    """
+    from pptx.enum.shapes import MSO_SHAPE_TYPE
+
+    try:
+        is_group = shape.shape_type == MSO_SHAPE_TYPE.GROUP
+    except NotImplementedError:
+        # python-pptx can raise this for shape kinds it does not classify;
+        # treat them as plain shapes so one odd shape does not discard the deck.
+        is_group = False
+    if is_group:
+        return [text for sub in shape.shapes for text in _pptx_shape_text(sub)]
+    parts = []
+    if hasattr(shape, "text") and shape.text.strip():
+        parts.append(shape.text)
+    if getattr(shape, "has_table", False):
+        parts.extend(c.text for c in shape.table.iter_cells() if c.text.strip())
+    return parts
+
+
 def extract_text(filepath: Path) -> str:
     """Return the text of a supported file. Returns "" on any failure or
     unsupported extension. Does not write to ingest_failures — caller decides."""
@@ -32,7 +84,27 @@ def extract_text(filepath: Path) -> str:
     try:
         if suffix == ".docx":
             doc = DocxDocument(filepath)
-            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+            parts = [p.text for p in doc.paragraphs if p.text.strip()]
+            for tbl in doc.tables:
+                for cell in _docx_unique_cells(tbl):
+                    parts.extend(p.text for p in _docx_cell_paragraphs(cell))
+            for section in doc.sections:
+                for hdr_ftr in (section.header, section.footer):
+                    # A linked header/footer only repeats the previous section's
+                    # text, so read it once where it is actually defined.
+                    if not hdr_ftr.is_linked_to_previous:
+                        parts.extend(p.text for p in hdr_ftr.paragraphs if p.text.strip())
+            from docx.oxml.ns import qn
+            for txbx in doc.element.body.findall(".//" + qn("w:txbxContent")):
+                # Word usually stores a text box twice (DrawingML plus a VML
+                # fallback copy); skip the fallback to avoid duplicate text.
+                if next(txbx.iterancestors(_MC_FALLBACK), None) is not None:
+                    continue
+                for p in txbx.findall(".//" + qn("w:p")):
+                    text = "".join(t.text or "" for t in p.findall(".//" + qn("w:t")))
+                    if text.strip():
+                        parts.append(text)
+            return "\n".join(parts)
         elif suffix == ".pdf":
             reader = PdfReader(filepath)
             return "".join(
@@ -41,11 +113,17 @@ def extract_text(filepath: Path) -> str:
             )
         elif suffix == ".pptx":
             prs = Presentation(filepath)
-            return "\n".join(
-                shape.text for slide in prs.slides
-                for shape in slide.shapes
-                if hasattr(shape, "text") and shape.text.strip()
-            )
+            parts = []
+            for slide in prs.slides:
+                for shape in slide.shapes:
+                    parts.extend(_pptx_shape_text(shape))
+                if slide.has_notes_slide:
+                    # notes_text_frame is None when the notes slide has no
+                    # notes placeholder.
+                    notes_frame = slide.notes_slide.notes_text_frame
+                    if notes_frame is not None and notes_frame.text.strip():
+                        parts.append(notes_frame.text)
+            return "\n".join(parts)
         elif suffix in {".txt", ".md"}:
             return filepath.read_text(encoding="utf-8", errors="ignore")
     except Exception as e: