diff --git a/scripts/encoding.py b/scripts/encoding.py index 3e7092c..41e06b6 100644 --- a/scripts/encoding.py +++ b/scripts/encoding.py @@ -12,6 +12,7 @@ Replaces four separate extract reimplementations and two extract-chunk-embed pat import hashlib import json import logging +import re from pathlib import Path from docx import Document as DocxDocument @@ -24,6 +25,62 @@ SUPPORTED = {".docx", ".pdf", ".pptx", ".txt", ".md"} DEFAULT_CHUNK_SIZE = 500 DEFAULT_CHUNK_OVERLAP = 50 +_BOLD_KV_RE = re.compile(r"^\*\*[\w +/-]+?:\*\*") + + +def _strip_md_frontmatter(text: str) -> str: + """Strip a leading frontmatter block from markdown, if present. + + Recognizes two formats: + - YAML-style: file's first non-empty line is `---`, terminated by `---`. + Only triggered when no heading precedes — guards against `---` + horizontal rules that follow an H1. + - Capture-style: optional H1 heading, then one or more `**key:** value` + lines (and blanks), terminated by `---`. The H1 is preserved; the + key/value block + separator are removed. + + Body `---` rules and body `**bold:**` lines are never touched — the scan + aborts as soon as a non-frontmatter line appears in the leading block. + """ + lines = text.splitlines() + n = len(lines) + i = 0 + while i < n and not lines[i].strip(): + i += 1 + heading = None + if i < n and lines[i].startswith("# "): + heading = lines[i] + i += 1 + while i < n and not lines[i].strip(): + i += 1 + if i >= n: + return text + first = lines[i].strip() + if heading is None and first == "---": + j = i + 1 + while j < n and lines[j].strip() != "---": + j += 1 + if j >= n: + return text + body_start = j + 1 + elif _BOLD_KV_RE.match(first): + j = i + while j < n: + s = lines[j].strip() + if not s or _BOLD_KV_RE.match(s): + j += 1 + continue + if s == "---": + body_start = j + 1 + break + return text + else: + return text + else: + return text + body = "\n".join(lines[body_start:]).lstrip("\n") + return f"{heading}\n\n{body}" if heading else body + def _docx_cell_paragraphs(cell): yield from (p for p in cell.paragraphs if p.text.strip()) @@ -89,7 +146,10 @@ def extract_text(filepath: Path) -> str: parts.append(notes) return "\n".join(parts) elif suffix in {".txt", ".md"}: - return filepath.read_text(encoding="utf-8", errors="ignore") + text = filepath.read_text(encoding="utf-8", errors="ignore") + if suffix == ".md": + return _strip_md_frontmatter(text) + return text except Exception as e: log.warning(f"Text extraction failed for {filepath.name}: {e}") return ""