encoding.py: strip frontmatter from .md at extraction time
The capture endpoint (api.py:702, 833) writes Journal/Captures/*.md
files with a markdown-bold-style header block (`**type:** voice`,
`**modality:** audio`, `**status:** unprocessed`, optional `**media:**`
and `**project:**`) followed by a `---` separator. extract_text for .md
was a bare filepath.read_text, so every capture-derived chunk in
pgvector embedded the frontmatter as raw text, polluting retrieval.
Fix adds _strip_md_frontmatter, called only for the .md branch:
- Capture-style: optional leading H1 (preserved), then consecutive
`**key:** value` lines (and blanks), terminated by `---`. The H1 is
retained; the key/value block + separator are removed.
- YAML-style: file's first non-empty line is `---`, terminated by `---`.
Only triggered when no heading precedes — guards against the common
`# Title` + `---` (horizontal rule under heading) pattern seen in
Journal/aaronai-architecture.md and four other Journal/*.md files.
Body `**bold:**` lines (e.g. `**Visual description:**` in image
captures) and body `---` horizontal rules are never touched: the scan
aborts as soon as a non-frontmatter line appears in the leading block.
briefing_generator_v2.py's split("---", 1) heuristic was reviewed and
not reused — fragile on substring matches and on documents with
multiple `---` rules.
Verified against:
- 2026-04-26-22-44-voice.md: frontmatter stripped, body retained, H1
retained.
- 2026-04-27-04-34-image.md: frontmatter stripped, `**Visual
description:**` and `**Voice annotation:**` body bold-headers
retained, trailing `---` not consumed.
- Journal/aaronai-architecture.md (5 body `---` rules): output
byte-identical to read_text (96101 chars).
- Synthetic YAML doc: stripped correctly when no leading heading.
- Synthetic plain markdown with body `---` rules: untouched.
- Empty input + heading-only file: untouched.
Existing capture chunks in pgvector retain polluted text; the fix only
affects future extractions. Backfill decision deferred — the cleanest
path is `touch -h Journal/Captures/*.md` to bump mtime and let the
watcher re-ingest naturally on the next cycle.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+61
-1
@@ -12,6 +12,7 @@ Replaces four separate extract reimplementations and two extract-chunk-embed pat
|
|||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docx import Document as DocxDocument
|
from docx import Document as DocxDocument
|
||||||
@@ -24,6 +25,62 @@ SUPPORTED = {".docx", ".pdf", ".pptx", ".txt", ".md"}
|
|||||||
DEFAULT_CHUNK_SIZE = 500
|
DEFAULT_CHUNK_SIZE = 500
|
||||||
DEFAULT_CHUNK_OVERLAP = 50
|
DEFAULT_CHUNK_OVERLAP = 50
|
||||||
|
|
||||||
|
# Matches a capture-style header line such as `**type:** voice`: a bold key
# made of word characters, spaces, `+`, `/` or `-`, ending in `:` inside the
# bold markers. Anchored at the start of the (already stripped) line.
_BOLD_KV_RE = re.compile(r"^\*\*[\w +/-]+?:\*\*")


def _skip_blank_lines(lines: "list[str]", idx: int) -> int:
    """Return the index of the first non-blank line at or after ``idx``."""
    while idx < len(lines) and not lines[idx].strip():
        idx += 1
    return idx


def _yaml_body_start(lines: "list[str]", fence_idx: int) -> "int | None":
    """Return the index just past the `---` that closes a YAML block.

    ``fence_idx`` is the index of the opening `---`. Returns ``None`` when
    no closing fence exists, i.e. the block is unterminated.
    """
    for j in range(fence_idx + 1, len(lines)):
        if lines[j].strip() == "---":
            return j + 1
    return None


def _capture_body_start(lines: "list[str]", start_idx: int) -> "int | None":
    """Return the index just past the `---` ending a capture-style header.

    Scans from ``start_idx`` over blank lines and `**key:** value` lines.
    Returns ``None`` if any other line appears before the `---` separator,
    or if the input ends without one, so body text is never consumed.
    """
    for j in range(start_idx, len(lines)):
        stripped = lines[j].strip()
        if not stripped or _BOLD_KV_RE.match(stripped):
            continue
        # First line that is neither blank nor key/value decides the outcome.
        return j + 1 if stripped == "---" else None
    return None


def _strip_md_frontmatter(text: str) -> str:
    """Strip a leading frontmatter block from markdown, if present.

    Recognizes two formats:
    - YAML-style: file's first non-empty line is `---`, terminated by `---`.
      Only triggered when no heading precedes — guards against `---`
      horizontal rules that follow an H1.
    - Capture-style: optional H1 heading, then one or more `**key:** value`
      lines (and blanks), terminated by `---`. The H1 is preserved; the
      key/value block + separator are removed.

    Body `---` rules and body `**bold:**` lines are never touched — the scan
    aborts as soon as a non-frontmatter line appears in the leading block.

    A single leading UTF-8 byte-order mark (U+FEFF) is ignored while
    scanning, so BOM-prefixed files are recognized too. It is dropped from
    the result only when frontmatter is actually stripped.

    Args:
        text: Full markdown document.

    Returns:
        ``text`` unchanged when no frontmatter is recognized. Otherwise the
        document without the frontmatter block, with lines joined by ``\\n``
        and leading empty lines of the body removed; a retained H1 is
        followed by exactly one blank line.
    """
    # Plain UTF-8 decoding keeps a BOM as U+FEFF, and str.strip() does not
    # treat it as whitespace, so it would hide a leading `---` or `# `.
    scan_text = text[1:] if text.startswith("\ufeff") else text
    lines = scan_text.splitlines()

    i = _skip_blank_lines(lines, 0)
    heading = None
    if i < len(lines) and lines[i].startswith("# "):
        heading = lines[i]
        i = _skip_blank_lines(lines, i + 1)
    if i >= len(lines):
        # Empty, blank-only, or heading-only document.
        return text

    first = lines[i].strip()
    if heading is None and first == "---":
        body_start = _yaml_body_start(lines, i)
    elif _BOLD_KV_RE.match(first):
        body_start = _capture_body_start(lines, i)
    else:
        body_start = None

    if body_start is None:
        return text

    body = "\n".join(lines[body_start:]).lstrip("\n")
    return f"{heading}\n\n{body}" if heading else body
|
||||||
|
|
||||||
|
|
||||||
def _docx_cell_paragraphs(cell):
|
def _docx_cell_paragraphs(cell):
|
||||||
yield from (p for p in cell.paragraphs if p.text.strip())
|
yield from (p for p in cell.paragraphs if p.text.strip())
|
||||||
@@ -89,7 +146,10 @@ def extract_text(filepath: Path) -> str:
|
|||||||
parts.append(notes)
|
parts.append(notes)
|
||||||
return "\n".join(parts)
|
return "\n".join(parts)
|
||||||
elif suffix in {".txt", ".md"}:
|
elif suffix in {".txt", ".md"}:
|
||||||
return filepath.read_text(encoding="utf-8", errors="ignore")
|
text = filepath.read_text(encoding="utf-8", errors="ignore")
|
||||||
|
if suffix == ".md":
|
||||||
|
return _strip_md_frontmatter(text)
|
||||||
|
return text
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"Text extraction failed for {filepath.name}: {e}")
|
log.warning(f"Text extraction failed for {filepath.name}: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
Reference in New Issue
Block a user