encoding: per-slide pptx chunking + extract_blocks API; api: recency tiebreak
extract_blocks(filepath) is the new structured-extraction entry point, returning
list[{heading, text, kind}]. chunk_and_embed accepts either str (blind-chunk
back-compat) or list[dict] (one chunk per block, blind-split if oversize, heading
prepended for retrieval context and stored in metadata).
- pptx: one block per slide. Slide title becomes block heading; speaker notes
fold into the body. Image-only decks with title-only slides now produce
heading-only chunks instead of being recorded as extraction failures.
- docx: deliberately single-block (back-compat). Heading-style section detection
was implemented and then rolled back: hand-formatted CVs are Normal-styled with
bold-as-heading, and tying chunk boundaries to formatting choices would lock
the future user into preserving those choices forever. Lexical + cross-encoder
retrieval already handles substring matching inside blind-chunked CVs.
- pdf/txt/md: unchanged (single block, blind chunking).
Recency tiebreak in retrieve_context: pull created_at into the SELECT and use it
as the secondary sort key in _rerank, so that memory/journal snapshots prefer the
latest copy among near-duplicate content.
reindex_docx_pptx.py now accepts --ext=pptx,docx,... so a re-ingest can target a
subset of extensions; the previous hardcoded delete regex would have wiped both
docx and pptx rows even with a single-extension target.
This commit is contained in:
+12
-7
@@ -302,14 +302,19 @@ def classify_retrieval_intent(query: str):
|
|||||||
|
|
||||||
|
|
||||||
def _rerank(query: str, candidates: list[tuple]) -> list[tuple]:
|
def _rerank(query: str, candidates: list[tuple]) -> list[tuple]:
|
||||||
"""Cross-encoder rerank. Candidates are (id, document, source, folder) tuples.
|
"""Cross-encoder rerank. Candidates are (id, document, source, folder, created_at)
|
||||||
Returns the same tuples reordered by reranker score (highest first)."""
|
tuples. Returns the same tuples reordered by reranker score with created_at as
|
||||||
|
secondary key — so when two chunks score similarly the newer one wins, which
|
||||||
|
keeps memory/journal files biased toward the latest snapshot."""
|
||||||
if not candidates:
|
if not candidates:
|
||||||
return []
|
return []
|
||||||
pairs = [(query, row[1]) for row in candidates]
|
pairs = [(query, row[1]) for row in candidates]
|
||||||
scores = reranker.predict(pairs)
|
scores = reranker.predict(pairs)
|
||||||
return [row for row, _ in sorted(zip(candidates, scores),
|
return [row for row, _ in sorted(
|
||||||
key=lambda x: x[1], reverse=True)]
|
zip(candidates, scores),
|
||||||
|
key=lambda x: (float(x[1]), x[0][4] or ""),
|
||||||
|
reverse=True,
|
||||||
|
)]
|
||||||
|
|
||||||
|
|
||||||
def _format_source(source: str, folder: str) -> str:
|
def _format_source(source: str, folder: str) -> str:
|
||||||
@@ -374,7 +379,7 @@ def retrieve_context(query, n_results=FINAL_LIMIT,
|
|||||||
cur.execute("SET LOCAL hnsw.ef_search = 500")
|
cur.execute("SET LOCAL hnsw.ef_search = 500")
|
||||||
|
|
||||||
cur.execute(f"""
|
cur.execute(f"""
|
||||||
SELECT id, document, source, metadata->>'folder' AS folder
|
SELECT id, document, source, metadata->>'folder' AS folder, created_at
|
||||||
FROM embeddings
|
FROM embeddings
|
||||||
{common_where}
|
{common_where}
|
||||||
ORDER BY embedding <=> %s::vector
|
ORDER BY embedding <=> %s::vector
|
||||||
@@ -387,7 +392,7 @@ def retrieve_context(query, n_results=FINAL_LIMIT,
|
|||||||
lex_match = "to_tsvector('english', document) @@ websearch_to_tsquery('english', %s)"
|
lex_match = "to_tsvector('english', document) @@ websearch_to_tsquery('english', %s)"
|
||||||
lex_where = ("WHERE " + " AND ".join([lex_match] + where_clauses))
|
lex_where = ("WHERE " + " AND ".join([lex_match] + where_clauses))
|
||||||
cur.execute(f"""
|
cur.execute(f"""
|
||||||
SELECT id, document, source, metadata->>'folder' AS folder
|
SELECT id, document, source, metadata->>'folder' AS folder, created_at
|
||||||
FROM embeddings
|
FROM embeddings
|
||||||
{lex_where}
|
{lex_where}
|
||||||
ORDER BY ts_rank(to_tsvector('english', document),
|
ORDER BY ts_rank(to_tsvector('english', document),
|
||||||
@@ -411,7 +416,7 @@ def retrieve_context(query, n_results=FINAL_LIMIT,
|
|||||||
candidates = [rows_by_id[doc_id] for doc_id, _ in rrf_ranked]
|
candidates = [rows_by_id[doc_id] for doc_id, _ in rrf_ranked]
|
||||||
|
|
||||||
seen = set()
|
seen = set()
|
||||||
for _id, doc, source, folder in _rerank(query, candidates):
|
for _id, doc, source, folder, _created_at in _rerank(query, candidates):
|
||||||
key = _dedup_key(doc)
|
key = _dedup_key(doc)
|
||||||
if key in seen:
|
if key in seen:
|
||||||
continue
|
continue
|
||||||
|
|||||||
+122
-34
@@ -1,12 +1,14 @@
|
|||||||
"""
|
"""
|
||||||
Aaron AI Stage 1 encoding helpers — single canonical implementation of:
|
Aaron AI Stage 1 encoding helpers — single canonical implementation of:
|
||||||
- extract_text(filepath) — four-extension text extraction
|
- extract_blocks(filepath) — section-aware extraction (docx heading-bounded
|
||||||
- chunk_text(text, chunk_size, overlap) — word-based chunking
|
sections, pptx per-slide, pdf/txt/md single-block)
|
||||||
- chunk_and_embed(text, source, embedder, filepath, folder) — produce ready-to-write rows
|
- extract_text(filepath) — back-compat string concatenation over blocks
|
||||||
|
- chunk_text(text, chunk_size, overlap) — word-based blind chunking
|
||||||
|
- chunk_and_embed(text_or_blocks, source, embedder, filepath, folder) —
|
||||||
|
produce ready-to-write rows. Accepts str (blind) or list[dict] (section-aware).
|
||||||
- write_embeddings_batch(conn, batch) — server-side NOW() canonical INSERT
|
- write_embeddings_batch(conn, batch) — server-side NOW() canonical INSERT
|
||||||
|
|
||||||
Used by watcher.py, ingest.py, corpus_integrity.py, and api.py /api/corpus/retry.
|
Used by watcher.py, ingest.py, corpus_integrity.py, and api.py /api/corpus/retry.
|
||||||
Replaces four separate extract reimplementations and two extract-chunk-embed paths.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
@@ -106,12 +108,15 @@ def _pptx_shape_text(shape):
|
|||||||
return parts
|
return parts
|
||||||
|
|
||||||
|
|
||||||
def extract_text(filepath: Path) -> str:
|
def _extract_docx_blocks(filepath: Path) -> list[dict]:
|
||||||
"""Return the text of a supported file. Returns "" on any failure or
|
"""Return docx content as a single block. Earlier attempt at section-aware
|
||||||
unsupported extension. Does not write to ingest_failures — caller decides."""
|
chunking via Heading styles was rolled back: the user's docs are mostly
|
||||||
suffix = filepath.suffix.lower()
|
Normal-styled with bold-as-heading, and tying chunk boundaries to formatting
|
||||||
try:
|
choices locks future-them into preserving those choices forever. Lexical
|
||||||
if suffix == ".docx":
|
+ cross-encoder retrieval already finds the right substrings within a
|
||||||
|
blind-chunked CV, so the section structure isn't load-bearing for retrieval."""
|
||||||
|
from docx.oxml.ns import qn
|
||||||
|
|
||||||
doc = DocxDocument(filepath)
|
doc = DocxDocument(filepath)
|
||||||
parts = [p.text for p in doc.paragraphs if p.text.strip()]
|
parts = [p.text for p in doc.paragraphs if p.text.strip()]
|
||||||
for tbl in doc.tables:
|
for tbl in doc.tables:
|
||||||
@@ -121,38 +126,88 @@ def extract_text(filepath: Path) -> str:
|
|||||||
for section in doc.sections:
|
for section in doc.sections:
|
||||||
parts.extend(p.text for p in section.header.paragraphs if p.text.strip())
|
parts.extend(p.text for p in section.header.paragraphs if p.text.strip())
|
||||||
parts.extend(p.text for p in section.footer.paragraphs if p.text.strip())
|
parts.extend(p.text for p in section.footer.paragraphs if p.text.strip())
|
||||||
from docx.oxml.ns import qn
|
|
||||||
for txbx in doc.element.body.findall(".//" + qn("w:txbxContent")):
|
for txbx in doc.element.body.findall(".//" + qn("w:txbxContent")):
|
||||||
for p in txbx.findall(".//" + qn("w:p")):
|
for p in txbx.findall(".//" + qn("w:p")):
|
||||||
text = "".join(t.text or "" for t in p.findall(".//" + qn("w:t")))
|
text = "".join(t.text or "" for t in p.findall(".//" + qn("w:t")))
|
||||||
if text.strip():
|
if text.strip():
|
||||||
parts.append(text)
|
parts.append(text)
|
||||||
return "\n".join(parts)
|
text = "\n".join(parts)
|
||||||
elif suffix == ".pdf":
|
return [{"heading": None, "text": text, "kind": "doc"}] if text.strip() else []
|
||||||
reader = PdfReader(filepath)
|
|
||||||
return "".join(
|
|
||||||
page.extract_text() + "\n"
|
def _extract_pptx_blocks(filepath: Path) -> list[dict]:
|
||||||
for page in reader.pages if page.extract_text()
|
"""One block per slide. Heading = slide title (or 'Slide N' fallback).
|
||||||
)
|
Body = non-title shape text + speaker notes."""
|
||||||
elif suffix == ".pptx":
|
|
||||||
prs = Presentation(filepath)
|
prs = Presentation(filepath)
|
||||||
parts = []
|
blocks = []
|
||||||
for slide in prs.slides:
|
for i, slide in enumerate(prs.slides, 1):
|
||||||
|
title_shape = None
|
||||||
|
try:
|
||||||
|
title_shape = slide.shapes.title
|
||||||
|
except (AttributeError, KeyError):
|
||||||
|
pass
|
||||||
|
title = None
|
||||||
|
body_parts = []
|
||||||
for shape in slide.shapes:
|
for shape in slide.shapes:
|
||||||
parts.extend(_pptx_shape_text(shape))
|
if title_shape is not None and shape == title_shape and shape.has_text_frame:
|
||||||
|
title = shape.text_frame.text.strip() or None
|
||||||
|
continue
|
||||||
|
body_parts.extend(_pptx_shape_text(shape))
|
||||||
if slide.has_notes_slide:
|
if slide.has_notes_slide:
|
||||||
notes = slide.notes_slide.notes_text_frame.text
|
notes = slide.notes_slide.notes_text_frame.text
|
||||||
if notes.strip():
|
if notes.strip():
|
||||||
parts.append(notes)
|
body_parts.append(f"[Notes] {notes}")
|
||||||
return "\n".join(parts)
|
if title or body_parts:
|
||||||
elif suffix in {".txt", ".md"}:
|
blocks.append({
|
||||||
|
"heading": title or f"Slide {i}",
|
||||||
|
"text": "\n".join(body_parts),
|
||||||
|
"kind": "slide",
|
||||||
|
})
|
||||||
|
return blocks
|
||||||
|
|
||||||
|
|
||||||
|
def extract_blocks(filepath: Path) -> list[dict]:
|
||||||
|
"""Structured extraction. Returns list of {heading, text, kind} blocks.
|
||||||
|
|
||||||
|
- docx: section-aware via Heading-style paragraphs (kind='section').
|
||||||
|
- pptx: one block per slide (kind='slide').
|
||||||
|
- pdf/txt/md: single block, no heading (kind='doc').
|
||||||
|
|
||||||
|
Empty list on any failure or unsupported extension."""
|
||||||
|
suffix = filepath.suffix.lower()
|
||||||
|
try:
|
||||||
|
if suffix == ".docx":
|
||||||
|
return _extract_docx_blocks(filepath)
|
||||||
|
if suffix == ".pptx":
|
||||||
|
return _extract_pptx_blocks(filepath)
|
||||||
|
if suffix == ".pdf":
|
||||||
|
reader = PdfReader(filepath)
|
||||||
|
text = "".join(
|
||||||
|
page.extract_text() + "\n"
|
||||||
|
for page in reader.pages if page.extract_text()
|
||||||
|
)
|
||||||
|
return [{"heading": None, "text": text, "kind": "doc"}] if text.strip() else []
|
||||||
|
if suffix in {".txt", ".md"}:
|
||||||
text = filepath.read_text(encoding="utf-8", errors="ignore")
|
text = filepath.read_text(encoding="utf-8", errors="ignore")
|
||||||
if suffix == ".md":
|
if suffix == ".md":
|
||||||
return _strip_md_frontmatter(text)
|
text = _strip_md_frontmatter(text)
|
||||||
return text
|
return [{"heading": None, "text": text, "kind": "doc"}] if text.strip() else []
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"Text extraction failed for {filepath.name}: {e}")
|
log.warning(f"Extraction failed for {filepath.name}: {e}")
|
||||||
return ""
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(filepath: Path) -> str:
|
||||||
|
"""Back-compat wrapper: concatenate extract_blocks() output. Section
|
||||||
|
structure is lost; use extract_blocks() directly for chunking."""
|
||||||
|
blocks = extract_blocks(filepath)
|
||||||
|
parts = []
|
||||||
|
for b in blocks:
|
||||||
|
if b.get("heading"):
|
||||||
|
parts.append(b["heading"])
|
||||||
|
if b.get("text"):
|
||||||
|
parts.append(b["text"])
|
||||||
|
return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
def chunk_text(text: str,
|
def chunk_text(text: str,
|
||||||
@@ -175,18 +230,49 @@ def _chunk_id(filepath, source: str, index: int) -> str:
|
|||||||
return f"{hashlib.md5(basis.encode()).hexdigest()[:8]}_{index}"
|
return f"{hashlib.md5(basis.encode()).hexdigest()[:8]}_{index}"
|
||||||
|
|
||||||
|
|
||||||
def chunk_and_embed(text: str,
|
def chunk_and_embed(text_or_blocks,
|
||||||
source: str,
|
source: str,
|
||||||
embedder,
|
embedder,
|
||||||
filepath=None,
|
filepath=None,
|
||||||
folder=None) -> list[dict]:
|
folder=None) -> list[dict]:
|
||||||
"""Chunk text, embed each chunk, return rows ready for write_embeddings_batch."""
|
"""Chunk + embed for write_embeddings_batch. Accepts either:
|
||||||
chunks = chunk_text(text)
|
|
||||||
|
- str: blind chunking with 500-word windows (pdf/txt/md legacy path).
|
||||||
|
- list[dict]: section-aware path (docx Heading-bounded sections, pptx
|
||||||
|
slides). Each block emits one chunk if its text fits within
|
||||||
|
DEFAULT_CHUNK_SIZE words, otherwise is blind-split with overlap.
|
||||||
|
|
||||||
|
The block heading is prepended to the chunk text (so retrieval sees the
|
||||||
|
section context) and stored in metadata as heading/kind."""
|
||||||
|
if isinstance(text_or_blocks, str):
|
||||||
|
blocks = [{"heading": None, "text": text_or_blocks, "kind": "doc"}]
|
||||||
|
else:
|
||||||
|
blocks = text_or_blocks
|
||||||
|
|
||||||
|
chunks = []
|
||||||
|
for block in blocks:
|
||||||
|
body = block.get("text") or ""
|
||||||
|
heading = block.get("heading")
|
||||||
|
kind = block.get("kind", "doc")
|
||||||
|
if not body.strip() and not (heading and heading.strip()):
|
||||||
|
continue
|
||||||
|
if heading and body.strip():
|
||||||
|
contextualized = f"{heading}\n\n{body}"
|
||||||
|
elif heading:
|
||||||
|
contextualized = heading
|
||||||
|
else:
|
||||||
|
contextualized = body
|
||||||
|
if len(contextualized.split()) <= DEFAULT_CHUNK_SIZE:
|
||||||
|
chunks.append((contextualized, heading, kind))
|
||||||
|
else:
|
||||||
|
for sub in chunk_text(contextualized):
|
||||||
|
chunks.append((sub, heading, kind))
|
||||||
|
|
||||||
if not chunks:
|
if not chunks:
|
||||||
return []
|
return []
|
||||||
embeddings = embedder.encode(chunks).tolist()
|
embeddings = embedder.encode([c[0] for c in chunks]).tolist()
|
||||||
rows = []
|
rows = []
|
||||||
for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
|
for i, ((chunk, heading, kind), emb) in enumerate(zip(chunks, embeddings)):
|
||||||
rows.append({
|
rows.append({
|
||||||
"id": _chunk_id(filepath, source, i),
|
"id": _chunk_id(filepath, source, i),
|
||||||
"document": chunk,
|
"document": chunk,
|
||||||
@@ -197,6 +283,8 @@ def chunk_and_embed(text: str,
|
|||||||
"source": source,
|
"source": source,
|
||||||
"filepath": str(filepath) if filepath else source,
|
"filepath": str(filepath) if filepath else source,
|
||||||
"folder": folder,
|
"folder": folder,
|
||||||
|
"heading": heading,
|
||||||
|
"kind": kind,
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
return rows
|
return rows
|
||||||
|
|||||||
+12
-5
@@ -15,7 +15,7 @@ from dotenv import load_dotenv
|
|||||||
import psycopg2
|
import psycopg2
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
from encoding import extract_text, chunk_and_embed, write_embeddings_batch, SUPPORTED
|
from encoding import extract_blocks, chunk_and_embed, write_embeddings_batch, SUPPORTED
|
||||||
from failures import (
|
from failures import (
|
||||||
record_ingest_failure as _record_failure_sql,
|
record_ingest_failure as _record_failure_sql,
|
||||||
resolve_ingest_failure as _resolve_failure_sql,
|
resolve_ingest_failure as _resolve_failure_sql,
|
||||||
@@ -83,8 +83,11 @@ def _ingest_one(filepath: Path, embedder, root: Path = None) -> int:
|
|||||||
return 0
|
return 0
|
||||||
if filepath.suffix.lower() not in SUPPORTED:
|
if filepath.suffix.lower() not in SUPPORTED:
|
||||||
return 0
|
return 0
|
||||||
text = extract_text(filepath)
|
blocks = extract_blocks(filepath)
|
||||||
if not text.strip():
|
if not blocks or not any(
|
||||||
|
(b.get("text") or "").strip() or (b.get("heading") or "").strip()
|
||||||
|
for b in blocks
|
||||||
|
):
|
||||||
_record_failure(filepath, "Text extraction failed or empty")
|
_record_failure(filepath, "Text extraction failed or empty")
|
||||||
return 0
|
return 0
|
||||||
folder_rel = None
|
folder_rel = None
|
||||||
@@ -94,7 +97,7 @@ def _ingest_one(filepath: Path, embedder, root: Path = None) -> int:
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
rows = chunk_and_embed(text, filepath.name, embedder,
|
rows = chunk_and_embed(blocks, filepath.name, embedder,
|
||||||
filepath=filepath, folder=folder_rel)
|
filepath=filepath, folder=folder_rel)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
_record_failure(filepath, f"Embedding failed: {e}")
|
_record_failure(filepath, f"Embedding failed: {e}")
|
||||||
@@ -113,7 +116,11 @@ def _ingest_one(filepath: Path, embedder, root: Path = None) -> int:
|
|||||||
print(f" Indexed {len(rows)} chunks: {filepath.name}")
|
print(f" Indexed {len(rows)} chunks: {filepath.name}")
|
||||||
_resolve_failure(filepath.name)
|
_resolve_failure(filepath.name)
|
||||||
if not os.getenv("SKIP_STAGE2_ENQUEUE"):
|
if not os.getenv("SKIP_STAGE2_ENQUEUE"):
|
||||||
enqueue_stage2(filepath.name, text)
|
full_text = "\n".join(
|
||||||
|
f"{b['heading']}\n{b['text']}" if b.get("heading") else b.get("text", "")
|
||||||
|
for b in blocks
|
||||||
|
)
|
||||||
|
enqueue_stage2(filepath.name, full_text)
|
||||||
return len(rows)
|
return len(rows)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ Without --apply: dry-run. Counts files and chunks, prints a sample, writes nothi
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -28,19 +29,29 @@ sys.path.insert(0, str(Path(__file__).parent))
|
|||||||
from ingest import _ingest_one, get_pg
|
from ingest import _ingest_one, get_pg
|
||||||
|
|
||||||
NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files")
|
NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files")
|
||||||
TARGET_EXTS = {".docx", ".pptx"}
|
|
||||||
|
|
||||||
APPLY = "--apply" in sys.argv
|
APPLY = "--apply" in sys.argv
|
||||||
|
_ext_args = [a for a in sys.argv[1:] if a.startswith("--ext=")]
|
||||||
|
if _ext_args:
|
||||||
|
TARGET_EXTS = {("." + e.lstrip(".")) for arg in _ext_args
|
||||||
|
for e in arg.split("=", 1)[1].split(",")}
|
||||||
|
else:
|
||||||
|
TARGET_EXTS = {".docx", ".pptx"}
|
||||||
|
|
||||||
|
|
||||||
|
def _ext_regex():
|
||||||
|
inner = "|".join(re.escape(e.lstrip(".")) for e in sorted(TARGET_EXTS))
|
||||||
|
return f"\\.({inner})$"
|
||||||
|
|
||||||
|
|
||||||
def count_stale():
|
def count_stale():
|
||||||
pg = get_pg()
|
pg = get_pg()
|
||||||
cur = pg.cursor()
|
cur = pg.cursor()
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"SELECT lower(substring(source from '\\.[^.]+$')) AS ext, "
|
f"SELECT lower(substring(source from '\\.[^.]+$')) AS ext, "
|
||||||
"COUNT(DISTINCT source) AS files, COUNT(*) AS chunks "
|
f"COUNT(DISTINCT source) AS files, COUNT(*) AS chunks "
|
||||||
"FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$' "
|
f"FROM embeddings WHERE lower(source) ~ '{_ext_regex()}' "
|
||||||
"GROUP BY 1 ORDER BY 1"
|
f"GROUP BY 1 ORDER BY 1"
|
||||||
)
|
)
|
||||||
rows = cur.fetchall()
|
rows = cur.fetchall()
|
||||||
pg.close()
|
pg.close()
|
||||||
@@ -50,7 +61,7 @@ def count_stale():
|
|||||||
def delete_stale():
|
def delete_stale():
|
||||||
pg = get_pg()
|
pg = get_pg()
|
||||||
cur = pg.cursor()
|
cur = pg.cursor()
|
||||||
cur.execute("DELETE FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$'")
|
cur.execute(f"DELETE FROM embeddings WHERE lower(source) ~ '{_ext_regex()}'")
|
||||||
deleted = cur.rowcount
|
deleted = cur.rowcount
|
||||||
pg.commit()
|
pg.commit()
|
||||||
pg.close()
|
pg.close()
|
||||||
|
|||||||
+12
-5
@@ -29,7 +29,7 @@ from sentence_transformers import SentenceTransformer
|
|||||||
from watchdog.observers import Observer
|
from watchdog.observers import Observer
|
||||||
from watchdog.events import FileSystemEventHandler
|
from watchdog.events import FileSystemEventHandler
|
||||||
|
|
||||||
from encoding import extract_text, chunk_and_embed, write_embeddings_batch, SUPPORTED
|
from encoding import extract_blocks, chunk_and_embed, write_embeddings_batch, SUPPORTED
|
||||||
from failures import (
|
from failures import (
|
||||||
record_ingest_failure as _record_failure_sql,
|
record_ingest_failure as _record_failure_sql,
|
||||||
resolve_ingest_failure as _resolve_failure_sql,
|
resolve_ingest_failure as _resolve_failure_sql,
|
||||||
@@ -128,8 +128,11 @@ def ingest_file(filepath: Path, embedder) -> int:
|
|||||||
return 0
|
return 0
|
||||||
if filepath.suffix.lower() not in SUPPORTED:
|
if filepath.suffix.lower() not in SUPPORTED:
|
||||||
return 0
|
return 0
|
||||||
text = extract_text(filepath)
|
blocks = extract_blocks(filepath)
|
||||||
if not text.strip():
|
if not blocks or not any(
|
||||||
|
(b.get("text") or "").strip() or (b.get("heading") or "").strip()
|
||||||
|
for b in blocks
|
||||||
|
):
|
||||||
record_ingest_failure(filepath, "Text extraction failed or empty")
|
record_ingest_failure(filepath, "Text extraction failed or empty")
|
||||||
return 0
|
return 0
|
||||||
folder_rel = None
|
folder_rel = None
|
||||||
@@ -138,7 +141,7 @@ def ingest_file(filepath: Path, embedder) -> int:
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
rows = chunk_and_embed(text, filepath.name, embedder,
|
rows = chunk_and_embed(blocks, filepath.name, embedder,
|
||||||
filepath=filepath, folder=folder_rel)
|
filepath=filepath, folder=folder_rel)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Embedding failed for {filepath.name}: {e}")
|
log.error(f"Embedding failed for {filepath.name}: {e}")
|
||||||
@@ -159,7 +162,11 @@ def ingest_file(filepath: Path, embedder) -> int:
|
|||||||
return 0
|
return 0
|
||||||
log.info(f"Indexed {len(rows)} chunks: {filepath.name}")
|
log.info(f"Indexed {len(rows)} chunks: {filepath.name}")
|
||||||
resolve_ingest_failure(source)
|
resolve_ingest_failure(source)
|
||||||
enqueue_stage2(source, text)
|
full_text = "\n".join(
|
||||||
|
f"{b['heading']}\n{b['text']}" if b.get("heading") else b.get("text", "")
|
||||||
|
for b in blocks
|
||||||
|
)
|
||||||
|
enqueue_stage2(source, full_text)
|
||||||
return len(rows)
|
return len(rows)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user