diff --git a/scripts/encoding.py b/scripts/encoding.py index 41e06b6..a3db6bb 100644 --- a/scripts/encoding.py +++ b/scripts/encoding.py @@ -202,8 +202,8 @@ def chunk_and_embed(text: str, return rows -def write_embeddings_batch(conn, batch: list[dict]) -> int: - """Single canonical INSERT. Sets created_at = NOW() server-side. Commits. +def write_embeddings_batch(conn, batch: list[dict], commit: bool = True) -> int: + """Single canonical INSERT. Sets created_at = NOW() server-side. Every row dict must supply 'type'. created_at is SQL-supplied (NOW()), so callers do not need to provide it. The application-layer assertion is the @@ -211,6 +211,11 @@ def write_embeddings_batch(conn, batch: list[dict]) -> int: historical NULLs were resolved by the Improvement #2 backfill, and a Python-level raise gives a faster, more debuggable failure than a Postgres constraint error. + + When commit=True (default), this function commits the connection itself. + When commit=False, the caller is responsible for committing. Use + commit=False when composing this write with other writes that must land + atomically in the same transaction. """ if not batch: return 0 @@ -233,5 +238,6 @@ def write_embeddings_batch(conn, batch: list[dict]) -> int: metadata = EXCLUDED.metadata """, (row["id"], row["document"], row["embedding"], row["source"], row["type"], json.dumps(row["metadata"]))) - conn.commit() + if commit: + conn.commit() return len(batch)