From a0bf28007582074f2c1449665beac0d4b045c2d1 Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Sat, 2 May 2026 02:22:30 +0000 Subject: [PATCH] Add Pattern 1 async job model migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds graphiti_jobs table for sidecar's async ingest queue and external_job_id column on stage_3_queue for worker's polling reference. Tonight's smoke test diagnosed that bulk ingest against the 4,222-entity graph commits successfully but the worker's 600s HTTP read-timeout fires before the sidecar's response returns. Three days of 'saga deadlock' failures were false negatives — the work succeeded; the worker just stopped listening. Pattern 1 separates submission from completion observation so the worker can't false-negative this way. Migration only — sidecar and worker code changes follow in subsequent commits. --- migrations/20260502-001_async_job_model.sql | 55 +++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 migrations/20260502-001_async_job_model.sql diff --git a/migrations/20260502-001_async_job_model.sql b/migrations/20260502-001_async_job_model.sql new file mode 100644 index 0000000..68156c7 --- /dev/null +++ b/migrations/20260502-001_async_job_model.sql @@ -0,0 +1,55 @@ +-- Migration: 20260502-001_async_job_model +-- Purpose: Pattern 1 async job model — sidecar processes ingest jobs serially +-- via Postgres-backed queue. Worker submits and polls rather than +-- blocking on synchronous HTTP response. +-- +-- Architectural rationale: tonight's smoke test (2026-05-02 ~01:40-01:50 UTC) +-- diagnosed that bulk ingest against a 4,222-entity graph commits successfully +-- but the worker's HTTP read-timeout fires before the response returns. Three +-- days of "saga deadlock" failures were false negatives — the work succeeded; +-- the worker just stopped listening. Pattern 1 separates submission from +-- completion observation so the worker can't false-negative this way. +-- +-- The job model is also the natural data source for Phase A items 6-7 +-- (metrics tables) — graphiti_jobs records duration, status transitions, +-- and per-job summary that those tables will aggregate. +-- +-- Idempotent: safe to re-run. + +-- Job state for sidecar's async ingest queue. +-- One row per submitted bulk-or-single ingest. Sidecar reads queued jobs +-- on startup to resume after restart. Worker polls status until terminal. +CREATE TABLE IF NOT EXISTS graphiti_jobs ( + job_id UUID PRIMARY KEY, + job_type TEXT NOT NULL CHECK (job_type IN ('bulk', 'single')), + payload JSONB NOT NULL, -- full submitted request body + status TEXT NOT NULL DEFAULT 'queued' -- 'queued'|'running'|'committed'|'failed' + CHECK (status IN ('queued', 'running', 'committed', 'failed')), + enqueued_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + error TEXT, -- non-null when status='failed' + summary JSONB, -- {nodes: N, edges: N, episodes: N} + submitted_by TEXT -- worker name for traceability +); + +-- Index supporting sidecar's "pick next queued job" query +CREATE INDEX IF NOT EXISTS idx_graphiti_jobs_queued + ON graphiti_jobs (enqueued_at) + WHERE status = 'queued'; + +-- Index supporting worker's "poll my job by id" query (PK already does this, +-- but explicit index aids ANALYZE behavior on small tables) +CREATE INDEX IF NOT EXISTS idx_graphiti_jobs_status + ON graphiti_jobs (status); + +-- Stage 3 queue gains a reference to the sidecar job processing the row. +-- When set, worker polls graphiti_jobs.status rather than blocking on HTTP. +-- NULL means: row not yet submitted, or pre-Pattern-1 row. +ALTER TABLE stage_3_queue + ADD COLUMN IF NOT EXISTS external_job_id UUID; + +-- Index for "find rows that submitted but didn't complete" recovery scans +CREATE INDEX IF NOT EXISTS idx_stage_3_queue_external_job + ON stage_3_queue (external_job_id) + WHERE external_job_id IS NOT NULL AND completed_at IS NULL AND failed_at IS NULL;