Add Pattern 1 async job model migration

Adds graphiti_jobs table for sidecar's async ingest queue and external_job_id column on stage_3_queue for worker's polling reference. Tonight's smoke test diagnosed that bulk ingest against the 4,222-entity graph commits successfully but the worker's 600s HTTP read-timeout fires before the sidecar's response returns. Three days of 'saga deadlock' failures were false negatives — the work succeeded; the worker just stopped listening. Pattern 1 separates submission from completion observation so the worker can't false-negative this way. Migration only — sidecar and worker code changes follow in subsequent commits.
2026-05-02 02:22:30 +00:00
parent 30beeb3a26
commit a0bf280075
1 changed files with 55 additions and 0 deletions
@@ -0,0 +1,55 @@
+-- Migration: 20260502-001_async_job_model
+-- Purpose: Pattern 1 async job model — sidecar processes ingest jobs serially
+--          via Postgres-backed queue. Worker submits and polls rather than
+--          blocking on synchronous HTTP response.
+--
+-- Architectural rationale: tonight's smoke test (2026-05-02 ~01:40-01:50 UTC)
+-- diagnosed that bulk ingest against a 4,222-entity graph commits successfully
+-- but the worker's HTTP read-timeout fires before the response returns. Three
+-- days of "saga deadlock" failures were false negatives — the work succeeded;
+-- the worker just stopped listening. Pattern 1 separates submission from
+-- completion observation so the worker can't false-negative this way.
+--
+-- The job model is also the natural data source for Phase A items 6-7
+-- (metrics tables) — graphiti_jobs records the enqueue/start/finish
+-- timestamps (from which durations and status-transition times derive),
+-- the current status, and a per-job summary that those tables will aggregate.
+--
+-- Idempotent: safe to re-run.
+
+-- graphiti_jobs: persistent state for the sidecar's async ingest queue.
+-- Each submitted ingest (bulk or single) gets exactly one row. On startup the
+-- sidecar re-reads rows still in 'queued' so work survives a restart, and the
+-- worker polls a row's status until it reaches a terminal value.
+CREATE TABLE IF NOT EXISTS graphiti_jobs (
+    job_id          UUID        NOT NULL,
+    job_type        TEXT        NOT NULL,
+    -- Full request body exactly as submitted.
+    payload         JSONB       NOT NULL,
+    -- Lifecycle state; new rows start out 'queued'.
+    status          TEXT        NOT NULL DEFAULT 'queued',
+    enqueued_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    started_at      TIMESTAMPTZ,
+    finished_at     TIMESTAMPTZ,
+    -- Failure detail; expected to be non-null when status = 'failed'
+    -- (by convention only, no constraint enforces it).
+    error           TEXT,
+    -- Result counts, shaped like {nodes: N, edges: N, episodes: N}.
+    summary         JSONB,
+    -- Name of the submitting worker, kept for traceability.
+    submitted_by    TEXT,
+
+    -- Constraint names below are the ones Postgres auto-generates for the
+    -- equivalent inline column constraints, spelled out so they are greppable.
+    CONSTRAINT graphiti_jobs_pkey PRIMARY KEY (job_id),
+    CONSTRAINT graphiti_jobs_job_type_check
+        CHECK (job_type IN ('bulk', 'single')),
+    CONSTRAINT graphiti_jobs_status_check
+        CHECK (status IN ('queued', 'running', 'committed', 'failed'))
+);
+
+-- Partial index behind the sidecar's "pick next queued job" query: only rows
+-- with status = 'queued' are indexed, keyed by enqueue time.
+CREATE INDEX IF NOT EXISTS idx_graphiti_jobs_queued
+    ON graphiti_jobs USING btree (enqueued_at)
+    WHERE status = 'queued';
+
+-- Secondary index on status; it can serve queries that filter jobs by state.
+-- It is NOT what serves the worker's "poll my job by id" lookup — the primary
+-- key on job_id covers that — and idx_graphiti_jobs_queued already covers the
+-- status = 'queued' case.
+-- NOTE(review): the earlier rationale ("aids ANALYZE behavior on small
+-- tables") looks unfounded — a plain column index should not change the
+-- statistics ANALYZE gathers. Confirm which query needs this index, or drop
+-- it in a follow-up migration.
+CREATE INDEX IF NOT EXISTS idx_graphiti_jobs_status
+    ON graphiti_jobs (status);
+
+-- Link each stage_3_queue row to the graphiti_jobs entry that is processing it.
+-- Once populated, the worker watches graphiti_jobs.status instead of holding an
+-- HTTP request open. NULL covers both rows that have not been submitted yet
+-- and rows that predate Pattern 1.
+-- The column is a plain UUID: no foreign key to graphiti_jobs is declared.
+ALTER TABLE stage_3_queue ADD COLUMN IF NOT EXISTS external_job_id UUID NULL;
+
+-- Partial index for recovery scans that look for rows which were submitted
+-- (external_job_id is set) but have neither completed nor failed.
+-- completed_at and failed_at are not added by this migration; presumably they
+-- already exist on stage_3_queue — verify against the table definition.
+CREATE INDEX IF NOT EXISTS idx_stage_3_queue_external_job
+    ON stage_3_queue USING btree (external_job_id)
+    WHERE external_job_id IS NOT NULL
+      AND completed_at IS NULL
+      AND failed_at IS NULL;