Add Pattern 1 async job model migration

Adds graphiti_jobs table for sidecar's async ingest queue and external_job_id column on stage_3_queue for worker's polling reference. Tonight's smoke test diagnosed that bulk ingest against the 4,222-entity graph commits successfully but the worker's 600s HTTP read-timeout fires before the sidecar's response returns. Three days of 'saga deadlock' failures were false negatives — the work succeeded; the worker just stopped listening. Pattern 1 separates submission from completion observation so the worker can't false-negative this way. Migration only — sidecar and worker code changes follow in subsequent commits.
2026-05-02 02:22:30 +00:00
parent 30beeb3a26
commit a0bf280075
1 changed files with 55 additions and 0 deletions
@@ -0,0 +1,55 @@
 -- Migration: 20260502-001_async_job_model
 -- Purpose: Pattern 1 async job model — sidecar processes ingest jobs serially
 --          via Postgres-backed queue. Worker submits and polls rather than
 --          blocking on synchronous HTTP response.
 --
 -- Architectural rationale: tonight's smoke test (2026-05-02 ~01:40-01:50 UTC)
 -- diagnosed that bulk ingest against a 4,222-entity graph commits successfully
 -- but the worker's HTTP read-timeout fires before the response returns. Three
 -- days of "saga deadlock" failures were false negatives — the work succeeded;
 -- the worker just stopped listening. Pattern 1 separates submission from
 -- completion observation so the worker can't false-negative this way.
 --
 -- The job model is also the natural data source for Phase A items 6-7
 -- (metrics tables) — graphiti_jobs records duration, status transitions,
 -- and per-job summary that those tables will aggregate.
 --
 -- Idempotent: safe to re-run.
 -- Job state for sidecar's async ingest queue.
 -- One row per submitted bulk-or-single ingest. Sidecar reads queued jobs
 -- on startup to resume after restart. Worker polls status until terminal.
 CREATE TABLE IF NOT EXISTS graphiti_jobs (
    job_id          UUID PRIMARY KEY,
    job_type        TEXT NOT NULL CHECK (job_type IN ('bulk', 'single')),
    payload         JSONB NOT NULL,                  -- full submitted request body
    status          TEXT NOT NULL DEFAULT 'queued'   -- 'queued'|'running'|'committed'|'failed'
                    CHECK (status IN ('queued', 'running', 'committed', 'failed')),
    enqueued_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    started_at      TIMESTAMPTZ,
    finished_at     TIMESTAMPTZ,
    error           TEXT,                            -- non-null when status='failed'
    summary         JSONB,                           -- {nodes: N, edges: N, episodes: N}
    submitted_by    TEXT                             -- worker name for traceability
 );
 -- Index supporting sidecar's "pick next queued job" query
 CREATE INDEX IF NOT EXISTS idx_graphiti_jobs_queued
    ON graphiti_jobs (enqueued_at)
    WHERE status = 'queued';
 -- Index supporting worker's "poll my job by id" query (PK already does this,
 -- but explicit index aids ANALYZE behavior on small tables)
 CREATE INDEX IF NOT EXISTS idx_graphiti_jobs_status
    ON graphiti_jobs (status);
 -- Stage 3 queue gains a reference to the sidecar job processing the row.
 -- When set, worker polls graphiti_jobs.status rather than blocking on HTTP.
 -- NULL means: row not yet submitted, or pre-Pattern-1 row.
 ALTER TABLE stage_3_queue
    ADD COLUMN IF NOT EXISTS external_job_id UUID;
 -- Index for "find rows that submitted but didn't complete" recovery scans
 CREATE INDEX IF NOT EXISTS idx_stage_3_queue_external_job
    ON stage_3_queue (external_job_id)
    WHERE external_job_id IS NOT NULL AND completed_at IS NULL AND failed_at IS NULL;