embeddings: backfill type and created_at (Improvement #2 part A)
Backfills 9,815 type-NULL rows to 'document' (extension classifier, 100% hit)
and 12,109 created_at-NULL rows via five batches:
C1 filepath_stat: 9,649 filesystem mtime via metadata.filepath
C2 watcher_state_unique: 676 unique source-name lookup in watcher_state
C3 watcher_state_collision_pick_latest_of_N:
234 collision; most-recent watcher mtime
C4 chatgpt_export: 1,548 convo create_time from export JSONs
(168/168 distinct convo_ids resolved)
C5 sentinel: 2 2026-04-26T00:00:00Z (pgvector migration date)
Provenance written to metadata.type_source and metadata.created_at_source
on every row changed by this run. type_source is empty on rows where the
type field was already populated pre-run; in those cases the snapshot
table is the source of truth for what changed.
Snapshot: embeddings_backup_2026_05_03 (CREATE TABLE AS SELECT id, type,
created_at, metadata FROM embeddings; 14,069 rows; revertable via id-join).
Verification:
V1 live counts: type_null=0 ca_null=0
V2 spot-check 11 rows across cohorts: provenance correct
V3 snapshot intact: 14,069 rows, pre-backfill NULL counts preserved
V4 cross-check vs snapshot: reconciles per-provenance to dry-run
Parts B and C (writer enforcement and a minimal retrieval read path) are
deferred to a separate session. The backfill is complete and verified, but
the type and created_at fields are not yet load-bearing: every current reader
still ignores them. Without B+C this lands as data prep, not a behavior change.
This commit is contained in:
@@ -0,0 +1,857 @@
|
||||
{
|
||||
"generated_at": "2026-05-03T23:47:54.802182+00:00",
|
||||
"section_1": {
|
||||
"overall": {
|
||||
"total": 14069,
|
||||
"type_null": 9815,
|
||||
"ca_null": 12109,
|
||||
"both_null": 9815,
|
||||
"both_set": 1960
|
||||
},
|
||||
"cohorts": [
|
||||
{
|
||||
"type": "aaronai_conversation",
|
||||
"ca_null": false,
|
||||
"n": 71
|
||||
},
|
||||
{
|
||||
"type": "chatgpt_conversation",
|
||||
"ca_null": true,
|
||||
"n": 1548
|
||||
},
|
||||
{
|
||||
"type": "claude_conversation",
|
||||
"ca_null": false,
|
||||
"n": 1074
|
||||
},
|
||||
{
|
||||
"type": "claude_memory",
|
||||
"ca_null": true,
|
||||
"n": 1
|
||||
},
|
||||
{
|
||||
"type": "document",
|
||||
"ca_null": false,
|
||||
"n": 815
|
||||
},
|
||||
{
|
||||
"type": "document",
|
||||
"ca_null": true,
|
||||
"n": 745
|
||||
},
|
||||
{
|
||||
"type": null,
|
||||
"ca_null": true,
|
||||
"n": 9815
|
||||
}
|
||||
]
|
||||
},
|
||||
"section_2": {
|
||||
"by_ext": [
|
||||
{
|
||||
"ext": ".pdf",
|
||||
"rows": 6886
|
||||
},
|
||||
{
|
||||
"ext": ".txt",
|
||||
"rows": 1501
|
||||
},
|
||||
{
|
||||
"ext": ".docx",
|
||||
"rows": 1048
|
||||
},
|
||||
{
|
||||
"ext": ".pptx",
|
||||
"rows": 353
|
||||
},
|
||||
{
|
||||
"ext": ".md",
|
||||
"rows": 27
|
||||
}
|
||||
],
|
||||
"classified": 9815,
|
||||
"unclassifiable": 0
|
||||
},
|
||||
"section_3": {
|
||||
"watcher_state_paths": 1462,
|
||||
"watcher_state_basenames": 1183,
|
||||
"watcher_state_collisions": 109,
|
||||
"rows_with_filepath": {
|
||||
"total": 9816,
|
||||
"exists": 9649,
|
||||
"missing": 167,
|
||||
"outside_root": 0,
|
||||
"sample": [
|
||||
{
|
||||
"id": "f317f238_0",
|
||||
"source": "NO thesis proposal.docx",
|
||||
"filepath": "/home/aaron/nextcloud/data/data/aaron/files/Academic/DDF790 Thesis/Nic OConnor/NO thesis proposal.docx",
|
||||
"mtime": "2024-01-26T15:06:09Z"
|
||||
},
|
||||
{
|
||||
"id": "81047646_0",
|
||||
"source": "Metals II Syllabus.pdf",
|
||||
"filepath": "/home/aaron/nextcloud/data/data/aaron/files/Professional/Job Applications/Job Apps Fall 2015/App State/Metals II Syllabus.pdf",
|
||||
"mtime": "2012-02-26T22:45:15Z"
|
||||
},
|
||||
{
|
||||
"id": "81047646_1",
|
||||
"source": "Metals II Syllabus.pdf",
|
||||
"filepath": "/home/aaron/nextcloud/data/data/aaron/files/Professional/Job Applications/Job Apps Fall 2015/App State/Metals II Syllabus.pdf",
|
||||
"mtime": "2012-02-26T22:45:15Z"
|
||||
},
|
||||
{
|
||||
"id": "4e49d3b4_4",
|
||||
"source": "Circuit Intro.pdf",
|
||||
"filepath": "/home/aaron/nextcloud/data/data/aaron/files/Academic/DDF310 Mechatronics/Week 1/Circuit Intro.pdf",
|
||||
"mtime": "2022-01-31T23:28:56Z"
|
||||
},
|
||||
{
|
||||
"id": "81047646_2",
|
||||
"source": "Metals II Syllabus.pdf",
|
||||
"filepath": "/home/aaron/nextcloud/data/data/aaron/files/Professional/Job Applications/Job Apps Fall 2015/App State/Metals II Syllabus.pdf",
|
||||
"mtime": "2012-02-26T22:45:15Z"
|
||||
}
|
||||
]
|
||||
},
|
||||
"rows_without_filepath": {
|
||||
"total": 744,
|
||||
"distinct_basenames": 228,
|
||||
"unique_hit": 211,
|
||||
"collision_hit": 16,
|
||||
"unfound": 1
|
||||
},
|
||||
"collision_shapes": {
|
||||
"total": 109,
|
||||
"shape_counts": {
|
||||
"multi-live": 95,
|
||||
"live+archive": 14
|
||||
},
|
||||
"rows_affected_by_shape": {
|
||||
"multi-live": 85,
|
||||
"live+archive": 0
|
||||
},
|
||||
"samples": {
|
||||
"multi-live": [
|
||||
{
|
||||
"name": "README.md",
|
||||
"rows_no_fp_using_this_name": 0,
|
||||
"candidates": [
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/README.md",
|
||||
"mtime": "2026-04-25T17:08:01Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Code/Processing/Nature of Code/The-Nature-of-Code-Examples/The-Nature-of-Code-Examples-master/README.md",
|
||||
"mtime": "2017-03-09T23:32:59Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Code/Python/open CV/opencv/sources/samples/hal/README.md",
|
||||
"mtime": "2016-12-21T10:37:05Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Code/Python/open CV/opencv/sources/platforms/maven/README.md",
|
||||
"mtime": "2016-12-21T10:37:05Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Code/Python/open CV/opencv/sources/README.md",
|
||||
"mtime": "2016-12-21T10:37:03Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Code/Python/open CV/opencv/sources/3rdparty/openvx/README.md",
|
||||
"mtime": "2016-12-21T10:37:03Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Code/Python/open CV/opencv/sources/3rdparty/openvx/hal/README.md",
|
||||
"mtime": "2016-12-21T10:37:03Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Code/Python/open CV/opencv/sources/3rdparty/carotene/README.md",
|
||||
"mtime": "2016-12-21T10:37:02Z"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "3DPrinting_v2.pptx",
|
||||
"rows_no_fp_using_this_name": 4,
|
||||
"candidates": [
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Presentations/Invited/Innovation Center/3DPrinting_v2.pptx",
|
||||
"mtime": "2026-04-24T19:34:49Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Presentations/Invited/Cuba/Assets/3DPrinting_v2.pptx",
|
||||
"mtime": "2026-04-24T19:34:18Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Presentations/Conference/3D Printing/3DPrinting_v2.pptx",
|
||||
"mtime": "2026-04-24T19:34:15Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Workshops/3DPrinting_v2.pptx",
|
||||
"mtime": "2026-04-24T19:30:14Z"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Print in Place.docx",
|
||||
"rows_no_fp_using_this_name": 0,
|
||||
"candidates": [
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Academic/DDF205 CAD1/Print in Place.docx",
|
||||
"mtime": "2017-08-24T03:50:36Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Academic/ARS393 CVS1/Print in Place.docx",
|
||||
"mtime": "2015-10-28T20:36:52Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"live+archive": [
|
||||
{
|
||||
"name": "dreamer-design-spec.md",
|
||||
"rows_no_fp_using_this_name": 0,
|
||||
"candidates": [
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Journal/dreamer-design-spec.md",
|
||||
"mtime": "2026-04-25T22:55:11Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Archive/dreamer-design-spec.md",
|
||||
"mtime": "2026-04-25T22:55:11Z"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "BirdAI-Ingest-Architecture.md",
|
||||
"rows_no_fp_using_this_name": 0,
|
||||
"candidates": [
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Journal/BirdAI-Ingest-Architecture.md",
|
||||
"mtime": "2026-04-28T00:08:38Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Archive/BirdAI-Ingest-Architecture.md",
|
||||
"mtime": "2026-04-28T00:08:38Z"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "graphiti-migration-plan.md",
|
||||
"rows_no_fp_using_this_name": 0,
|
||||
"candidates": [
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Journal/graphiti-migration-plan.md",
|
||||
"mtime": "2026-04-27T17:54:40Z"
|
||||
},
|
||||
{
|
||||
"path": "/home/aaron/nextcloud/data/data/aaron/files/Archive/Migration Plans/graphiti-migration-plan.md",
|
||||
"mtime": "2026-04-27T17:54:40Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"section_4": {
|
||||
"export_dir_exists": true,
|
||||
"files": [
|
||||
{
|
||||
"name": "conversations-000.json",
|
||||
"size": 19050556,
|
||||
"mtime": "2026-04-24T19:55:44Z"
|
||||
},
|
||||
{
|
||||
"name": "conversations-001.json",
|
||||
"size": 29057594,
|
||||
"mtime": "2026-04-24T19:55:44Z"
|
||||
}
|
||||
],
|
||||
"convo_index_size": 169,
|
||||
"sample_results": [
|
||||
{
|
||||
"id": "chatgpt_87cc0c47-aaf9-42da-8169-3b8922f3afba_0",
|
||||
"source": "ChatGPT: Dog named Bird",
|
||||
"convo_id": "87cc0c47-aaf9-42da-8169-3b8922f3afba",
|
||||
"create_time": 1708835138.51948,
|
||||
"create_time_iso": "2024-02-25T04:25:38.519480Z",
|
||||
"resolved": true
|
||||
},
|
||||
{
|
||||
"id": "chatgpt_689fab3e-d79c-8333-aeb5-7da4e9ca160d_0",
|
||||
"source": "ChatGPT: Video understanding limitations",
|
||||
"convo_id": "689fab3e-d79c-8333-aeb5-7da4e9ca160d",
|
||||
"create_time": 1755294541.894811,
|
||||
"create_time_iso": "2025-08-15T21:49:01.894811Z",
|
||||
"resolved": true
|
||||
},
|
||||
{
|
||||
"id": "chatgpt_611ff391-7fc0-42ea-bfd9-18dbe1739f19_7",
|
||||
"source": "ChatGPT: Calculating Truncated Cone Angle",
|
||||
"convo_id": "611ff391-7fc0-42ea-bfd9-18dbe1739f19",
|
||||
"create_time": 1724020869.471264,
|
||||
"create_time_iso": "2024-08-18T22:41:09.471264Z",
|
||||
"resolved": true
|
||||
},
|
||||
{
|
||||
"id": "chatgpt_68ce1921-084c-8330-877c-78df1e03e54c_50",
|
||||
"source": "ChatGPT: Soul music playlist ideas",
|
||||
"convo_id": "68ce1921-084c-8330-877c-78df1e03e54c",
|
||||
"create_time": 1758337313.438344,
|
||||
"create_time_iso": "2025-09-20T03:01:53.438344Z",
|
||||
"resolved": true
|
||||
},
|
||||
{
|
||||
"id": "chatgpt_c02e94f0-17db-4fd9-be04-13aaa1b728cb_1",
|
||||
"source": "ChatGPT: Create Rhino plugin in Python",
|
||||
"convo_id": "c02e94f0-17db-4fd9-be04-13aaa1b728cb",
|
||||
"create_time": 1682716259.557353,
|
||||
"create_time_iso": "2023-04-28T21:10:59.557353Z",
|
||||
"resolved": true
|
||||
}
|
||||
],
|
||||
"sample_resolved": 5,
|
||||
"full_cohort": {
|
||||
"distinct_convo_ids": 168,
|
||||
"resolvable_from_export": 168,
|
||||
"unresolvable": 0
|
||||
}
|
||||
},
|
||||
"section_5": {
|
||||
"earliest_per_type": [
|
||||
{
|
||||
"type": "aaronai_conversation",
|
||||
"earliest": "2026-04-26T17:43:28.056503",
|
||||
"latest": "2026-05-03T01:45:21.469613",
|
||||
"rows": 71
|
||||
},
|
||||
{
|
||||
"type": "claude_conversation",
|
||||
"earliest": "2026-02-28T20:33:36.146998Z",
|
||||
"latest": "2026-04-23T04:26:00.015419Z",
|
||||
"rows": 1074
|
||||
},
|
||||
{
|
||||
"type": "document",
|
||||
"earliest": "2026-04-30 16:42:55.360736+00",
|
||||
"latest": "2026-05-03 20:14:33.13663+00",
|
||||
"rows": 815
|
||||
}
|
||||
],
|
||||
"git_findings": [
|
||||
"037d7475738352dd13620486b5154d58fa6c037b 2026-04-28 00:15:46 +0000 chore: archive deprecated chromadb and migration scripts",
|
||||
"67766371789276ec4bcb8bac271b6eb9ddafa888 2026-04-27 05:16:37 +0000 Remove hardcoded PG password fallbacks \u2014 require PG_DSN env var in all scripts",
|
||||
"f78b83042bf2bb3d95c3604ee5d4431e76b103df 2026-04-26 21:16:04 +0000 Migrate to pgvector \u2014 remove ChromaDB from api.py, ingest scripts, dream.py",
|
||||
"8c8fba11b8d1b359b9b7722fc19b6ef562b812d8 2026-04-26 21:28:40 +0000 Add nightly conversation indexing \u2014 Aaron AI conversations into pgvector at 2:30AM",
|
||||
"f78b83042bf2bb3d95c3604ee5d4431e76b103df 2026-04-26 21:16:04 +0000 Migrate to pgvector \u2014 remove ChromaDB from api.py, ingest scripts, dream.py",
|
||||
"d2eed9890665a78a37fb5d336e8af75e7f2acb42 2026-04-26 20:19:49 +0000 Pre-pgvector migration checkpoint \u2014 upsert, allow_replace_deleted, maintenance timer"
|
||||
],
|
||||
"chromadb_candidates": [],
|
||||
"proposed_sentinel": "2026-04-26T00:00:00Z",
|
||||
"reasoning": "git f78b830 'Migrate to pgvector \u2014 remove ChromaDB from api.py, ingest scripts, dream.py' is dated 2026-04-26. The earliest type='document' row with a non-NULL created_at lands 2026-04-30 (the F11 canonical-encoding cutover). Rows with NULL created_at all predate F11 and most predate the pgvector cutover itself. 2026-04-26 is the date the ChromaDB->pgvector migration script was committed, so any row currently in the embeddings table with NULL created_at must have been ingested on or after that date (when the table came into existence in current form). It is the tightest defensible upper bound on 'the row entered pgvector before timestamps were tracked', so it is the right sentinel."
|
||||
},
|
||||
"section_6": [
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "f66c7390_6",
|
||||
"source": "Design Guide - FDM for Composite Tooling 2.0.pdf",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2023-08-24T18:17:01Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "9cf798f8_151",
|
||||
"source": "Shop Class as Soulcraft An inquiry into the value of the -- Crawford, Matthew.pdf",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-30T21:17:40.708026Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "fc378df0_329",
|
||||
"source": "ulysses.txt",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2017-10-12T14:20:59Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "812bd5c6_0",
|
||||
"source": "Bennington College Cover Letter.pdf",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2013-03-29T20:32:23Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "91ccefdd_185",
|
||||
"source": "Cognition in the Wild (A Bradford Book) -- Hutchins, Edwin.pdf",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-25T17:21:35Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "48fa3d53_2",
|
||||
"source": "CMakeLists.txt",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2016-12-21T10:37:05Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "49e3545d_9",
|
||||
"source": "RH50-TM-L1-EN-20140902.pdf",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2014-09-02T18:44:08Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "a8366d89_144",
|
||||
"source": "Hackers and Painters_ Big Ideas from the Computer Age -- Graham, Paul.pdf",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-24T22:25:03Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "3e3097f8_46",
|
||||
"source": "The Nature and Art of Workmanship -- David Pye.pdf",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-24T22:24:03Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "A (type NULL, ca NULL)",
|
||||
"id": "87f9a5cf_269",
|
||||
"source": "Supersizing the Mind_ Embodiment, Action, and Cognitive -- Andy Clark.pdf",
|
||||
"existing_type": null,
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-25T17:14:25Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "cd3d1914_61",
|
||||
"source": "The world beyond your head _ on becoming an individual in an -- Crawford, Matthew B.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-27T16:04:25Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "592a1366_0",
|
||||
"source": "2026-04-29-synthesis.md",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-29T08:00:57.634567Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "cfb0a691_3",
|
||||
"source": "Consolidator-0.1-Specification.md",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-29T03:34:31Z",
|
||||
"inferred_ca_source": "watcher_state_unique"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "cd3d1914_57",
|
||||
"source": "The world beyond your head _ on becoming an individual in an -- Crawford, Matthew B.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-27T16:04:25Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "e65ef61c_8",
|
||||
"source": "BirdAI-Research-Context.md",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-29T15:57:07Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "4dce2922_3",
|
||||
"source": "cascade-optimization-protocol.md",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-28T05:46:24Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "077cc52d_1",
|
||||
"source": "graphiti-migration-plan.md",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-27T17:54:40Z",
|
||||
"inferred_ca_source": "watcher_state_collision_pick_latest_of_2"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "db356b14_70",
|
||||
"source": "Finite and infinite games -- James Carse.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-27T06:11:55Z",
|
||||
"inferred_ca_source": "watcher_state_collision_pick_latest_of_2"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "1f15bccf_38",
|
||||
"source": "BirdAI-Experiments-Log.md",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-05-01T16:40:02Z",
|
||||
"inferred_ca_source": "filepath_stat"
|
||||
},
|
||||
{
|
||||
"cohort": "B-doc-old (type='document', ca NULL)",
|
||||
"id": "db356b14_13",
|
||||
"source": "Finite and infinite games -- James Carse.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-27T06:11:55Z",
|
||||
"inferred_ca_source": "watcher_state_collision_pick_latest_of_2"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_68fd20c6-d838-832d-90f4-154f63281f49_30",
|
||||
"source": "ChatGPT: External review for tenure",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_691d6420-f544-8329-ae4b-f2b78da44c0e_7",
|
||||
"source": "ChatGPT: Website styling changes",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_67fc4254-ef50-8009-9e0f-81864cca7cec_1",
|
||||
"source": "ChatGPT: Job Application Review",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_68f3d936-d74c-8329-91df-fe838e292170_5",
|
||||
"source": "ChatGPT: SEC coaches with OSU ties",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_691d1b5b-bb4c-832b-8d2e-11a86a569fcc_4",
|
||||
"source": "ChatGPT: Hosting app platforms",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_bfa1cd2f-b8ab-4b11-b844-c47b2fa70612_1",
|
||||
"source": "ChatGPT: New chat",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_68ce1921-084c-8330-877c-78df1e03e54c_37",
|
||||
"source": "ChatGPT: Soul music playlist ideas",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_68fd20c6-d838-832d-90f4-154f63281f49_10",
|
||||
"source": "ChatGPT: External review for tenure",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_691d6420-f544-8329-ae4b-f2b78da44c0e_10",
|
||||
"source": "ChatGPT: Website styling changes",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "B-chatgpt (type='chatgpt_conversation', ca NULL)",
|
||||
"id": "chatgpt_690286bd-0758-8332-8491-5d00c77f4696_1",
|
||||
"source": "ChatGPT: Airbrushing and finishing setup",
|
||||
"existing_type": "chatgpt_conversation",
|
||||
"existing_ca": null,
|
||||
"inferred_type": "chatgpt_conversation",
|
||||
"inferred_ca": "2026-04-26T00:00:00Z",
|
||||
"inferred_ca_source": "sentinel"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "6ef0e329_0",
|
||||
"source": "schematic-substrate-analysis.md",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-05-01 16:42:13.360795+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-05-01 16:42:13.360795+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "02db1224_208",
|
||||
"source": "How Buildings Learn What Happens After They are Built -- Stewart Brand.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "ead32317_93",
|
||||
"source": "Richard Sennett - The Craftsman.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-04-30 22:23:34.012202+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-30 22:23:34.012202+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "6ef0e329_4",
|
||||
"source": "schematic-substrate-analysis.md",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-05-01 16:42:13.360795+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-05-01 16:42:13.360795+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "02db1224_175",
|
||||
"source": "How Buildings Learn What Happens After They are Built -- Stewart Brand.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "02db1224_101",
|
||||
"source": "How Buildings Learn What Happens After They are Built -- Stewart Brand.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "02db1224_268",
|
||||
"source": "How Buildings Learn What Happens After They are Built -- Stewart Brand.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "6ef0e329_5",
|
||||
"source": "schematic-substrate-analysis.md",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-05-01 16:42:13.360795+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-05-01 16:42:13.360795+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "ead32317_132",
|
||||
"source": "Richard Sennett - The Craftsman.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-04-30 22:23:34.012202+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-30 22:23:34.012202+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-doc-new (type='document', ca set)",
|
||||
"id": "02db1224_86",
|
||||
"source": "How Buildings Learn What Happens After They are Built -- Stewart Brand.pdf",
|
||||
"existing_type": "document",
|
||||
"existing_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_type": "document",
|
||||
"inferred_ca": "2026-04-30 22:21:56.211381+00",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-claude (type='claude_conversation', ca set)",
|
||||
"id": "claude_dacf89e3-1ee7-400d-8461-ef5920c82fe3_96",
|
||||
"source": "Claude: University of Utah interview teaching example",
|
||||
"existing_type": "claude_conversation",
|
||||
"existing_ca": "2026-03-11T18:05:57.594832Z",
|
||||
"inferred_type": "claude_conversation",
|
||||
"inferred_ca": "2026-03-11T18:05:57.594832Z",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-claude (type='claude_conversation', ca set)",
|
||||
"id": "claude_c0baf4b0-a7bb-4664-ac7b-98d7b02f56a6_26",
|
||||
"source": "Claude: Weighing Utah versus Oklahoma",
|
||||
"existing_type": "claude_conversation",
|
||||
"existing_ca": "2026-04-01T19:08:26.722197Z",
|
||||
"inferred_type": "claude_conversation",
|
||||
"inferred_ca": "2026-04-01T19:08:26.722197Z",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-claude (type='claude_conversation', ca set)",
|
||||
"id": "claude_bbe0172d-3087-4238-a51c-7dca6c0b6f28_92",
|
||||
"source": "Claude: Setting up a custom OpenClaw instance",
|
||||
"existing_type": "claude_conversation",
|
||||
"existing_ca": "2026-04-23T04:26:00.015419Z",
|
||||
"inferred_type": "claude_conversation",
|
||||
"inferred_ca": "2026-04-23T04:26:00.015419Z",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-claude (type='claude_conversation', ca set)",
|
||||
"id": "claude_42dbddc5-12ba-4de7-a685-043473189da9_6",
|
||||
"source": "Claude: I filling out my annual report...",
|
||||
"existing_type": "claude_conversation",
|
||||
"existing_ca": "2026-03-24T14:34:47.870625Z",
|
||||
"inferred_type": "claude_conversation",
|
||||
"inferred_ca": "2026-03-24T14:34:47.870625Z",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-claude (type='claude_conversation', ca set)",
|
||||
"id": "claude_bbe0172d-3087-4238-a51c-7dca6c0b6f28_1344",
|
||||
"source": "Claude: Setting up a custom OpenClaw instance",
|
||||
"existing_type": "claude_conversation",
|
||||
"existing_ca": "2026-04-23T04:26:00.015419Z",
|
||||
"inferred_type": "claude_conversation",
|
||||
"inferred_ca": "2026-04-23T04:26:00.015419Z",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-aaronai (type='aaronai_conversation', ca set)",
|
||||
"id": "aaronai_conv_28ee8a447d3fc922_6",
|
||||
"source": "Aaron AI: I'm working on you",
|
||||
"existing_type": "aaronai_conversation",
|
||||
"existing_ca": "2026-04-26T17:43:28.056503",
|
||||
"inferred_type": "aaronai_conversation",
|
||||
"inferred_ca": "2026-04-26T17:43:28.056503",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-aaronai (type='aaronai_conversation', ca set)",
|
||||
"id": "aaronai_conv_7deef2e8001f0e45_20",
|
||||
"source": "Aaron AI: Who's covering for me on sabbatical?",
|
||||
"existing_type": "aaronai_conversation",
|
||||
"existing_ca": "2026-04-29T22:19:45.312349",
|
||||
"inferred_type": "aaronai_conversation",
|
||||
"inferred_ca": "2026-04-29T22:19:45.312349",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-aaronai (type='aaronai_conversation', ca set)",
|
||||
"id": "aaronai_conv_21cabf771708df70_42",
|
||||
"source": "Aaron AI: What should I be the most excited about right now?",
|
||||
"existing_type": "aaronai_conversation",
|
||||
"existing_ca": "2026-04-27T07:06:03.996026",
|
||||
"inferred_type": "aaronai_conversation",
|
||||
"inferred_ca": "2026-04-27T07:06:03.996026",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-aaronai (type='aaronai_conversation', ca set)",
|
||||
"id": "aaronai_conv_7deef2e8001f0e45_12",
|
||||
"source": "Aaron AI: Who's covering for me on sabbatical?",
|
||||
"existing_type": "aaronai_conversation",
|
||||
"existing_ca": "2026-04-29T22:19:45.312349",
|
||||
"inferred_type": "aaronai_conversation",
|
||||
"inferred_ca": "2026-04-29T22:19:45.312349",
|
||||
"inferred_ca_source": "preserved"
|
||||
},
|
||||
{
|
||||
"cohort": "C-aaronai (type='aaronai_conversation', ca set)",
|
||||
"id": "aaronai_conv_ed40b4278a9c8110_4",
|
||||
"source": "Aaron AI: Let's say you're building an analog of the human brain, and ...",
|
||||
"existing_type": "aaronai_conversation",
|
||||
"existing_ca": "2026-05-03T01:45:21.469613",
|
||||
"inferred_type": "aaronai_conversation",
|
||||
"inferred_ca": "2026-05-03T01:45:21.469613",
|
||||
"inferred_ca_source": "preserved"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,304 @@
|
||||
"""Backfill embeddings.type and embeddings.created_at (Improvement #2 / A.3).
|
||||
|
||||
Idempotent on cohort predicates (every WHERE clause includes IS NULL on the
|
||||
target column). Writes provenance to metadata.type_source and metadata.created_at_source
|
||||
so each row is auditable and revertable per-source. Dry-run is the default; pass --apply to commit.
|
||||
|
||||
Order of batches:
|
||||
T1. type backfill: WHERE type IS NULL -> 'document' (extension-classified, all hit).
|
||||
C1. created_at: WHERE ca IS NULL AND metadata.filepath stat-resolves -> filesystem mtime.
|
||||
C2. created_at: WHERE ca IS NULL AND source has unique watcher_state path -> watcher mtime.
|
||||
C3. created_at: WHERE ca IS NULL AND source has watcher_state collision -> most-recent mtime.
|
||||
C4. created_at: WHERE type='chatgpt_conversation' AND ca IS NULL -> export-resolved create_time.
|
||||
C5. created_at: WHERE ca IS NULL (residual) -> sentinel.
|
||||
|
||||
Snapshot table embeddings_backup_2026_05_03 must exist before --apply.
|
||||
|
||||
Usage:
|
||||
venv/bin/python3 scripts/experiments/embeddings_backfill_apply.py # dry-run
|
||||
venv/bin/python3 scripts/experiments/embeddings_backfill_apply.py --apply # write
|
||||
|
||||
Exits non-zero if snapshot is missing on --apply.
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, Json
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables (PG_DSN is read below) from the project .env file.
load_dotenv(Path.home() / "aaronai" / ".env")

# PostgreSQL connection string handed to psycopg2.connect(); None if unset.
PG_DSN = os.getenv("PG_DSN")
# JSON file read by load_watcher_state(), which iterates it as {path: mtime}.
WATCHER_STATE = Path.home() / "aaronai" / "watcher_state.json"
# Directory scanned for conversations*.json export files (batch C4).
CHATGPT_EXPORT_DIR = Path("/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export")
# Pre-backfill snapshot table; assert_snapshot() requires it before --apply.
SNAPSHOT_TABLE = "embeddings_backup_2026_05_03"
# created_at value written by batch C5 to rows no earlier batch resolved.
SENTINEL_ISO = "2026-04-26T00:00:00Z"
|
||||
|
||||
|
||||
# ─── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_pg():
    """Open a PostgreSQL connection to PG_DSN whose cursors use RealDictCursor."""
    connection = psycopg2.connect(PG_DSN, cursor_factory=RealDictCursor)
    return connection
|
||||
|
||||
|
||||
def header(t):
    """Print *t* framed above and below by a 70-character ``=`` rule."""
    rule = "=" * 70
    print("\n{0}\n{1}\n{0}".format(rule, t))
|
||||
|
||||
|
||||
def fmt_ts_unix(ts):
    """Convert a unix timestamp (number or numeric string) to ISO-8601 UTC.

    The ``+00:00`` offset is rendered as a trailing ``Z``. Raises whatever
    ``float()`` / ``datetime.fromtimestamp()`` raise on bad input.
    """
    moment = datetime.fromtimestamp(float(ts), tz=timezone.utc)
    iso = moment.isoformat()
    return iso.replace("+00:00", "Z")
|
||||
|
||||
|
||||
def fmt_ts_mtime(p):
    """Return the mtime of path *p* as ISO-8601 UTC with a ``Z`` suffix.

    Best-effort: any failure (missing file, bad argument, ...) yields None.
    """
    try:
        modified = p.stat().st_mtime
        stamp = datetime.fromtimestamp(modified, tz=timezone.utc)
        return stamp.isoformat().replace("+00:00", "Z")
    except Exception:
        return None
|
||||
|
||||
|
||||
def load_watcher_state():
    """Index watcher_state.json by file basename.

    Returns a defaultdict mapping basename -> list of (full_path, mtime)
    pairs, in the order the entries appear in the JSON file.
    """
    raw = WATCHER_STATE.read_text()
    grouped = defaultdict(list)
    for full_path, mtime in json.loads(raw).items():
        basename = Path(full_path).name
        grouped[basename].append((full_path, mtime))
    return grouped
|
||||
|
||||
|
||||
def load_chatgpt_index():
    """Build a conversation-id -> create_time index from the ChatGPT export.

    Reads every ``conversations*.json`` under CHATGPT_EXPORT_DIR in sorted
    filename order; when an id appears more than once the later entry wins.
    An entry is indexed only if it has an id (``id`` or ``conversation_id``)
    and a non-None ``create_time``.

    Unreadable, unparseable or unexpectedly-shaped files are skipped with a
    warning on stdout instead of silently: a dropped export file sends its
    rows to the sentinel batch, so the operator needs to see it.

    Returns:
        dict mapping conversation id -> raw ``create_time`` value; empty when
        the export directory does not exist.
    """
    if not CHATGPT_EXPORT_DIR.exists():
        return {}
    index = {}
    for export_file in sorted(CHATGPT_EXPORT_DIR.glob("conversations*.json")):
        try:
            data = json.loads(export_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSON and UTF-8 decode errors.
            print(f" WARNING: skipping unreadable export {export_file.name}: {exc}")
            continue
        if not isinstance(data, list):
            print(f" WARNING: skipping {export_file.name}: expected a JSON list, "
                  f"got {type(data).__name__}")
            continue
        for convo in data:
            if not isinstance(convo, dict):
                continue
            cid = convo.get("id") or convo.get("conversation_id")
            ct = convo.get("create_time")
            if cid and ct is not None:
                index[cid] = ct
    return index
|
||||
|
||||
|
||||
def assert_snapshot(cur):
    """Exit with status 2 unless the snapshot table is present and current.

    "Current" means its row count equals the live ``embeddings`` row count.
    Both counts are printed before the comparison.
    """
    def _row_count(table):
        cur.execute(f"SELECT COUNT(*) AS n FROM {table};")
        return cur.fetchone()["n"]

    cur.execute("SELECT to_regclass(%s) AS t;", (SNAPSHOT_TABLE,))
    resolved = cur.fetchone()["t"]
    if resolved is None:
        print(f"ERROR: snapshot table '{SNAPSHOT_TABLE}' not found. Run A.2 first.")
        sys.exit(2)

    snap = _row_count(SNAPSHOT_TABLE)
    live = _row_count("embeddings")
    print(f"snapshot {SNAPSHOT_TABLE}: {snap} rows; live embeddings: {live} rows")
    if snap == live:
        return
    print(f"ERROR: snapshot row count != live ({snap} vs {live}). Refresh snapshot before --apply.")
    sys.exit(2)
|
||||
|
||||
|
||||
# ─── Batch primitive ────────────────────────────────────────────────────────
|
||||
|
||||
def run_batch(cur, label, candidates, apply_mode):
    """Issue one UPDATE per candidate row and return the rows affected.

    Args:
        cur: open DB cursor; the caller owns commit/rollback.
        label: batch name used in progress output.
        candidates: list of (id, set_type, set_ca, type_source, ca_source).
            set_type / set_ca may be None to leave that column alone; a falsy
            type_source / ca_source leaves the matching metadata key alone.
        apply_mode: only affects the log message here. In dry-run the UPDATEs
            are still executed inside the caller's transaction (rolled back at
            the end) so later batches' SELECTs see the intermediate state.

    Returns:
        Sum of ``cur.rowcount`` over the executed UPDATEs.
    """
    n = len(candidates)
    print(f" {label}: {n} rows queued")
    if n == 0:
        return 0
    for c in candidates[:3]:
        print(f" sample: id={c[0]} type={c[1]!r} ca={c[2]!r} type_src={c[3]} ca_src={c[4]}")
    n_written = 0
    for row_id, set_type, set_ca, type_src, ca_src in candidates:
        meta_patch = {}
        if type_src:
            meta_patch["type_source"] = type_src
        if ca_src:
            meta_patch["created_at_source"] = ca_src
        # Build the SET list dynamically so untouched columns are not rewritten.
        sets, params = [], []
        if set_type is not None:
            sets.append("type = %s")
            params.append(set_type)
        if set_ca is not None:
            sets.append("created_at = %s")
            params.append(set_ca)
        if meta_patch:
            # jsonb || jsonb merges keys, preserving existing metadata.
            sets.append("metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb")
            params.append(json.dumps(meta_patch))
        if not sets:
            # Nothing to change: "UPDATE ... SET  WHERE" would be invalid SQL.
            continue
        params.append(row_id)
        cur.execute(f"UPDATE embeddings SET {', '.join(sets)} WHERE id = %s;", params)
        n_written += cur.rowcount
    print(f" {n_written} rows updated{' (will rollback)' if not apply_mode else ''}")
    return n_written
|
||||
|
||||
|
||||
# ─── Batches ────────────────────────────────────────────────────────────────
|
||||
|
||||
def batch_T1_type(cur, apply_mode):
    """Batch T1: set type='document' on every row whose type is NULL.

    Provenance is recorded as type_source='inferred_extension'; created_at is
    left untouched. Returns the number of rows updated.
    """
    cur.execute("""
        SELECT id, source FROM embeddings WHERE type IS NULL ORDER BY id;
    """)
    queued = []
    for row in cur.fetchall():
        queued.append((row["id"], "document", None, "inferred_extension", None))
    return run_batch(cur, "T1 type IS NULL -> 'document'", queued, apply_mode)
|
||||
|
||||
|
||||
def batch_C1_filepath_stat(cur, apply_mode):
    """Batch C1: created_at IS NULL and metadata.filepath stats -> file mtime.

    Rows whose filepath is empty, missing on disk, or unstattable are skipped
    and left for the later batches. Provenance is recorded as
    created_at_source='filepath_stat'. Returns the number of rows updated.
    """
    cur.execute("""
        SELECT id, source, metadata->>'filepath' AS fp
        FROM embeddings
        WHERE created_at IS NULL AND metadata->>'filepath' IS NOT NULL
        ORDER BY id;
    """)
    rows = cur.fetchall()
    cands, n_skipped_missing = [], 0
    for r in rows:
        fp = r["fp"]
        # An empty string would become Path(""), i.e. ".", which exists and
        # would stamp the row with the working directory's mtime. Treat it as
        # unresolved, matching the truthiness check in the C2/C3 batch.
        if fp:
            p = Path(fp)
            if p.exists():
                mt = fmt_ts_mtime(p)
                if mt:
                    cands.append((r["id"], None, mt, None, "filepath_stat"))
                    continue
        n_skipped_missing += 1
    print(f" C1 candidates: {len(cands)} (skipped {n_skipped_missing} where filepath gone or unstattable)")
    return run_batch(cur, "C1 ca IS NULL AND filepath stat-resolves -> mtime", cands, apply_mode)
|
||||
|
||||
|
||||
def batch_C2_C3_watcher_state(cur, apply_mode):
    """Batches C2 and C3: resolve created_at from watcher_state by basename.

    Considers rows with created_at IS NULL whose metadata.filepath is absent
    or not on disk (rows with an existing filepath belong to C1). The row's
    ``source`` is looked up as a basename in watcher_state:
      C2 = exactly one tracked path has that name -> its mtime;
      C3 = several do -> the most recent mtime among them.
    Returns the combined number of rows updated by both batches.
    """
    by_name = load_watcher_state()
    cur.execute("""
        SELECT id, source, metadata->>'filepath' AS fp
        FROM embeddings
        WHERE created_at IS NULL
        ORDER BY id;
    """)
    unique_hits, collision_hits = [], []
    unmatched = 0
    for row in cur.fetchall():
        fp = row["fp"]
        # skip rows already targeted by C1 path
        if fp and Path(fp).exists():
            continue
        src = row["source"]
        if not (src and src in by_name):
            unmatched += 1
            continue
        matches = by_name[src]
        if len(matches) != 1:
            newest = max(matches, key=lambda pair: float(pair[1]))
            provenance = f"watcher_state_collision_pick_latest_of_{len(matches)}"
            collision_hits.append((row["id"], None, fmt_ts_unix(newest[1]), None, provenance))
        else:
            only_mtime = matches[0][1]
            unique_hits.append((row["id"], None, fmt_ts_unix(only_mtime), None, "watcher_state_unique"))
    print(f" C2/C3 source-basename fallback: {len(unique_hits)} unique, {len(collision_hits)} collision, "
          f"{unmatched} unmatched (will fall to C4/C5)")
    updated_c2 = run_batch(cur, "C2 ca IS NULL AND watcher_state unique -> mtime", unique_hits, apply_mode)
    updated_c3 = run_batch(cur, "C3 ca IS NULL AND watcher_state collision -> latest mtime", collision_hits, apply_mode)
    return updated_c2 + updated_c3
|
||||
|
||||
|
||||
def batch_C4_chatgpt_export(cur, apply_mode):
    """Batch C4: chatgpt_conversation rows with NULL created_at -> export create_time.

    The conversation id is taken from row ids shaped ``chatgpt_<cid>_<idx>``
    and looked up in the export index. Rows whose id does not match or whose
    conversation is not in the index are counted as unresolved and left for
    C5. Returns the number of rows updated.
    """
    index = load_chatgpt_index()
    cur.execute("""
        SELECT id, source FROM embeddings
        WHERE type='chatgpt_conversation' AND created_at IS NULL ORDER BY id;
    """)
    id_pattern = re.compile(r"^chatgpt_(.+)_(\d+)$")
    resolved_rows = []
    unresolved = 0
    for row in cur.fetchall():
        hit = id_pattern.match(row["id"])
        convo_id = hit.group(1) if hit else None
        create_time = index.get(convo_id)
        if create_time is None:
            unresolved += 1
        else:
            resolved_rows.append((row["id"], None, fmt_ts_unix(create_time), None, "chatgpt_export"))
    print(f" C4 chatgpt export resolution: {len(resolved_rows)} resolved, {unresolved} unresolved (fall to C5)")
    return run_batch(cur, "C4 type='chatgpt_conversation' AND ca IS NULL -> export create_time", resolved_rows, apply_mode)
|
||||
|
||||
|
||||
def batch_C5_sentinel(cur, apply_mode):
    """Batch C5: stamp every row still lacking created_at with SENTINEL_ISO.

    Prints a per-type breakdown of the residual rows when there are any.
    Provenance is recorded as created_at_source='sentinel'. Returns the
    number of rows updated.
    """
    cur.execute("""
        SELECT id, type, source FROM embeddings WHERE created_at IS NULL ORDER BY id;
    """)
    residual = cur.fetchall()
    queued = []
    for row in residual:
        queued.append((row["id"], None, SENTINEL_ISO, None, "sentinel"))
    if queued:
        type_tally = Counter(row["type"] for row in residual)
        print(f" C5 residual sentinel rows by type: {dict(type_tally)}")
    batch_label = f"C5 ca IS NULL residual -> sentinel {SENTINEL_ISO}"
    return run_batch(cur, batch_label, queued, apply_mode)
|
||||
|
||||
|
||||
# ─── Pre/post counts ────────────────────────────────────────────────────────
|
||||
|
||||
def print_counts(cur, label):
    """Print total, type-NULL and created_at-NULL row counts for ``embeddings``.

    *label* tags the output line (e.g. "before" / "after").
    """
    cur.execute("""
        SELECT
            COUNT(*) AS total,
            COUNT(*) FILTER (WHERE type IS NULL) AS type_null,
            COUNT(*) FILTER (WHERE created_at IS NULL) AS ca_null
        FROM embeddings;
    """)
    counts = cur.fetchone()
    total, type_null, ca_null = counts["total"], counts["type_null"], counts["ca_null"]
    print(f" [{label}] total={total} type_null={type_null} ca_null={ca_null}")
|
||||
|
||||
|
||||
# ─── Driver ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Run all backfill batches in one transaction; commit only with --apply.

    Without --apply the UPDATEs are still executed, so the post-counts show
    the would-be result, and the transaction is then rolled back. With --apply
    the snapshot table is verified first (the process exits with status 2 if
    it is missing or stale).

    The connection is closed on every exit path, including a failing batch or
    the snapshot check exiting; closing without a commit discards any pending
    changes.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--apply", action="store_true", help="default false (dry-run)")
    args = ap.parse_args()
    apply_mode = args.apply

    pg = get_pg()
    try:
        cur = pg.cursor()

        print(f"Mode: {'APPLY (writes will commit)' if apply_mode else 'DRY-RUN (no writes)'}")
        print(f"Sentinel: {SENTINEL_ISO}")

        if apply_mode:
            assert_snapshot(cur)

        header("PRE-COUNTS")
        print_counts(cur, "before")

        # Order matters: each batch selects only rows the earlier ones left NULL.
        header("BATCHES")
        n_t1 = batch_T1_type(cur, apply_mode)
        n_c1 = batch_C1_filepath_stat(cur, apply_mode)
        n_c2c3 = batch_C2_C3_watcher_state(cur, apply_mode)
        n_c4 = batch_C4_chatgpt_export(cur, apply_mode)
        n_c5 = batch_C5_sentinel(cur, apply_mode)

        header("POST-COUNTS")
        print_counts(cur, "after" if apply_mode else "after (in-transaction, will rollback)")

        if apply_mode:
            pg.commit()
            print("\nCOMMITTED.")
        else:
            pg.rollback()
            print("\nROLLED BACK (dry-run).")

        print(f"\nSummary: T1={n_t1} C1={n_c1} C2+C3={n_c2c3} C4={n_c4} C5={n_c5}")
    finally:
        pg.close()
|
||||
|
||||
|
||||
# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,557 @@
|
||||
"""Read-only inspection for the embeddings.type / embeddings.created_at backfill (Improvement #2 / A.1).
|
||||
|
||||
Produces a survey of every backfill source-of-truth question without writing
|
||||
to the database. Output is a human-readable report on stdout plus a JSON
|
||||
sidecar at experiments/embeddings_backfill_inspection_<date>.json.
|
||||
|
||||
Sections:
|
||||
1. Cohort recap (counts; should match prior investigation).
|
||||
2. Cohort A type inference: extension classifier coverage.
|
||||
3. created_at inference for cohort A + B-doc-old:
|
||||
- rows with metadata.filepath: stat the file, check existence.
|
||||
- rows without filepath: lookup source against watcher_state.json.
|
||||
- filename-collision shape audit (live+backup, live+archive, ambiguous).
|
||||
4. ChatGPT export resolution (Plan A.1 addition #1):
|
||||
- existence of /home/aaron/nextcloud/.../ChatGPT Export/.
|
||||
- sample 5 B-chatgpt rows; resolve convo_id -> create_time.
|
||||
5. Sentinel date discovery (Plan A.1 addition #3):
|
||||
- earliest non-NULL created_at per type (already-populated rows are the
|
||||
lower bound for when the substrate started carrying timestamps).
|
||||
- git log for the pgvector migration commit.
|
||||
- any ChromaDB sqlite still on disk.
|
||||
- propose a sentinel with reasoning, or flag as arbitrary.
|
||||
6. 50-row stratified sample: derived (type, created_at, source) per row.
|
||||
|
||||
Usage: venv/bin/python3 scripts/experiments/embeddings_backfill_inspection.py
|
||||
|
||||
Read-only. No DB writes. No filesystem writes outside experiments/.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables (PG_DSN is read below) from the project .env file.
load_dotenv(Path.home() / "aaronai" / ".env")

# PostgreSQL connection string handed to psycopg2.connect(); None if unset.
PG_DSN = os.getenv("PG_DSN")
# JSON file read by load_watcher_state(), which iterates it as {path: mtime}.
WATCHER_STATE = Path.home() / "aaronai" / "watcher_state.json"
# Directory scanned for conversations*.json export files (section 4).
CHATGPT_EXPORT_DIR = Path("/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export")
# Root against which section 3a counts filepaths as inside or outside.
NEXTCLOUD_ROOT = Path("/home/aaron/nextcloud/data/data/aaron/files")
# JSON sidecar path; the date is the local (naive) date at import time.
OUT_PATH = Path.home() / "aaronai" / "experiments" / f"embeddings_backfill_inspection_{datetime.now().strftime('%Y-%m-%d')}.json"

# Lower-case file extensions that section 2 classifies as type 'document'.
SUPPORTED_EXT = {".pdf", ".docx", ".pptx", ".txt", ".md"}

# Fixed seed for Python's random module.
# NOTE(review): samples drawn with SQL "ORDER BY random()" are chosen by
# PostgreSQL and are not governed by this seed.
random.seed(20260503)
|
||||
|
||||
|
||||
# ─── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_pg():
    """Open a PostgreSQL connection to PG_DSN whose cursors use RealDictCursor."""
    connection = psycopg2.connect(PG_DSN, cursor_factory=RealDictCursor)
    return connection
|
||||
|
||||
|
||||
def header(title):
    """Print *title* framed above and below by a 70-character ``=`` rule."""
    rule = "=" * 70
    print("\n{0}\n{1}\n{0}".format(rule, title))
|
||||
|
||||
|
||||
def sub(title):
    """Print a sub-section heading: a blank line, then ``--- title ---``."""
    heading = "--- {} ---".format(title)
    print("\n" + heading)
|
||||
|
||||
|
||||
def fmt_ts_from_unix(ts):
    """Convert a unix timestamp to ISO-8601 UTC with a ``Z`` suffix.

    Watcher state stores unix timestamps as strings, so *ts* is passed
    through ``float()`` first. Returns None if the value cannot be converted.
    """
    try:
        moment = datetime.fromtimestamp(float(ts), tz=timezone.utc)
        return moment.isoformat().replace("+00:00", "Z")
    except Exception:
        return None
|
||||
|
||||
|
||||
def fmt_ts_from_st_mtime(p):
    """Return the mtime of path *p* as ISO-8601 UTC with a ``Z`` suffix.

    Best-effort: any failure (missing file, bad argument, ...) yields None.
    """
    try:
        modified = p.stat().st_mtime
        stamp = datetime.fromtimestamp(modified, tz=timezone.utc)
        return stamp.isoformat().replace("+00:00", "Z")
    except Exception:
        return None
|
||||
|
||||
|
||||
def load_watcher_state():
    """Load watcher_state.json in two views.

    Returns:
        (by_path, by_name) where by_path is the parsed JSON mapping
        path -> mtime_str and by_name is a defaultdict mapping
        basename -> [(path, mtime_str), ...].
    """
    by_path = json.loads(WATCHER_STATE.read_text())
    by_name = defaultdict(list)
    for full_path, mtime in by_path.items():
        basename = Path(full_path).name
        by_name[basename].append((full_path, mtime))
    return by_path, by_name
|
||||
|
||||
|
||||
def classify_collision_shape(paths):
    """Label a group of same-named paths by how many look live versus old.

    A path is backup-like when its lower-cased form contains ``.bak``,
    ``/backup`` or ``backups/``; archived when it contains ``/archive/``;
    live when it is neither. (One path can be both backup-like and archived.)

    Returns one of:
      - 'live+backup'           : one live path, >=1 backup-like, none archived
      - 'live+archive'          : one live path, >=1 archived, none backup-like
      - 'live+mixed-old'        : one live path plus both kinds of old path
      - 'multi-live'            : >=2 live paths
      - 'all-archive-or-backup' : no live path (also the result for no paths)
      - 'other'                 : one live path and nothing else
    """
    n_live = n_backup = n_archive = 0
    for path in paths:
        lowered = path.lower()
        backup_like = any(marker in lowered for marker in (".bak", "/backup", "backups/"))
        archived = "/archive/" in lowered
        n_backup += backup_like
        n_archive += archived
        if not (backup_like or archived):
            n_live += 1

    if n_live == 0:
        return "all-archive-or-backup"
    if n_live >= 2:
        return "multi-live"
    # Exactly one live path from here on.
    if n_backup and not n_archive:
        return "live+backup"
    if n_archive and not n_backup:
        return "live+archive"
    if n_backup + n_archive >= 1:
        return "live+mixed-old"
    return "other"
|
||||
|
||||
|
||||
# ─── Section 1: Cohort recap ────────────────────────────────────────────────
|
||||
|
||||
def section_1_cohort_recap(cur):
    """Section 1: print overall NULL counts and the per-(type, ca_null) cohorts.

    Returns {"overall": <counts row>, "cohorts": <list of cohort rows>} as
    fetched from the cursor.
    """
    header("1. COHORT RECAP")
    cur.execute("""
        SELECT
            COUNT(*) AS total,
            COUNT(*) FILTER (WHERE type IS NULL) AS type_null,
            COUNT(*) FILTER (WHERE created_at IS NULL) AS ca_null,
            COUNT(*) FILTER (WHERE type IS NULL AND created_at IS NULL) AS both_null,
            COUNT(*) FILTER (WHERE type IS NOT NULL AND created_at IS NOT NULL) AS both_set
        FROM embeddings;
    """)
    overall = cur.fetchone()
    summary = (
        "Total: {total} type_null: {type_null} "
        "ca_null: {ca_null} both_null: {both_null} "
        "both_set: {both_set}"
    )
    print(summary.format(**overall))

    cur.execute("""
        SELECT type, created_at IS NULL AS ca_null, COUNT(*) AS n
        FROM embeddings GROUP BY type, ca_null ORDER BY type NULLS LAST, ca_null;
    """)
    cohorts = cur.fetchall()
    sub("Per-(type, ca_null) cohorts")
    for cohort in cohorts:
        type_label = cohort["type"] or "NULL"
        print(f" type={type_label:<22} ca_null={cohort['ca_null']!s:<5} n={cohort['n']}")
    return {"overall": overall, "cohorts": cohorts}
|
||||
|
||||
|
||||
# ─── Section 2: Cohort A type inference ─────────────────────────────────────
|
||||
|
||||
def section_2_type_inference(cur):
    """Section 2: group type-NULL rows by the file extension of ``source``.

    Rows whose lower-cased extension is in SUPPORTED_EXT count as classified
    (they would become type 'document'); all others, including rows with no
    extension, count as unclassifiable.

    Returns {"by_ext": rows, "classified": int, "unclassifiable": int}.
    """
    header("2. COHORT A TYPE INFERENCE (extension classifier)")
    # Raw string: the regex contains "\.", which is an invalid escape sequence
    # in a normal Python string literal.
    cur.execute(r"""
        SELECT LOWER(SUBSTRING(source FROM '\.[^.]+$')) AS ext, COUNT(*) AS rows
        FROM embeddings WHERE type IS NULL
        GROUP BY ext ORDER BY rows DESC;
    """)
    by_ext = cur.fetchall()
    classified = sum(r["rows"] for r in by_ext if r["ext"] in SUPPORTED_EXT)
    unknown = sum(r["rows"] for r in by_ext if r["ext"] not in SUPPORTED_EXT)
    print("NULL-type rows by extension:")
    for r in by_ext:
        flag = "OK" if r["ext"] in SUPPORTED_EXT else "??"
        print(f" {flag} {r['ext'] or '(none)':<8} rows={r['rows']}")
    print(f"\nClassified as 'document' via extension: {classified}")
    print(f"Unclassifiable (no SUPPORTED extension): {unknown}")
    return {"by_ext": by_ext, "classified": classified, "unclassifiable": unknown}
|
||||
|
||||
|
||||
# ─── Section 3: created_at inference ────────────────────────────────────────
|
||||
|
||||
def section_3_created_at_inference(cur):
    """Section 3: survey how created_at could be derived for file-based rows.

    3a. Rows with metadata.filepath: count how many paths exist on disk, are
        missing, or lie outside NEXTCLOUD_ROOT; keep up to 5 resolved samples.
    3b. Rows without metadata.filepath (type NULL or 'document'): look up the
        distinct ``source`` values as basenames in watcher_state.json.
    3c. Classify every basename collision in watcher_state by shape and count
        how many 3b rows each shape affects.

    Returns a dict of the counts and samples, for the JSON sidecar.
    """
    header("3. CREATED_AT INFERENCE — file-derived rows")
    by_path, by_name = load_watcher_state()
    print(f"watcher_state.json: {len(by_path)} tracked paths, "
          f"{len(by_name)} distinct filenames, "
          f"{sum(1 for v in by_name.values() if len(v) > 1)} filename collisions")

    # 3a. Rows with metadata.filepath: probe stat()
    sub("3a. Rows with metadata.filepath — stat probe")
    cur.execute("""
        SELECT id, source, metadata->>'filepath' AS filepath
        FROM embeddings
        WHERE created_at IS NULL AND metadata->>'filepath' IS NOT NULL;
    """)
    rows_with_fp = cur.fetchall()
    fp_exists = 0
    fp_missing = 0
    fp_outside_root = 0
    sample_resolved = []
    for r in rows_with_fp:
        p = Path(r["filepath"])
        # Compare path components rather than a string prefix, so a sibling
        # such as ".../files-old" is not mistaken for being under the root.
        if p != NEXTCLOUD_ROOT and NEXTCLOUD_ROOT not in p.parents:
            fp_outside_root += 1
        # An empty filepath becomes Path("") == ".", which always exists;
        # count it as missing instead.
        if r["filepath"] and p.exists():
            fp_exists += 1
            if len(sample_resolved) < 5:
                sample_resolved.append({
                    "id": r["id"], "source": r["source"],
                    "filepath": str(p), "mtime": fmt_ts_from_st_mtime(p),
                })
        else:
            fp_missing += 1
    print(f" rows with metadata.filepath: {len(rows_with_fp)}")
    print(f" exists on disk: {fp_exists}")
    print(f" missing on disk: {fp_missing}")
    print(f" outside Nextcloud root: {fp_outside_root}")
    print(" Sample of 5 resolved mtimes:")
    for s in sample_resolved:
        # source may be NULL in the table; slice an empty string in that case.
        print(f" {s['id']:<15} {(s['source'] or '')[:60]:<60} mtime={s['mtime']}")

    # 3b. Rows without metadata.filepath: watcher_state lookup
    sub("3b. Rows without metadata.filepath — watcher_state lookup")
    # Same row set as before, with the AND/OR grouping made explicit.
    cur.execute("""
        SELECT id, source FROM embeddings
        WHERE created_at IS NULL
          AND metadata->>'filepath' IS NULL
          AND (type IS NULL OR type='document');
    """)
    rows_no_fp = cur.fetchall()
    # Distinct source basenames to look up
    basenames_to_resolve = sorted({r["source"] for r in rows_no_fp if r["source"]})
    n_resolved_unique = sum(1 for n in basenames_to_resolve if len(by_name.get(n, [])) == 1)
    n_collision_unique = sum(1 for n in basenames_to_resolve if len(by_name.get(n, [])) > 1)
    n_unfound = sum(1 for n in basenames_to_resolve if n not in by_name)
    print(f" rows without filepath: {len(rows_no_fp)}")
    print(f" distinct source basenames to resolve: {len(basenames_to_resolve)}")
    print(f" unique watcher_state hit (no collision): {n_resolved_unique}")
    print(f" collision in watcher_state (>1 path): {n_collision_unique}")
    print(f" not in watcher_state at all: {n_unfound}")

    # 3c. Collision-shape audit
    sub("3c. Collision-shape audit — all collisions in watcher_state")
    collisions = {name: list(entries) for name, entries in by_name.items() if len(entries) > 1}
    shape_counts = Counter()
    rows_affected_by_shape = Counter()
    # Map from basename to count of NULL-ca rows that need it (rows_no_fp)
    rows_no_fp_by_name = Counter(r["source"] for r in rows_no_fp)
    sample_per_shape = defaultdict(list)
    for name, paths_mtimes in collisions.items():
        paths = [p for p, _ in paths_mtimes]
        shape = classify_collision_shape(paths)
        shape_counts[shape] += 1
        rows_affected_by_shape[shape] += rows_no_fp_by_name.get(name, 0)
        if len(sample_per_shape[shape]) < 3:
            entry = {
                "name": name,
                "rows_no_fp_using_this_name": rows_no_fp_by_name.get(name, 0),
                "candidates": [
                    {"path": p, "mtime": fmt_ts_from_unix(m)}
                    for p, m in sorted(paths_mtimes, key=lambda x: -float(x[1]))
                ],
            }
            sample_per_shape[shape].append(entry)
    print(f" collisions in watcher_state: {len(collisions)}")
    print(" shape breakdown:")
    for shape, n in shape_counts.most_common():
        print(f" {shape:<22} collisions={n:<4} rows_affected={rows_affected_by_shape[shape]}")
    print("\n Up-to-3 sample collisions per shape (sorted by mtime desc):")
    for shape, samples in sample_per_shape.items():
        print(f" [{shape}]")
        for s in samples:
            print(f" {s['name']} (rows_no_fp using this name: {s['rows_no_fp_using_this_name']})")
            for c in s["candidates"]:
                print(f" {c['mtime']} {c['path']}")

    return {
        "watcher_state_paths": len(by_path),
        "watcher_state_basenames": len(by_name),
        "watcher_state_collisions": len(collisions),
        "rows_with_filepath": {
            "total": len(rows_with_fp),
            "exists": fp_exists, "missing": fp_missing,
            "outside_root": fp_outside_root,
            "sample": sample_resolved,
        },
        "rows_without_filepath": {
            "total": len(rows_no_fp),
            "distinct_basenames": len(basenames_to_resolve),
            "unique_hit": n_resolved_unique,
            "collision_hit": n_collision_unique,
            "unfound": n_unfound,
        },
        "collision_shapes": {
            "total": len(collisions),
            "shape_counts": dict(shape_counts),
            "rows_affected_by_shape": dict(rows_affected_by_shape),
            "samples": dict(sample_per_shape),
        },
    }
|
||||
|
||||
|
||||
# ─── Section 4: ChatGPT export resolution ───────────────────────────────────
|
||||
|
||||
def section_4_chatgpt_export(cur):
    """Section 4: check whether ChatGPT rows can be dated from the export.

    Lists the ``conversations*.json`` files under CHATGPT_EXPORT_DIR, builds a
    conversation-id -> create_time index from them, resolves a random sample
    of up to 5 chatgpt_conversation rows with NULL created_at, and then counts
    how many distinct conversation ids in that cohort appear in the index.

    The sample verdict is judged against the number of rows actually sampled,
    so a cohort with fewer than 5 rows is not reported as a partial failure.

    Returns a dict of findings; only ``export_dir_exists`` and ``files`` are
    present when the export directory is missing.
    """
    header("4. CHATGPT EXPORT RESOLUTION (Plan addition #1)")
    print(f"Probing: {CHATGPT_EXPORT_DIR}")
    if not CHATGPT_EXPORT_DIR.exists():
        print(" NOT FOUND — plan on sentinel for entire B-chatgpt cohort.")
        return {"export_dir_exists": False, "files": []}
    files = sorted(CHATGPT_EXPORT_DIR.glob("conversations*.json"))
    print(f" found {len(files)} export file(s):")
    for f in files:
        print(f" {f.name} size={f.stat().st_size:,} mtime={fmt_ts_from_st_mtime(f)}")

    # Build convo_id -> create_time index from all export files.
    print("\nLoading export(s) to build convo_id -> create_time index...")
    convo_index = {}
    for f in files:
        try:
            data = json.loads(f.read_text(encoding="utf-8"))
        except Exception as e:
            print(f" failed to parse {f.name}: {e}")
            continue
        for convo in data:
            cid = convo.get("id") or convo.get("conversation_id")
            ct = convo.get("create_time")
            if cid and ct is not None:
                convo_index[cid] = ct
    print(f" indexed {len(convo_index)} conversations across {len(files)} export files")

    # Sample 5 chatgpt_conversation rows; resolve.
    cur.execute("""
        SELECT id, source FROM embeddings
        WHERE type='chatgpt_conversation' AND created_at IS NULL
        ORDER BY random() LIMIT 5;
    """)
    sample = cur.fetchall()
    sub("Sample of 5 B-chatgpt rows: convo lookup")
    resolved = 0
    sample_results = []
    for r in sample:
        # IDs look like chatgpt_<uuid>_<idx>; uuid extends until last underscore.
        m = re.match(r"^chatgpt_(.+)_(\d+)$", r["id"])
        cid = m.group(1) if m else None
        ct = convo_index.get(cid)
        ct_iso = None
        if ct is not None:
            try:
                ct_iso = datetime.fromtimestamp(float(ct), tz=timezone.utc).isoformat().replace("+00:00", "Z")
            except Exception:
                ct_iso = None
        if ct_iso:
            resolved += 1
        sample_results.append({
            "id": r["id"], "source": r["source"], "convo_id": cid,
            "create_time": ct, "create_time_iso": ct_iso,
            "resolved": ct_iso is not None,
        })
        print(f" {r['id']} cid={cid}")
        print(f" -> create_time={ct} iso={ct_iso}")

    # LIMIT 5 yields fewer rows when the cohort is small (or already filled).
    n_sampled = len(sample)
    if n_sampled == 0:
        verdict = "NOTHING TO RESOLVE — no chatgpt_conversation rows with NULL created_at."
    elif resolved == n_sampled:
        verdict = "PROCEED with re-derive for full cohort."
    else:
        verdict = "PARTIAL — plan re-derive + sentinel for unresolved."
    print(f"\nResolved {resolved}/{n_sampled}. {verdict}")

    # Estimate full-cohort coverage by counting how many B-chatgpt convo_ids appear in the index.
    cur.execute("""
        SELECT DISTINCT regexp_replace(id, '^chatgpt_(.+)_\\d+$', '\\1') AS cid
        FROM embeddings WHERE type='chatgpt_conversation' AND created_at IS NULL;
    """)
    distinct_cids = [r["cid"] for r in cur.fetchall()]
    in_index = sum(1 for c in distinct_cids if c in convo_index)
    print(f"Full-cohort coverage estimate: {in_index} / {len(distinct_cids)} distinct convo_ids "
          f"resolvable from export.")
    return {
        "export_dir_exists": True,
        "files": [{"name": f.name, "size": f.stat().st_size, "mtime": fmt_ts_from_st_mtime(f)} for f in files],
        "convo_index_size": len(convo_index),
        "sample_results": sample_results,
        "sample_resolved": resolved,
        "full_cohort": {
            "distinct_convo_ids": len(distinct_cids),
            "resolvable_from_export": in_index,
            "unresolvable": len(distinct_cids) - in_index,
        },
    }
|
||||
|
||||
|
||||
# ─── Section 5: Sentinel date discovery ─────────────────────────────────────
|
||||
|
||||
def section_5_sentinel(cur):
    """Section 5: gather evidence for the created_at sentinel and propose one.

    5a. Earliest / latest non-NULL created_at per type.
    5b. ``git log`` for the pgvector migration script and for commits whose
        message mentions pgvector (first 10), run in ~/aaronai.
    5c. Any ``chroma*.sqlite*`` files under ~/aaronai or ~/aaronai/db.
    5d. The proposed sentinel (a fixed value) with its reasoning.

    Returns a dict of the findings, for the JSON sidecar.
    """
    header("5. SENTINEL DATE DISCOVERY (Plan addition #3)")

    # 5a. Earliest non-NULL created_at per type: lower bound on substrate age.
    sub("5a. Earliest non-NULL created_at per type")
    cur.execute("""
        SELECT type, MIN(created_at) AS earliest, MAX(created_at) AS latest, COUNT(*) AS rows
        FROM embeddings WHERE created_at IS NOT NULL GROUP BY type ORDER BY type;
    """)
    rows = cur.fetchall()
    for r in rows:
        # Width specs need strings: a NULL type (None) would raise, and a
        # datetime would treat "<32" as a strftime format. Normalise first.
        type_label = r["type"] or "NULL"
        earliest = str(r["earliest"])
        print(f" {type_label:<22} earliest={earliest:<32} latest={r['latest']}")

    # 5b. git log for the pgvector-migration commit.
    sub("5b. Git log — pgvector migration commits")
    git_findings = []
    repo_dir = str(Path.home() / "aaronai")
    try:
        out = subprocess.run(
            ["git", "log", "--all", "--format=%H %ci %s",
             "--", "deprecated/migrate_to_pgvector.py", "scripts/migrate_to_pgvector.py"],
            cwd=repo_dir, capture_output=True, text=True, timeout=10,
        )
        if out.returncode != 0:
            # Distinguish "git failed" from "no matching commits".
            print(f" git log exited {out.returncode}: {out.stderr.strip()}")
        for line in out.stdout.strip().splitlines():
            print(f" {line}")
            git_findings.append(line)
    except Exception as e:
        print(f" git log failed: {e}")
    # Also: when did the api/ingest scripts cut over to pgvector?
    try:
        out = subprocess.run(
            ["git", "log", "--all", "--format=%H %ci %s", "--grep=pgvector", "-i"],
            cwd=repo_dir, capture_output=True, text=True, timeout=10,
        )
        if out.returncode != 0:
            print(f" git log (pgvector grep) exited {out.returncode}: {out.stderr.strip()}")
        print("\n Commits mentioning pgvector:")
        for line in out.stdout.strip().splitlines()[:10]:
            print(f" {line}")
            git_findings.append(line)
    except Exception as e:
        print(f" git log (pgvector grep) failed: {e}")

    # 5c. ChromaDB sqlite still on disk?
    sub("5c. ChromaDB dump on disk?")
    candidates = []
    for root in [Path.home() / "aaronai", Path.home() / "aaronai" / "db"]:
        if root.exists():
            for p in root.rglob("chroma*.sqlite*"):
                candidates.append({"path": str(p), "mtime": fmt_ts_from_st_mtime(p)})
    if candidates:
        for c in candidates:
            print(f" found: {c['path']} mtime={c['mtime']}")
    else:
        print(" no ChromaDB sqlite found under ~/aaronai")

    # 5d. Propose sentinel.
    sub("5d. Sentinel proposal")
    # Earliest doc cutover: per query, document=2026-04-30. Migration commit f78b830 was
    # 2026-04-26. Most defensible sentinel for "rows that entered pgvector before NOW()
    # writes were canonical" = the migration commit date.
    proposed = "2026-04-26T00:00:00Z"
    reasoning = (
        "git f78b830 'Migrate to pgvector — remove ChromaDB from api.py, ingest scripts, "
        "dream.py' is dated 2026-04-26. The earliest type='document' row with a non-NULL "
        "created_at lands 2026-04-30 (the F11 canonical-encoding cutover). Rows with NULL "
        "created_at all predate F11 and most predate the pgvector cutover itself. "
        "2026-04-26 is the date the ChromaDB->pgvector migration script was committed, "
        "so any row currently in the embeddings table with NULL created_at must have been "
        "ingested on or after that date (when the table came into existence in current form). "
        "It is the tightest defensible upper bound on 'the row entered pgvector before "
        "timestamps were tracked', so it is the right sentinel."
    )
    print(f" Proposed sentinel: {proposed}")
    print(f" Reasoning: {reasoning}")

    return {
        "earliest_per_type": rows,
        "git_findings": git_findings,
        "chromadb_candidates": candidates,
        "proposed_sentinel": proposed,
        "reasoning": reasoning,
    }
|
||||
|
||||
|
||||
# ─── Section 6: 50-row stratified sample ────────────────────────────────────
|
||||
|
||||
def _infer_created_at(row, filepath, by_name, sentinel_iso):
    """Choose a created_at value and a provenance label for one sampled row.

    Sources are tried in this order, and the first that applies wins:

    1. the row's existing ``created_at`` (label ``"preserved"``);
    2. ``fmt_ts_from_st_mtime`` applied to ``filepath`` when that path exists
       on disk (label ``"filepath_stat"``);
    3. the watcher-state entries stored under the row's ``source`` in
       ``by_name`` (label ``"watcher_state_unique"`` for a single entry,
       otherwise ``"watcher_state_collision_pick_latest_of_<N>"``);
    4. ``sentinel_iso`` (label ``"sentinel"``).

    Returns:
        A ``(created_at, provenance_label)`` tuple.
    """
    existing = row["created_at"]
    if existing:
        return existing, "preserved"

    if filepath and Path(filepath).exists():
        return fmt_ts_from_st_mtime(Path(filepath)), "filepath_stat"

    source_name = row["source"]
    if source_name and source_name in by_name:
        matches = by_name[source_name]
        if len(matches) == 1:
            return fmt_ts_from_unix(matches[0][1]), "watcher_state_unique"
        # Several watcher entries share this source name: take the one whose
        # element [1] is largest (presumably a unix timestamp — it is fed to
        # float() and fmt_ts_from_unix; confirm against load_watcher_state).
        newest = max(matches, key=lambda entry: float(entry[1]))
        return (
            fmt_ts_from_unix(newest[1]),
            f"watcher_state_collision_pick_latest_of_{len(matches)}",
        )

    return sentinel_iso, "sentinel"


def section_6_stratified_sample(cur, sentinel_iso):
    """Print and return a random sample of embeddings rows with inferred fields.

    For each of six hard-coded cohorts (combinations of ``type`` and
    ``created_at`` being NULL or set) a random sample of rows is fetched, and
    for every row the type and created_at that would be inferred for it are
    printed next to the existing values.

    Args:
        cur: Database cursor. Rows are read by column name, so this presumably
            needs to be a dict-style cursor — confirm against ``get_pg``.
        sentinel_iso: created_at value to use when no other source applies.

    Returns:
        A list with one dict per sampled row, holding the cohort label, id,
        source, existing type/created_at, and inferred type/created_at plus
        the created_at provenance label.
    """
    header("6. 50-ROW STRATIFIED SAMPLE — derived (type, created_at, source)")
    _by_path, by_name = load_watcher_state()

    # (label, SQL predicate, sample size) — sizes add up to 50 rows.
    cohorts = [
        ("A (type NULL, ca NULL)", "type IS NULL AND created_at IS NULL", 10),
        ("B-doc-old (type='document', ca NULL)", "type='document' AND created_at IS NULL", 10),
        ("B-chatgpt (type='chatgpt_conversation', ca NULL)", "type='chatgpt_conversation' AND created_at IS NULL", 10),
        ("C-doc-new (type='document', ca set)", "type='document' AND created_at IS NOT NULL", 10),
        ("C-claude (type='claude_conversation', ca set)", "type='claude_conversation' AND created_at IS NOT NULL", 5),
        ("C-aaronai (type='aaronai_conversation', ca set)", "type='aaronai_conversation' AND created_at IS NOT NULL", 5),
    ]

    samples = []
    for label, predicate, n in cohorts:
        sub(f"{label} (sample size: {n})")
        # The predicate comes from the hard-coded list above, not from
        # external input; the row limit is passed as a bound parameter.
        cur.execute(f"""
            SELECT id, source, type, created_at, metadata
            FROM embeddings WHERE {predicate}
            ORDER BY random() LIMIT %s;
        """, (n,))

        for row in cur.fetchall():
            metadata = row["metadata"] or {}
            filepath = metadata.get("filepath")

            # Keep an existing type; otherwise classify by file extension.
            inferred_type = row["type"]
            if not inferred_type:
                source_lower = (row["source"] or "").lower()
                if source_lower.endswith(tuple(SUPPORTED_EXT)):
                    inferred_type = "document"
                else:
                    inferred_type = "?"

            inferred_ca, inferred_ca_source = _infer_created_at(
                row, filepath, by_name, sentinel_iso
            )

            print(f" id={row['id']:<22} src={(row['source'] or '')[:38]:<38}")
            print(f" existing: type={row['type']!r:<22} ca={row['created_at']!r}")
            print(f" inferred: type={inferred_type!r:<22} ca={inferred_ca!r} ({inferred_ca_source})")

            samples.append({
                "cohort": label,
                "id": row["id"],
                "source": row["source"],
                "existing_type": row["type"],
                "existing_ca": row["created_at"],
                "inferred_type": inferred_type,
                "inferred_ca": inferred_ca,
                "inferred_ca_source": inferred_ca_source,
            })
    return samples
|
||||
|
||||
|
||||
# ─── Driver ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Run report sections 1-6 in order and write the results as a JSON sidecar.

    A connection is obtained from ``get_pg()`` and a single cursor is shared
    by every section function. Each section's return value is stored under
    ``section_<N>``; the sentinel proposed by section 5 is handed to section 6.
    The connection is closed whether or not a section raises, after which the
    collected results are written to ``OUT_PATH`` as indented JSON.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()

        out = {"generated_at": datetime.now(timezone.utc).isoformat()}
        out["section_1"] = section_1_cohort_recap(cur)
        out["section_2"] = section_2_type_inference(cur)
        out["section_3"] = section_3_created_at_inference(cur)
        out["section_4"] = section_4_chatgpt_export(cur)
        out["section_5"] = section_5_sentinel(cur)
        sentinel_iso = out["section_5"]["proposed_sentinel"]
        out["section_6"] = section_6_stratified_sample(cur, sentinel_iso)
    finally:
        # Release the connection even when a section fails part-way through.
        pg.close()

    # JSON sidecar — strip non-serializables.
    def _serialize(o):
        """Fallback encoder: ISO-8601 for datetimes, ``str()`` for anything else."""
        if isinstance(o, datetime):
            return o.isoformat()
        return str(o)

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUT_PATH.write_text(json.dumps(out, indent=2, default=_serialize))
    print(f"\nJSON sidecar written: {OUT_PATH}")
|
||||
|
||||
|
||||
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user