From 2b9a1782c1780a2a90bedb54a96d92183a46080c Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Thu, 30 Apr 2026 04:04:31 +0000 Subject: [PATCH] feat: stage2/3 pipeline, taxonomy-free cascade, E1.8/E4 experiments, corpus migration state --- dreamer_state.json | 29 +- experiments/e14_cascade_results.json | 1091 +++++++++++++++++ experiments/e14_per_source_comparison.json | 302 +++++ experiments/e14_sample.json | 177 +++ experiments/e1_8_eval.json | 1053 ++++++++++++++++ experiments/e1_8_results.json | 909 ++++++++++++++ experiments/e4_aaron_ratings.json | 122 ++ experiments/e4_comparison.json | 208 ++++ experiments/e4_llm_ratings.json | 158 +++ experiments/tier1_migration_results.json | 1271 ++++++++++++++++++++ experiments/tier1_migration_state.json | 272 +++++ scripts/ingest.py | 34 +- scripts/stage2_worker.py | 226 ++++ scripts/tier1_migration.py | 298 +++++ 14 files changed, 6145 insertions(+), 5 deletions(-) create mode 100644 experiments/e14_cascade_results.json create mode 100644 experiments/e14_per_source_comparison.json create mode 100644 experiments/e14_sample.json create mode 100644 experiments/e1_8_eval.json create mode 100644 experiments/e1_8_results.json create mode 100644 experiments/e4_aaron_ratings.json create mode 100644 experiments/e4_comparison.json create mode 100644 experiments/e4_llm_ratings.json create mode 100644 experiments/tier1_migration_results.json create mode 100644 experiments/tier1_migration_state.json create mode 100644 scripts/stage2_worker.py create mode 100644 scripts/tier1_migration.py diff --git a/dreamer_state.json b/dreamer_state.json index fc8c40c..6060d10 100644 --- a/dreamer_state.json +++ b/dreamer_state.json @@ -1,5 +1,30 @@ { - "last_dream_timestamp": 1777276868.016728, + "last_dream_timestamp": 1777480274.444462, "last_dream_mode": "pipeline", - "last_dream_file": "Journal/Dreams/2026-04-27-synthesis-1.md" + "last_dream_file": "Journal/Dreams/2026-04-29-synthesis-1.md", + "retrieved_sources": [ + "ChatGPT: CV Summary 
Request", + "Dossier Narrative.docx", + "2026-04-28-early-rem.md", + "Advances in Architectural Geometry 2023 -- Kathrin D\u00f6rfler (editor); Jan Knippers (editor); Achim.pdf", + "2026-04-29-late-rem.md", + "Mod06_GrabCAD_Print_and _Advanced_FDM_2023.pptx", + "The Extended Mind _ The Power of Thinking Outside the Brain -- Annie Murphy Paul.pdf", + "Utah MDD - Aaron Nelson - Copy.pptx", + "ChatGPT: Dean Position Evaluation", + "Company of One -- Paul Jarvis.pdf", + "References.docx", + "Dossier Narrative Kill Me PLS_REV.docx", + "ChatGPT: Career change anxiety", + "2026-04-27-early-rem-1.md", + "The Poetics of Space -- Gaston Bachelard translated from the French by Maria Jolas -- First Edition, 1994.pdf", + "ChatGPT: Digital fabrication education", + "Dossier Narrative Kill Me PLS.docx", + "ChatGPT: Digital Fabrication Cultural Project", + "Claude: Weighing Utah versus Oklahoma", + "Claude: Importing chat history from ChatGPT", + "Claude: I filling out my annual report...", + "References.pdf", + "Dossier Narrative Kill Me PLS_REV_HOME.docx" + ] } \ No newline at end of file diff --git a/experiments/e14_cascade_results.json b/experiments/e14_cascade_results.json new file mode 100644 index 0000000..824a558 --- /dev/null +++ b/experiments/e14_cascade_results.json @@ -0,0 +1,1091 @@ +{ + "results": [ + { + "name": "Claude: Finding ideal rural housing near University of Utah", + "bucket": "high", + "tier1_entities": 40, + "doc_chars": 27530, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": false, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion 
about finding ideal rural housing near University of Utah" + }, + "metadata_elapsed_s": 132.7, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: Discussion about finding ideal rural housing near University of Utah This metadata is provided to orient your extraction, not to constrain it. Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 32.2, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Rhino 3D object flow", + "bucket": "high", + "tier1_entities": 39, + "doc_chars": 81902, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": true, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "A discussion on the connections between Deleuze/Guattari's ideas and digital fabrication in the context of scripting, CNC, and software like Grasshopper" + }, + "metadata_elapsed_s": 142.8, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: A discussion on the connections between Deleuze/Guattari's ideas and digital fabrication in the context of scripting, CNC, and software like Grasshopper This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 41.0, + "submit_result": { + "ok": true + } + }, + { + "name": "Claude: Evaluating tenure prospects at R1 universities", + "bucket": "high", + "tier1_entities": 34, + "doc_chars": 33951, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion about the tenure prospects at R1 universities for a candidate" + }, + "metadata_elapsed_s": 123.0, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: Discussion about the tenure prospects at R1 universities for a candidate This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 36.5, + "submit_result": { + "ok": true + } + }, + { + "name": "Claude: Law enforcement career options", + "bucket": "high", + "tier1_entities": 34, + "doc_chars": 26637, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": true, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document provides an overview of career options in law enforcement and discusses the specifics of joining National Park Service as a seasonal LE ranger" + }, + "metadata_elapsed_s": 127.6, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: This document provides an overview of career options in law enforcement and discusses the specifics of joining National Park Service as a seasonal LE ranger This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 36.5, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Resume formatting and review", + "bucket": "high", + "tier1_entities": 28, + "doc_chars": 47571, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": true, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "personal", + "one_sentence_summary": "Professional Resume of Aaron Nelson for a job as Additive Manufacturing Technician" + }, + "metadata_elapsed_s": 119.3, + "custom_extraction_instructions": "This is a personal document in prose format. Summary: Professional Resume of Aaron Nelson for a job as Additive Manufacturing Technician This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 29.8, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Tulsa Concept Album Guide", + "bucket": "high", + "tier1_entities": 28, + "doc_chars": 27502, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": false, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "personal", + "one_sentence_summary": "A discussion on creating a concept album about Tulsa Oklahoma, inspired by Terry Allen's 'Lubbock (on Everything)', with the artist sharing their personal experiences growing up there as a mixed-race person and expressing feelings of rejection" + }, + "metadata_elapsed_s": 146.5, + "custom_extraction_instructions": "This is a personal document in prose format. Summary: A discussion on creating a concept album about Tulsa Oklahoma, inspired by Terry Allen's 'Lubbock (on Everything)', with the artist sharing their personal experiences growing up there as a mixed-race person and expressing feelings of rejection This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 33.5, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: SEC coaches with OSU ties", + "bucket": "high", + "tier1_entities": 27, + "doc_chars": 33839, + "metadata": { + "assistant_coaches": [ + { + "name": "Curtis Luper", + "position": "RBs", + "school": "Missouri", + "osu_connection": "former AHC/RBs, OSU alum, letterman" + }, + { + "name": "Tim Rattay", + "position": "Offensive Analyst", + "school": "LSU", + "osu_connection": "former QBs coach 2020-24" + }, + { + "name": "Sean Gleeson", + "position": "QBs", + "school": "Missouri", + "osu_connection": "former OC" + }, + { + "name": "Glen Elarbee", + "position": "OL", + "school": "Tennessee", + "osu_connection": "former GA" + } + ], + "head_coach_candidates": [ + { + "name": "Curtis Luper", + "qualifications": [ + "OSU alum + ex-assistant head coach in Stillwater", + "long P5 track", + "deep OK/TX recruiting ties", + "culture fit is elite", + "biggest question is lack of recent OC/DC title" + ] + }, + { + "name": "Tim Rattay", + "qualifications": [ + "QB developer with NFL background", + "aligns with OSU\u2019s QB-centric identity", + "needs a coordinator/associate HC stop to check the final box" + ] + } + ], + "char_length": 12000 + }, + "metadata_elapsed_s": 168.3, + "custom_extraction_instructions": "This is a unknown document in unknown format. Summary: This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 30.7, + "submit_result": { + "ok": true + } + }, + { + "name": "Claude: Lubbock on everything album lyrics", + "bucket": "high", + "tier1_entities": 26, + "doc_chars": 11440, + "metadata": { + "language": "en", + "char_length": 11440, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": false, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "conversational", + "one_sentence_summary": "Discussion about the lyrics and writing style of Lubbock on Everything album by Terry Allen" + }, + "metadata_elapsed_s": 129.7, + "custom_extraction_instructions": "This is a conversational document in prose format. Summary: Discussion about the lyrics and writing style of Lubbock on Everything album by Terry Allen This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 30.8, + "submit_result": { + "ok": true + } + }, + { + "name": "Claude: Bonding ASA 3D printed parts", + "bucket": "mid", + "tier1_entities": 11, + "doc_chars": 3361, + "metadata": { + "language": "en", + "char_length": 3361, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": true, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion on the best adhesives for bonding ASA 3D printed parts and the estimated fair price of a used utility trailer that is 5 years old" + }, + "metadata_elapsed_s": 46.2, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: Discussion on the best adhesives for bonding ASA 3D printed parts and the estimated fair price of a used utility trailer that is 5 years old This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 20.5, + "submit_result": { + "ok": true + } + }, + { + "name": "Claude: SUNY faculty conflict of interest policies", + "bucket": "mid", + "tier1_entities": 11, + "doc_chars": 6858, + "metadata": { + "language": "en", + "char_length": 6858, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Information about SUNY faculty conflict of interest policies for outside work, specifically related to a computational design & additive manufacturing consulting practice" + }, + "metadata_elapsed_s": 65.6, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: Information about SUNY faculty conflict of interest policies for outside work, specifically related to a computational design & additive manufacturing consulting practice This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 20.8, + "submit_result": { + "ok": true + } + }, + { + "name": "Claude: Interview presentation research and preparation", + "bucket": "mid", + "tier1_entities": 11, + "doc_chars": 163880, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion on the integration of emerging technologies like AI and XR in a design curriculum" + }, + "metadata_elapsed_s": 125.0, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: Discussion on the integration of emerging technologies like AI and XR in a design curriculum This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 16.5, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Remington 700 5R Gen 1", + "bucket": "mid", + "tier1_entities": 11, + "doc_chars": 6513, + "metadata": { + "language": "en", + "char_length": 6513, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": true, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "A discussion about the desirability, value, and characteristics of a Remington 700 5R Gen 1 rifle in .308 caliber with a 20-inch threaded barrel." + }, + "metadata_elapsed_s": 89.7, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: A discussion about the desirability, value, and characteristics of a Remington 700 5R Gen 1 rifle in .308 caliber with a 20-inch threaded barrel. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 26.5, + "submit_result": { + "ok": true + } + }, + { + "name": "Claude: I filling out my annual report...", + "bucket": "mid", + "tier1_entities": 9, + "doc_chars": 87301, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "A document discussing the growth and impact of a Digital Design & Fabrication program, highlighting its focus on computational design, additive manufacturing, employer partnerships, and 100% employment rate for graduates." + }, + "metadata_elapsed_s": 115.7, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: A document discussing the growth and impact of a Digital Design & Fabrication program, highlighting its focus on computational design, additive manufacturing, employer partnerships, and 100% employment rate for graduates. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 21.9, + "submit_result": { + "ok": true + } + }, + { + "name": "Aaron AI: So, I've been working on the RNAI project, and the way I've ...", + "bucket": "mid", + "tier1_entities": 9, + "doc_chars": 4259, + "metadata": { + "language": "en", + "char_length": 4259, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": true, + "has_dates": true + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "technical", + "one_sentence_summary": "Document discusses the use of Claude Code for a RNAI project and potential security concerns when using it." + }, + "metadata_elapsed_s": 48.2, + "custom_extraction_instructions": "This is a technical document in prose format. Summary: Document discusses the use of Claude Code for a RNAI project and potential security concerns when using it. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 22.8, + "submit_result": { + "ok": true + } + }, + { + "name": "Claude: Internship agreement writing help", + "bucket": "mid", + "tier1_entities": 8, + "doc_chars": 3851, + "metadata": { + "language": "en", + "char_length": 3851, + "primary_format": "structured", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Conversation about writing internship agreements for SUNY New Paltz's Digital Design & Fabrication program" + }, + "metadata_elapsed_s": 45.7, + "custom_extraction_instructions": "This is a educational document in structured format. Summary: Conversation about writing internship agreements for SUNY New Paltz's Digital Design & Fabrication program This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 13.4, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Research Statement Restructure", + "bucket": "mid", + "tier1_entities": 8, + "doc_chars": 9250, + "metadata": { + "language": "en", + "char_length": 9250, + "primary_format": "mixed", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "A conversation between Aaron and ChatGPT about restructuring a research statement and revising a philosophy of teaching for a job application" + }, + "metadata_elapsed_s": 84.0, + "custom_extraction_instructions": "This is a educational document in mixed format. Summary: A conversation between Aaron and ChatGPT about restructuring a research statement and revising a philosophy of teaching for a job application This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 19.5, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Sink Sprayer Fitting Name", + "bucket": "low", + "tier1_entities": 5, + "doc_chars": 1383, + "metadata": { + "language": "en", + "char_length": 1383, + "primary_format": "conversational", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "personal", + "one_sentence_summary": "Discussion about the name and fitting components of a sink sprayer" + }, + "metadata_elapsed_s": 22.5, + "custom_extraction_instructions": "This is a personal document in conversational format. Summary: Discussion about the name and fitting components of a sink sprayer This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 9.5, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Title: User request summary.", + "bucket": "low", + "tier1_entities": 5, + "doc_chars": 4363, + "metadata": { + "language": "en", + "char_length": 4363, + "primary_format": "mixed", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": false, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "personal", + "one_sentence_summary": "Conversation between a user and an assistant discussing the creation of an introductory speech for a Design Week event" + }, + "metadata_elapsed_s": 47.9, + "custom_extraction_instructions": "This is a personal document in mixed format. Summary: Conversation between a user and an assistant discussing the creation of an introductory speech for a Design Week event This metadata is provided to orient your extraction, not to constrain it. Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 14.0, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Regex for inserting letters", + "bucket": "low", + "tier1_entities": 5, + "doc_chars": 12778, + "metadata": { + "operation": "replace", + "searchValue": "Z 0", + "replaceValue": "Z 25", + "char_length": 12000 + }, + "metadata_elapsed_s": 141.9, + "custom_extraction_instructions": "This is a unknown document in unknown format. Summary: This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 10.8, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Provide DXF file", + "bucket": "low", + "tier1_entities": 5, + "doc_chars": 38594, + "metadata": { + "role": "assistant", + "usage": { + "assistant": 1 + }, + "messages": [ + { + "role": "assistant", + "content": "Here's the code to convert the existing stroke-only SVG polylines into curved SVG paths using cubic Beziers (Catmull\u2013Rom smoothing):" + }, + { + "role": "assistant", + "content": "First, we'll import the required libraries and parse the SVG file." + }, + { + "role": "assistant", + "content": "```javascript\nconst svgjs = require('svg.js');\nlet doc = svgjs('path-to-your-svg-file');\n```" + }, + { + "role": "assistant", + "content": "Next, we'll loop through all the paths in the SVG and replace them with smoothed cubic Bezier curves." + }, + { + "role": "assistant", + "content": "```javascript\nfor (let path of doc.find('path')) {\n // Calculate control points for Catmull\u2013Rom splines\n const tangentsBefore = getTangents(path);\nconst controlPoints = calculateControlPoints(tangentsBefore, 4);\n\n // Replace the original path with a new one using cubic Bezier curves\n const smoothedPath = doc.path().moveTo(controlPoints[0].x, controlPoints[0].y).cubic(4, \n controlPoints[1].x - controlPoints[0].x, controlPoints[1].y - controlPoints[0].y, \n controlPoints[2].x - controlPoints[1].x, controlPoints[2].y - controlPoints[1].y, \n controlPoints[3].x - controlPoints[2].x, controlPoints[3].y - controlPoints[2].y);\n\n // Replace the original path with the new smoothed one\n path.remove().insertBefore(smoothedPath);\n}\n\nfunction getTangents(path) {\n const points = path.bbox().getPoints();\nconst tangents = [];\nfor (let i = 1; i < points.length - 1; i++) {\n const dx = points[i].x - points[i-1].x;\n const dy = points[i].y - 
points[i-1].y;\n tangents.push({dx, dy});\n}\nreturn tangents;\n}\n\nfunction calculateControlPoints(tangents, segmentCount) {\n const controlPoints = [];\nfor (let i = 0; i < tangents.length - 2 * segmentCount + 1; i += 4) {\n // Calculate four Catmull\u2013Rom spline control points for the next segment\n const p0 = tangents[i];\nconst p1 = tangents[i+1];\nconst p2 = tangents[i+2];\nconst p3 = tangents[i+3];\n\ncontrolPoints.push(getBezierControlPoint(p0, p1, p2, p3));\n}\nreturn controlPoints;\n}\n\nfunction getBezierControlPoint(p0, p1, p2, p3) {\n // Hermite curve calculation for each point between p0 and p1,\np2, and p3.\n const t = 0.5;\n\n// First derivative of the Hermite polynomial at p0:\n const d0x = 2 * (p1.x - p0.x) - t * (t * (2 * p0.x - 3 * p1.x + 2 * p2.x) - (2 - t) * (p1.x - p2.x));\nconst d0y = 2 * (p1.y - p0.y) - t * (t * (2 * p0.y - 3 * p1.y + 2 * p2.y) - (2 - t) * (p1.y - p2.y));\n\n// First derivative of the Hermite polynomial at p3:\n const d3x = 2 * (p3.x - p2.x) - t * (t * (2 * p2.x - 3 * p3.x + 2 * p0.x) - (2 - t) * (p2.x - p0.x));\nconst d3y = 2 * (p3.y - p2.y) - t * (t * (2 * p2.y - 3 * p3.y + 2 * p0.y) - (2 - t) * (p2.y - p0.y));\n\n// Calculate the control points:\n const c0 = {x: p0.x + d0x / 6, y: p0.y + d0y / 6};\nconst c1 = {x: (p1.x + p2.x) / 2, y: (p1.y + p2.y) / 2};\nconst c2 = {x: p3.x + d3x / 6, y: p3.y + d3y / 6};\nreturn {x: (c0.x + c1.x + c2.x) / 3, y: (c0.y + c1.y + c2.y) / 3};\n}\n```" + } + ], + "char_length": 12000 + }, + "metadata_elapsed_s": 281.4, + "custom_extraction_instructions": "This is a unknown document in unknown format. Summary: This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 28.7, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Rectangle Edge Offset Algorithm", + "bucket": "low", + "tier1_entities": 3, + "doc_chars": 12610, + "metadata": { + "response_text": "Here is the requested information in JSON format for the analysis:", + "json_data": { + "primary_text": "Offsetting Rectangles with a Specific Distance", + "sub_title": "Calculation of d for a specific perpendicular offset \u03b4", + "content": [ + { + "heading": "2D Case (Standard Rectangle)", + "steps": [ + { + "description": "Determine the angle \u03b8 between one of the edges and the bisector vector.", + "value": "\u03b8 = 45\u00b0" + }, + { + "description": "Relate d to the perpendicular offset \u03b4", + "value": "d = \u03b4 * sqrt(2)" + } + ] + }, + { + "heading": "General Case (Arbitrary Rectangles in 2D or 3D)", + "steps": [ + { + "description": "Calculate the angle \u03b8 between one of the rectangle\u2019s edges and the bisector vector.", + "value": "cos(\u03b8) = vec{u} . vec{b} / (|vec{u}| |vec{b}|)" + }, + { + "description": "Determine d", + "value": "d = \u03b4 / sin(\u03b8)" + } + ] + } + ], + "additional_information": [ + { + "heading": "Bisector Vector Calculation", + "steps": [ + { + "description": "Add the unit vectors of the two edges to get a vector that lies between them:", + "value": "\u03b2 = \u03c3{u} + \u03c3{v}" + }, + { + "description": "Normalize the resulting vector to convert it into a unit vector (this is the bisector vector):", + "value": "\u03c3{\u03b2} = \u03c3(\u03b2) / |\u03b2|" + }, + { + "description": "In 3D space, the angle between the edge vector and bisector may not be 45\u00b0. In this case, use the calculated angle \u03b8 to find d as described in the General Case." 
+ } + ] + } + ], + "primary_keywords": [ + "offsetting", + "rectangles", + "specific distance" + ], + "secondary_keywords": [ + "bisector vector", + "perpendicular offset", + "angle", + "vector calculation" + ] + }, + "char_length": 12000 + }, + "metadata_elapsed_s": 201.7, + "custom_extraction_instructions": "This is a unknown document in unknown format. Summary: This metadata is provided to orient your extraction, not to constrain it. Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 17.5, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Testing Vector Alignment", + "bucket": "low", + "tier1_entities": 3, + "doc_chars": 2209, + "metadata": { + "language": "en", + "char_length": 2209, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document explains how to test if two vectors are pointing in the same general direction using the dot product." + }, + "metadata_elapsed_s": 43.2, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: This document explains how to test if two vectors are pointing in the same general direction using the dot product. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 31.6, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Scholarship Recommendation Letter Tips", + "bucket": "low", + "tier1_entities": 3, + "doc_chars": 2980, + "metadata": { + "language": "en", + "char_length": 2980, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": false, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document provides a comprehensive guide on how to write a scholarship recommendation letter, covering key areas such as introductions, academic and professional qualities, personal characteristics, achievements, future potential, and conclusion." + }, + "metadata_elapsed_s": 36.3, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: This document provides a comprehensive guide on how to write a scholarship recommendation letter, covering key areas such as introductions, academic and professional qualities, personal characteristics, achievements, future potential, and conclusion. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 4.1, + "submit_result": { + "ok": true + } + }, + { + "name": "ChatGPT: Respect Individual Interests for Christmas", + "bucket": "low", + "tier1_entities": 2, + "doc_chars": 6126, + "metadata": { + "language": "en", + "char_length": 6126, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": true, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": false, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion on choosing appropriate Christmas gifts for individuals with unique interests" + }, + "metadata_elapsed_s": 59.9, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: Discussion on choosing appropriate Christmas gifts for individuals with unique interests This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 6.6, + "submit_result": { + "ok": true + } + }, + { + "name": "Nic Oconnor Ind Study S2024 Syllabus.docx", + "bucket": "document", + "tier1_entities": 14, + "subtype": "academic", + "doc_chars": 17194, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document outlines an Independent Study course focused on creating a custom line of mystery box toys for portfolio development and product design." + }, + "metadata_elapsed_s": 110.3, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: This document outlines an Independent Study course focused on creating a custom line of mystery box toys for portfolio development and product design. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 21.1, + "submit_result": { + "ok": true + } + }, + { + "name": "Nic Oconnor Field Work F2023 Syllabus.pdf", + "bucket": "document", + "tier1_entities": 13, + "subtype": "academic", + "doc_chars": 17830, + "metadata": { + "language": "en", + "char_length": 12000, + "primary_format": "structured", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": true, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document describes an independent study for a 3D modeling course with fieldwork at Dogwood Entertainment in Fall 2023." + }, + "metadata_elapsed_s": 113.4, + "custom_extraction_instructions": "This is a educational document in structured format. Summary: This document describes an independent study for a 3D modeling course with fieldwork at Dogwood Entertainment in Fall 2023. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 26.3, + "submit_result": { + "ok": true + } + }, + { + "name": "Circuits II.pptx", + "bucket": "document", + "tier1_entities": 11, + "subtype": "creative", + "doc_chars": 3645, + "metadata": { + "language": "en", + "char_length": 3645, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document discusses the Arduino pinout, LED connection, power supply, diodes, and LED datasheets in an electronic circuit context." + }, + "metadata_elapsed_s": 48.9, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: This document discusses the Arduino pinout, LED connection, power supply, diodes, and LED datasheets in an electronic circuit context. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 17.5, + "submit_result": { + "ok": true + } + }, + { + "name": "Polar Coordinates.pptx", + "bucket": "document", + "tier1_entities": 10, + "subtype": "creative", + "doc_chars": 1450, + "metadata": { + "language": "en", + "char_length": 1450, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "The document explains how to convert polar coordinates to Cartesian coordinates in a 2D space for use in Processing" + }, + "metadata_elapsed_s": 26.4, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: The document explains how to convert polar coordinates to Cartesian coordinates in a 2D space for use in Processing This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 12.8, + "submit_result": { + "ok": true + } + }, + { + "name": "University of North Texas Cover letter.pdf", + "bucket": "document", + "tier1_entities": 14, + "subtype": "reference", + "doc_chars": 2272, + "metadata": { + "language": "en", + "char_length": 2272, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document is a job application letter for a Senior Lecturer position in Computing in the Visual Arts at University of North Texas." + }, + "metadata_elapsed_s": 29.5, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: This document is a job application letter for a Senior Lecturer position in Computing in the Visual Arts at University of North Texas. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 21.7, + "submit_result": { + "ok": true + } + }, + { + "name": "New Mexico Cover Letter.docx", + "bucket": "document", + "tier1_entities": 13, + "subtype": "reference", + "doc_chars": 2645, + "metadata": { + "language": "en", + "char_length": 2645, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "A job application letter from Aaron Nelson for a Tenure Track Faculty position in Experimental Art and Technology at the University of New Mexico, detailing his experiences and research areas." + }, + "metadata_elapsed_s": 32.5, + "custom_extraction_instructions": "This is a educational document in prose format. Summary: A job application letter from Aaron Nelson for a Tenure Track Faculty position in Experimental Art and Technology at the University of New Mexico, detailing his experiences and research areas. This metadata is provided to orient your extraction, not to constrain it. 
Extract entities and relationships freely from the document text itself; the metadata is descriptive context, not a checklist.", + "submit_elapsed_s": 33.6, + "submit_result": { + "ok": true + } + } + ] +} \ No newline at end of file diff --git a/experiments/e14_per_source_comparison.json b/experiments/e14_per_source_comparison.json new file mode 100644 index 0000000..8396ef5 --- /dev/null +++ b/experiments/e14_per_source_comparison.json @@ -0,0 +1,302 @@ +[ + { + "name": "Claude: Finding ideal rural housing near University of Utah", + "bucket": "high", + "prod_preds": 14, + "cascade_preds": 13, + "delta_preds": -1, + "prod_edges": 28, + "cascade_edges": 30, + "delta_edges": 2 + }, + { + "name": "ChatGPT: Rhino 3D object flow", + "bucket": "high", + "prod_preds": 16, + "cascade_preds": 11, + "delta_preds": -5, + "prod_edges": 31, + "cascade_edges": 28, + "delta_edges": -3 + }, + { + "name": "Claude: Evaluating tenure prospects at R1 universities", + "bucket": "high", + "prod_preds": 3, + "cascade_preds": 15, + "delta_preds": 12, + "prod_edges": 5, + "cascade_edges": 36, + "delta_edges": 31 + }, + { + "name": "Claude: Law enforcement career options", + "bucket": "high", + "prod_preds": 18, + "cascade_preds": 19, + "delta_preds": 1, + "prod_edges": 23, + "cascade_edges": 21, + "delta_edges": -2 + }, + { + "name": "ChatGPT: Resume formatting and review", + "bucket": "high", + "prod_preds": 8, + "cascade_preds": 9, + "delta_preds": 1, + "prod_edges": 19, + "cascade_edges": 22, + "delta_edges": 3 + }, + { + "name": "ChatGPT: Tulsa Concept Album Guide", + "bucket": "high", + "prod_preds": 10, + "cascade_preds": 4, + "delta_preds": -6, + "prod_edges": 19, + "cascade_edges": 16, + "delta_edges": -3 + }, + { + "name": "ChatGPT: SEC coaches with OSU ties", + "bucket": "high", + "prod_preds": 5, + "cascade_preds": 4, + "delta_preds": -1, + "prod_edges": 25, + "cascade_edges": 24, + "delta_edges": -1 + }, + { + "name": "Claude: Lubbock on everything album lyrics", + "bucket": 
"high", + "prod_preds": 11, + "cascade_preds": 4, + "delta_preds": -7, + "prod_edges": 17, + "cascade_edges": 5, + "delta_edges": -12 + }, + { + "name": "Claude: Bonding ASA 3D printed parts", + "bucket": "mid", + "prod_preds": 5, + "cascade_preds": 4, + "delta_preds": -1, + "prod_edges": 10, + "cascade_edges": 11, + "delta_edges": 1 + }, + { + "name": "Claude: SUNY faculty conflict of interest policies", + "bucket": "mid", + "prod_preds": 7, + "cascade_preds": 3, + "delta_preds": -4, + "prod_edges": 10, + "cascade_edges": 3, + "delta_edges": -7 + }, + { + "name": "Claude: Interview presentation research and preparation", + "bucket": "mid", + "prod_preds": 9, + "cascade_preds": 5, + "delta_preds": -4, + "prod_edges": 10, + "cascade_edges": 10, + "delta_edges": 0 + }, + { + "name": "ChatGPT: Remington 700 5R Gen 1", + "bucket": "mid", + "prod_preds": 7, + "cascade_preds": 8, + "delta_preds": 1, + "prod_edges": 10, + "cascade_edges": 10, + "delta_edges": 0 + }, + { + "name": "Claude: I filling out my annual report...", + "bucket": "mid", + "prod_preds": 6, + "cascade_preds": 7, + "delta_preds": 1, + "prod_edges": 8, + "cascade_edges": 11, + "delta_edges": 3 + }, + { + "name": "Aaron AI: So, I've been working on the RNAI project, and the way I've ...", + "bucket": "mid", + "prod_preds": 5, + "cascade_preds": 6, + "delta_preds": 1, + "prod_edges": 6, + "cascade_edges": 8, + "delta_edges": 2 + }, + { + "name": "Claude: Internship agreement writing help", + "bucket": "mid", + "prod_preds": 6, + "cascade_preds": 6, + "delta_preds": 0, + "prod_edges": 6, + "cascade_edges": 7, + "delta_edges": 1 + }, + { + "name": "ChatGPT: Research Statement Restructure", + "bucket": "mid", + "prod_preds": 4, + "cascade_preds": 2, + "delta_preds": -2, + "prod_edges": 5, + "cascade_edges": 8, + "delta_edges": 3 + }, + { + "name": "ChatGPT: Sink Sprayer Fitting Name", + "bucket": "low", + "prod_preds": 3, + "cascade_preds": 3, + "delta_preds": 0, + "prod_edges": 4, + "cascade_edges": 3, + 
"delta_edges": -1 + }, + { + "name": "ChatGPT: Title: User request summary.", + "bucket": "low", + "prod_preds": 3, + "cascade_preds": 2, + "delta_preds": -1, + "prod_edges": 3, + "cascade_edges": 2, + "delta_edges": -1 + }, + { + "name": "ChatGPT: Regex for inserting letters", + "bucket": "low", + "prod_preds": 4, + "cascade_preds": 4, + "delta_preds": 0, + "prod_edges": 5, + "cascade_edges": 5, + "delta_edges": 0 + }, + { + "name": "ChatGPT: Provide DXF file", + "bucket": "low", + "prod_preds": 5, + "cascade_preds": 9, + "delta_preds": 4, + "prod_edges": 6, + "cascade_edges": 17, + "delta_edges": 11 + }, + { + "name": "ChatGPT: Rectangle Edge Offset Algorithm", + "bucket": "low", + "prod_preds": 2, + "cascade_preds": 2, + "delta_preds": 0, + "prod_edges": 5, + "cascade_edges": 8, + "delta_edges": 3 + }, + { + "name": "ChatGPT: Testing Vector Alignment", + "bucket": "low", + "prod_preds": 3, + "cascade_preds": 3, + "delta_preds": 0, + "prod_edges": 3, + "cascade_edges": 3, + "delta_edges": 0 + }, + { + "name": "ChatGPT: Scholarship Recommendation Letter Tips", + "bucket": "low", + "prod_preds": 2, + "cascade_preds": 1, + "delta_preds": -1, + "prod_edges": 2, + "cascade_edges": 1, + "delta_edges": -1 + }, + { + "name": "ChatGPT: Respect Individual Interests for Christmas", + "bucket": "low", + "prod_preds": 3, + "cascade_preds": 1, + "delta_preds": -2, + "prod_edges": 3, + "cascade_edges": 1, + "delta_edges": -2 + }, + { + "name": "Nic Oconnor Ind Study S2024 Syllabus.docx", + "bucket": "document", + "prod_preds": 6, + "cascade_preds": 8, + "delta_preds": 2, + "prod_edges": 12, + "cascade_edges": 13, + "delta_edges": 1 + }, + { + "name": "Nic Oconnor Field Work F2023 Syllabus.pdf", + "bucket": "document", + "prod_preds": 5, + "cascade_preds": 14, + "delta_preds": 9, + "prod_edges": 9, + "cascade_edges": 16, + "delta_edges": 7 + }, + { + "name": "Circuits II.pptx", + "bucket": "document", + "prod_preds": 5, + "cascade_preds": 7, + "delta_preds": 2, + "prod_edges": 
7, + "cascade_edges": 12, + "delta_edges": 5 + }, + { + "name": "Polar Coordinates.pptx", + "bucket": "document", + "prod_preds": 8, + "cascade_preds": 8, + "delta_preds": 0, + "prod_edges": 9, + "cascade_edges": 9, + "delta_edges": 0 + }, + { + "name": "University of North Texas Cover letter.pdf", + "bucket": "document", + "prod_preds": 10, + "cascade_preds": 8, + "delta_preds": -2, + "prod_edges": 13, + "cascade_edges": 12, + "delta_edges": -1 + }, + { + "name": "New Mexico Cover Letter.docx", + "bucket": "document", + "prod_preds": 7, + "cascade_preds": 10, + "delta_preds": 3, + "prod_edges": 12, + "cascade_edges": 20, + "delta_edges": 8 + } +] \ No newline at end of file diff --git a/experiments/e14_sample.json b/experiments/e14_sample.json new file mode 100644 index 0000000..6a9e4e0 --- /dev/null +++ b/experiments/e14_sample.json @@ -0,0 +1,177 @@ +{ + "metadata": { + "purpose": "E1.4 cascade re-extraction replication (n=30)", + "exclusions": "E1's 10 sources", + "stratification": { + "high": 8, + "mid": 8, + "low": 8, + "document": 6, + "document_subtypes": { + "academic": 2, + "creative": 2, + "reference": 2 + } + }, + "quartile_top": 19, + "quartile_bottom": 5 + }, + "selected": [ + { + "name": "Claude: Finding ideal rural housing near University of Utah", + "entities": 40, + "bucket": "high" + }, + { + "name": "ChatGPT: Rhino 3D object flow", + "entities": 39, + "bucket": "high" + }, + { + "name": "Claude: Evaluating tenure prospects at R1 universities", + "entities": 34, + "bucket": "high" + }, + { + "name": "Claude: Law enforcement career options", + "entities": 34, + "bucket": "high" + }, + { + "name": "ChatGPT: Resume formatting and review", + "entities": 28, + "bucket": "high" + }, + { + "name": "ChatGPT: Tulsa Concept Album Guide", + "entities": 28, + "bucket": "high" + }, + { + "name": "ChatGPT: SEC coaches with OSU ties", + "entities": 27, + "bucket": "high" + }, + { + "name": "Claude: Lubbock on everything album lyrics", + "entities": 26, + 
"bucket": "high" + }, + { + "name": "Claude: Bonding ASA 3D printed parts", + "entities": 11, + "bucket": "mid" + }, + { + "name": "Claude: SUNY faculty conflict of interest policies", + "entities": 11, + "bucket": "mid" + }, + { + "name": "Claude: Interview presentation research and preparation", + "entities": 11, + "bucket": "mid" + }, + { + "name": "ChatGPT: Remington 700 5R Gen 1", + "entities": 11, + "bucket": "mid" + }, + { + "name": "Claude: I filling out my annual report...", + "entities": 9, + "bucket": "mid" + }, + { + "name": "Aaron AI: So, I've been working on the RNAI project, and the way I've ...", + "entities": 9, + "bucket": "mid" + }, + { + "name": "Claude: Internship agreement writing help", + "entities": 8, + "bucket": "mid" + }, + { + "name": "ChatGPT: Research Statement Restructure", + "entities": 8, + "bucket": "mid" + }, + { + "name": "ChatGPT: Sink Sprayer Fitting Name", + "entities": 5, + "bucket": "low" + }, + { + "name": "ChatGPT: Title: User request summary.", + "entities": 5, + "bucket": "low" + }, + { + "name": "ChatGPT: Regex for inserting letters", + "entities": 5, + "bucket": "low" + }, + { + "name": "ChatGPT: Provide DXF file", + "entities": 5, + "bucket": "low" + }, + { + "name": "ChatGPT: Rectangle Edge Offset Algorithm", + "entities": 3, + "bucket": "low" + }, + { + "name": "ChatGPT: Testing Vector Alignment", + "entities": 3, + "bucket": "low" + }, + { + "name": "ChatGPT: Scholarship Recommendation Letter Tips", + "entities": 3, + "bucket": "low" + }, + { + "name": "ChatGPT: Respect Individual Interests for Christmas", + "entities": 2, + "bucket": "low" + }, + { + "name": "Nic Oconnor Ind Study S2024 Syllabus.docx", + "entities": 14, + "subtype": "academic", + "bucket": "document" + }, + { + "name": "Nic Oconnor Field Work F2023 Syllabus.pdf", + "entities": 13, + "subtype": "academic", + "bucket": "document" + }, + { + "name": "Circuits II.pptx", + "entities": 11, + "subtype": "creative", + "bucket": "document" + }, + { + 
"name": "Polar Coordinates.pptx", + "entities": 10, + "subtype": "creative", + "bucket": "document" + }, + { + "name": "University of North Texas Cover letter.pdf", + "entities": 14, + "subtype": "reference", + "bucket": "document" + }, + { + "name": "New Mexico Cover Letter.docx", + "entities": 13, + "subtype": "reference", + "bucket": "document" + } + ] +} \ No newline at end of file diff --git a/experiments/e1_8_eval.json b/experiments/e1_8_eval.json new file mode 100644 index 0000000..34dba2e --- /dev/null +++ b/experiments/e1_8_eval.json @@ -0,0 +1,1053 @@ +{ + "subsample_a": [ + { + "name": "Claude: Lubbock on everything album lyrics", + "prod_preds": 11, + "prod_edges": 17, + "e14_preds": 4, + "e14_edges": 5, + "tf_preds": 10, + "tf_edges": 18, + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Songwriting", + "Storytelling", + "Literary Analysis" + ], + "frame_relationships": "The Songwriting frame is the primary focus of this content, with the Storytelling and Literary Analysis frames providing context and analysis for understanding Terry Allen's writing style.", + "extraction_orientation": "Look for specific elements of Allen's writing style as outlined by Claude, including conversational deadpan, specificity as worldbuilding, humor next to heartbreak, the 'report from the inside' voice, short, punchy, declarative sentences, and characters as shorthand for the place.", + "one_sentence_summary": "A discussion about Terry Allen's album Lubbock (on Everything) and its writing style, with guidance on how to emulate it for a creative project" + }, + "e14_delta_preds": -7, + "tf_delta_vs_prod": -9.090909090909092, + "e14_delta_vs_prod": -63.63636363636363 + }, + { + "name": "ChatGPT: Tulsa Concept Album Guide", + "prod_preds": 10, + "prod_edges": 19, + "e14_preds": 4, + "e14_edges": 16, + "tf_preds": 10, + "tf_edges": 18, + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Personal_Narrative", + "Place_Exploration", + 
"Autobiography" + ], + "frame_relationships": "The Personal_Narrative and Place_Exploration frames are intertwined, with the album being a personal account of the writer's experiences in Tulsa. The Autobiography frame is used to disguise the concept album as a traditional autobiography.", + "extraction_orientation": "The most relationship-rich content can be found in the vignettes and track descriptions, as they provide insights into the writer's personal experiences, emotions, and perceptions of Tulsa.", + "one_sentence_summary": "The document describes a planned concept album about the writer's personal experiences growing up in Tulsa, Oklahoma, inspired by Terry Allen's *Lubbock (on Everything)*, with a focus on exploring themes related to race, religion, nostalgia, and departure." + }, + "e14_delta_preds": -6, + "tf_delta_vs_prod": 0.0, + "e14_delta_vs_prod": -60.0 + }, + { + "name": "ChatGPT: Rhino 3D object flow", + "prod_preds": 16, + "prod_edges": 31, + "e14_preds": 11, + "e14_edges": 28, + "tf_preds": 10, + "tf_edges": 22, + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Digital Fabrication", + "Philosophical Concepts", + "Design Process" + ], + "frame_relationships": "The philosophical concepts provide a framework for understanding and describing digital fabrication processes.", + "extraction_orientation": "Focus on the mapping between the philosophical concepts and the digital fabrication processes", + "one_sentence_summary": "Discussion about how Deleuze's ideas map onto digital fabrication processes in architecture, design, and fabrication labs" + }, + "e14_delta_preds": -5, + "tf_delta_vs_prod": -37.5, + "e14_delta_vs_prod": -31.25 + }, + { + "name": "Claude: SUNY faculty conflict of interest policies", + "prod_preds": 7, + "prod_edges": 10, + "e14_preds": 3, + "e14_edges": 3, + "tf_preds": 6, + "tf_edges": 7, + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Educational Policy", + "Workplace Ethics", + "Conflict of 
Interest", + "Academia-Industry Collaboration" + ], + "frame_relationships": "The document primarily discusses the SUNY conflict of interest policy for faculty members, focusing on outside work and consulting. The system-level policy and specific campus handbooks are provided as resources.", + "extraction_orientation": "Look for details about the nature of the consulting firm, its relationship with the academic work, and compliance with state ethics laws and SUNY policies.", + "one_sentence_summary": "This document outlines SUNY conflict of interest policies for faculty members engaging in outside work or consulting." + }, + "e14_delta_preds": -4, + "tf_delta_vs_prod": -14.285714285714285, + "e14_delta_vs_prod": -57.14285714285714 + }, + { + "name": "Claude: Interview presentation research and preparation", + "prod_preds": 9, + "prod_edges": 10, + "e14_preds": 5, + "e14_edges": 10, + "tf_preds": 6, + "tf_edges": 11, + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Digital-Physical Interface", + "Emerging Technologies" + ], + "frame_relationships": "The Digital-Physical Interface frame centers the workflow between digital models and physical objects, while Emerging Technologies highlights the specific tools (AI, XR) used in this workflow.", + "extraction_orientation": "Look for instances where the workflow between digital models and physical objects is described, as well as mentions of AI or XR technologies being used in this context.", + "one_sentence_summary": "A discussion on the intersection of AI and XR in a design project, focusing on the digital-physical workflow and its implications for teaching." 
+ }, + "e14_delta_preds": -4, + "tf_delta_vs_prod": -33.33333333333333, + "e14_delta_vs_prod": -44.44444444444444 + }, + { + "name": "ChatGPT: Respect Individual Interests for Christmas", + "prod_preds": 3, + "prod_edges": 3, + "e14_preds": 1, + "e14_edges": 1, + "tf_preds": 2, + "tf_edges": 2, + "bucket": "low", + "taxfree_metadata": { + "active_frames": [ + "Gift Giving", + "Interests and Preferences", + "Holiday Season" + ], + "frame_relationships": "The frames of gift giving and interests/preferences are intertwined as the document discusses choosing gifts based on a recipient's unique hobbies and preferences during the holiday season.", + "extraction_orientation": "Focus on relational content that describes how specific gifts align with an individual's interests or preferences, and consider the context of the holiday season.", + "one_sentence_summary": "The document provides advice on choosing Christmas gifts based on an individual's unique interests and preferences." + }, + "e14_delta_preds": -2, + "tf_delta_vs_prod": -33.33333333333333, + "e14_delta_vs_prod": -66.66666666666666 + }, + { + "name": "University of North Texas Cover letter.pdf", + "prod_preds": 10, + "prod_edges": 13, + "e14_preds": 8, + "e14_edges": 12, + "tf_preds": 10, + "tf_edges": 13, + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Education", + "Job Application", + "Artwork Creation", + "Academia" + ], + "frame_relationships": "The 'Education' and 'Job Application' frames are the primary drivers of the document, with 'Artwork Creation' providing contextual details. 
The 'Academia' frame serves as a broader context for all other frames.", + "extraction_orientation": "Focus on relations between the applicant's educational background, job application, and artworks to understand their qualifications and fit for the position.", + "one_sentence_summary": "This document details an artist's application for a Senior Lecturer position in Computing in the Visual Arts at University of North Texas, highlighting their educational background, teaching experience, and artistic practice." + }, + "e14_delta_preds": -2, + "tf_delta_vs_prod": 0.0, + "e14_delta_vs_prod": -20.0 + }, + { + "name": "Claude: Finding ideal rural housing near University of Utah", + "prod_preds": 14, + "prod_edges": 28, + "e14_preds": 13, + "e14_edges": 30, + "tf_preds": 17, + "tf_edges": 33, + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Geography", + "Community", + "Housing", + "Transportation" + ], + "frame_relationships": "The geographical location and community characteristics inform the housing and transportation options in each area.", + "extraction_orientation": "Focus on the specific details about Morgan, Kamas Valley, and Heber City/Midway regarding their character, housing, commute specifics to U of U, demographics, and vegetation.", + "one_sentence_summary": "Document discusses potential rural living situations near University of Utah, comparing Morgan, Kamas Valley (Oakley, Kamas, Francis), and Heber City/Midway in terms of geography, community, housing, transportation, and demographics." 
+ }, + "e14_delta_preds": -1, + "tf_delta_vs_prod": 21.428571428571427, + "e14_delta_vs_prod": -7.142857142857142 + }, + { + "name": "ChatGPT: SEC coaches with OSU ties", + "prod_preds": 5, + "prod_edges": 25, + "e14_preds": 4, + "e14_edges": 24, + "tf_preds": 4, + "tf_edges": 24, + "bucket": "high", + "taxfree_metadata": { + "error": { + "message": "An error occurred", + "details": [ + { + "title": "OEmbed request error", + "url": "https://api.twitter.com/1.1/oembed?url=https%3A%2F%2Fmobile.twitter.com%2Fstatus%2F1657087050422960128", + "message": "Failed to load external media.\nThis is likely not a stable URL for an oEmbed." + } + ] + } + }, + "e14_delta_preds": -1, + "tf_delta_vs_prod": -20.0, + "e14_delta_vs_prod": -20.0 + }, + { + "name": "Claude: Bonding ASA 3D printed parts", + "prod_preds": 5, + "prod_edges": 10, + "e14_preds": 4, + "e14_edges": 11, + "tf_preds": 5, + "tf_edges": 9, + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Bonding", + "Material_Selection", + "3D_Printed_Parts", + "Adhesives" + ], + "frame_relationships": "The frames 'Bonding' and 'Material_Selection' are active, with the latter providing context for the former by discussing suitable adhesives for bonding 3D printed parts.", + "extraction_orientation": "Focus on the properties, application areas, and relative strength of the discussed adhesives in relation to ASA 3D printed parts.", + "one_sentence_summary": "This document provides recommendations on various adhesives for bonding ASA 3D printed parts." 
+ }, + "e14_delta_preds": -1, + "tf_delta_vs_prod": 0.0, + "e14_delta_vs_prod": -20.0 + }, + { + "name": "ChatGPT: Title: User request summary.", + "prod_preds": 3, + "prod_edges": 3, + "e14_preds": 2, + "e14_edges": 2, + "tf_preds": 3, + "tf_edges": 3, + "bucket": "low", + "taxfree_metadata": { + "active_frames": [ + "Event", + "Speech", + "Hosting", + "Communication" + ], + "frame_relationships": "The 'Event' frame is the main context, within which the 'Speech', 'Hosting', and 'Communication' frames are active.", + "extraction_orientation": "Look for the relationships between the host, the event, and the speech content in the introductory text of the document.", + "one_sentence_summary": "The document revolves around a conversation about creating an introductory speech for a design week event." + }, + "e14_delta_preds": -1, + "tf_delta_vs_prod": 0.0, + "e14_delta_vs_prod": -33.33333333333333 + }, + { + "name": "ChatGPT: Scholarship Recommendation Letter Tips", + "prod_preds": 2, + "prod_edges": 2, + "e14_preds": 1, + "e14_edges": 1, + "tf_preds": 1, + "tf_edges": 1, + "bucket": "low", + "taxfree_metadata": { + "active_frames": [ + "Recommendation", + "Education", + "Professional Skills", + "Personal Characteristics", + "Achievements", + "Future Potential" + ], + "frame_relationships": "The frames interact to provide a comprehensive assessment of the candidate's qualifications for a scholarship, focusing on both their academic and personal qualities.", + "extraction_orientation": "Relational content to focus on includes relationships between the recommender and applicant, the applicant's achievements, contributions, and potential, as well as examples supporting the description of personal traits.", + "one_sentence_summary": "This document outlines guidelines for writing a scholarship recommendation letter, covering key areas such as introduction, purpose, academic and professional qualities, personal characteristics, achievements, future potential, conclusion, and 
letter formatting." + }, + "e14_delta_preds": -1, + "tf_delta_vs_prod": -50.0, + "e14_delta_vs_prod": -50.0 + } + ], + "subsample_b": [ + { + "name": "ChatGPT: Job application comparison", + "base_preds": 13, + "base_edges": 22, + "std_preds": 15, + "std_edges": 21, + "tf_preds": 14, + "tf_edges": 18, + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Job Application", + "Education & Qualifications Comparison", + "School Leadership Profile", + "Arts Education & Facilities Management" + ], + "frame_relationships": "The frames interact by comparing the applicant's qualifications to the job requirements, peer deans' profiles, and the mission of the School of the Arts at Western.", + "extraction_orientation": "Relational content a knowledge graph extractor should look for includes comparisons between the applicant's profile and job requirements, peer dean profiles, and the School of the Arts at Western's mission.", + "one_sentence_summary": "This document discusses a job application for a Dean position in the new School of the Arts at Western, comparing the applicant's qualifications to job requirements and peer deans' profiles while emphasizing their fit with the School's mission." + }, + "standard_metadata": { + "language": "en", + "char_length": 1569, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": true, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion about a job application for the dean position at Western's School of the Arts, comparing it to other similar schools and analyzing the applicant's qualifications." 
+ }, + "std_delta_vs_base": 15.384615384615385, + "tf_delta_vs_base": 7.6923076923076925 + }, + { + "name": "ChatGPT: External review for tenure", + "base_preds": 3, + "base_edges": 5, + "std_preds": 5, + "std_edges": 5, + "tf_preds": 4, + "tf_edges": 7, + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Tenure Review", + "Academic Evaluation", + "Peer Review" + ], + "frame_relationships": "The Tenure Review frame is the central and overarching context in which the other frames (Academic Evaluation, Peer Review) are activated.", + "extraction_orientation": "Focus on the specific criteria, standards, and expectations related to tenure review, as well as the evaluation process itself.", + "one_sentence_summary": "This document provides guidelines for an external evaluator in a tenure review process, detailing what materials to expect, how to structure the evaluation letter, and what aspects to consider in assessing research/creative activity, teaching, service, and overall recommendation." + }, + "standard_metadata": { + "language": "en", + "char_length": 1963, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document provides guidelines for an external reviewer to evaluate a faculty member's record for tenure." 
+ }, + "std_delta_vs_base": 66.66666666666666, + "tf_delta_vs_base": 33.33333333333333 + }, + { + "name": "Claude: University of Utah interview teaching example", + "base_preds": 14, + "base_edges": 24, + "std_preds": 14, + "std_edges": 22, + "tf_preds": 15, + "tf_edges": 31, + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Education", + "Teaching", + "Computational Design", + "Programming for Arts", + "3D Printing" + ], + "frame_relationships": "The document discusses a university course, its curriculum, and related projects; these topics are interconnected as they all revolve around the teaching of computational design in a university setting.", + "extraction_orientation": "Look for relationships between the course content, its objectives, and practical applications, including programming languages used, projects undertaken, and equipment utilized.", + "one_sentence_summary": "The document describes a university course offering on computational design for arts, covering topics such as programming in Processing/Java, 3D printing, advanced slicer techniques, Stratasys equipment usage, and certification options." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 1624, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion about a course teaching computational techniques for 3D design at the University of Utah" + }, + "std_delta_vs_base": 0.0, + "tf_delta_vs_base": 7.142857142857142 + }, + { + "name": "ChatGPT: Starting Dropship Gun Business", + "base_preds": 9, + "base_edges": 15, + "std_preds": 13, + "std_edges": 18, + "tf_preds": 18, + "tf_edges": 19, + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "GunBusinessSetup", + "FFLCompliance", + "DistributorRelationshipBuilding", + "Ecommerce" + ], + "frame_relationships": "The 'GunBusinessSetup' frame involves navigating multiple layers of compliance (FFL, zoning, and state/local laws) while building relationships with distributors and setting up an e-commerce platform.", + "extraction_orientation": "Focus on the detailed workflows for obtaining FFL, setting up compliant e-commerce store, finding a distributor, and establishing B2B credit relationship with them.", + "one_sentence_summary": "A guide detailing how to start a dropship gun business in the US, including the legal requirements, necessary setup steps, and tips for building relationships with distributors." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 2764, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document provides a detailed guide on how to set up a dropship gun business in the U.S., covering licensing, setting up compliant premises, understanding regulations, dealing with payments and e-commerce platforms, and building relationships with distributors." + }, + "std_delta_vs_base": 44.44444444444444, + "tf_delta_vs_base": 100.0 + }, + { + "name": "ChatGPT: Analyze business plan", + "base_preds": 8, + "base_edges": 11, + "std_preds": 8, + "std_edges": 10, + "tf_preds": 8, + "tf_edges": 9, + "bucket": "high", + "taxfree_metadata": {}, + "standard_metadata": { + "assistant_name": "aaron", + "role": "assistant", + "version": "7.1.0-alpha", + "name": "Business Plan Analysis - Employees and Contractors Only", + "messages": [ + { + "role": "system", + "content": "Here's an analysis of the business plan focusing only on the employees and contractors expenses, comparing them to the provided OpEx bands." 
+ }, + { + "role": "assistant", + "content": "First, let me load the data from the excel file and format the columns.\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom caas_jupyter_tools import display_dataframe_to_user\npath = '/mnt/data/EAS Staffing.xlsx'\ndf = pd.read_excel(path, header=9)\ndf = df.dropna(axis=1, how='all')\ndf.columns = [str(c).strip().lower().replace('\n',' ').replace(' ', '_') for c in df.columns]\n" + }, + { + "role": "assistant", + "content": "Next, let's identify the month columns and fill any missing values with 0.\n\nmonth_cols = [c for c in df.columns if c.startswith('m') and c[1:].lstrip('-').isdigit()]\nmonth_cols = [c for c in month_cols if int(c[1:]) >= 0]\nmonth_cols = sorted(month_cols, key=lambda c: int(c[1:]))\nfor c in month_cols:\ndf[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)" + }, + { + "role": "assistant", + "content": "Now, let's filter the data to only include employees and contractors (exclude subtotals/notes).\n\nmask_emp_con = df['type'].astype(str).str.lower().str.contains('employee|contract', na=False)\nempcon = df.loc[mask_emp_con].copy()" + }, + { + "role": "assistant", + "content": "Next, let's compute the total staffing costs for each month.\n\ntotals = pd.DataFrame({ 'MonthIndex': [int(c[1:]) for c in month_cols], 'Staffing_Total': [empcon[c].sum() for c in month_cols] })\n" + }, + { + "role": "assistant", + "content": "Now, let's define the years 1 and 2 masks based on the total index.\ny1 = totals['MonthIndex'] <= 11\ny2 = (totals['MonthIndex'] >= 12) & (totals['MonthIndex'] <= 23)" + }, + { + "role": "assistant", + "content": "Finally, let's compare the total staffing costs to the provided OpEx bands for years 1 and 2.\n\nYear 1 (M1 - M11): Staffing_Total: $" + }, + { + "role": "assistant", + "content": "\tSum(totals.loc[y1, 'Staffing_Total']): ${:,.0f}\n\nYear 2 (M12 - M24): Staffing_Total: $" + }, + { + "role": "assistant", + "content": "\tSum(totals.loc[y2, 
'Staffing_Total']): ${:,.0f}" + } + ] + }, + "std_delta_vs_base": 0.0, + "tf_delta_vs_base": 0.0 + }, + { + "name": "ChatGPT: Outdoor Layering Explained", + "base_preds": 14, + "base_edges": 16, + "std_preds": 6, + "std_edges": 16, + "tf_preds": 14, + "tf_edges": 17, + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Clothing Layering", + "Outdoor Clothing", + "Sweat Management", + "Insulation", + "Protection", + "Weather" + ], + "frame_relationships": "The frames interact by defining layers in an outdoor clothing system for managing sweat, insulation, and protection from weather conditions.", + "extraction_orientation": "Focus on the relationships between layers and how they are mixed-and-matched depending on temperature, wind, precipitation, and activity level.", + "one_sentence_summary": "The document describes a 3-layer outdoor clothing system for managing sweat, insulation, and protection from wind, rain, and snow during various activities." + }, + "standard_metadata": { + "language": "en", + "char_length": 3761, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "The document explains the concept of layering outdoor clothing for different activities and weather conditions, including base layer, mid-layer, and outer layer, as well as hand and leg layers." 
+ }, + "std_delta_vs_base": -57.14285714285714, + "tf_delta_vs_base": 0.0 + }, + { + "name": "ChatGPT: Limits in Calculus.", + "base_preds": 8, + "base_edges": 15, + "std_preds": 6, + "std_edges": 14, + "tf_preds": 4, + "tf_edges": 8, + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Mathematics", + "Calculus", + "Functions", + "Limits" + ], + "frame_relationships": "The 'Mathematics' and 'Calculus' frames are active. The 'Functions', 'Limits', and 'Calculus' frames are interconnected, as the document discusses the concept of limits in relation to functions within calculus.", + "extraction_orientation": "Look for relational content between functions, their inputs, and the behavior of those functions as they approach certain values (limits).", + "one_sentence_summary": "The document explains the concept of limits in calculus and provides real-world examples to illustrate its use." + }, + "standard_metadata": { + "language": "en", + "char_length": 1269, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion on the concept of limits in calculus and their real-world applications" + }, + "std_delta_vs_base": -25.0, + "tf_delta_vs_base": -50.0 + }, + { + "name": "ChatGPT: Academic Program Director Role", + "base_preds": 4, + "base_edges": 10, + "std_preds": 5, + "std_edges": 11, + "tf_preds": 5, + "tf_edges": 11, + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "academic_program", + "administration", + "faculty_management", + "student_engagement", + "curriculum_development" + ], + "frame_relationships": "The 'academic 
program' frame underpins the entire document, with 'administration', 'faculty management', and 'student engagement' frames supporting it. The 'curriculum development' frame is a core function that intersects all other frames.", + "extraction_orientation": "Look for relationships between roles (e.g., program director to faculty, students, administration), responsibilities within each role, and interactions between these responsibilities", + "one_sentence_summary": "This document outlines the key duties of an academic program director in Design and Digital Fabrication programs, focusing on managing faculty, engaging with industry partners, developing curriculum, assessing student outcomes, and fostering student success." + }, + "standard_metadata": { + "language": "en", + "char_length": 4507, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Document describes the role and responsibilities of an Academic Program Director for Design and Digital Fabrication programs" + }, + "std_delta_vs_base": 25.0, + "tf_delta_vs_base": 25.0 + }, + { + "name": "ChatGPT: Lonely Island Poop Skit", + "base_preds": 4, + "base_edges": 11, + "std_preds": 4, + "std_edges": 14, + "tf_preds": 3, + "tf_edges": 7, + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Music", + "Humor" + ], + "frame_relationships": "The Music frame is used to deliver comedic content within the Humor frame.", + "extraction_orientation": "Look for lyrics or song details that provide a deeper understanding of the comedic narrative.", + "one_sentence_summary": "The document discusses a Lonely Island 
track, 'Perfect Saturday', which is a comedic musical number about a poop-related incident during a sexual encounter." + }, + "standard_metadata": { + "language": "en", + "char_length": 1069, + "primary_format": "mixed", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": false, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "conversational", + "one_sentence_summary": "A conversation about finding a specific skit from Lonely Island's albums 'Turtleneck & Chain', 'The Wack Album', or 'Incredibad'" + }, + "std_delta_vs_base": 0.0, + "tf_delta_vs_base": -25.0 + }, + { + "name": "ChatGPT: Parse Tidal playlist", + "base_preds": 3, + "base_edges": 4, + "std_preds": 2, + "std_edges": 4, + "tf_preds": 6, + "tf_edges": 92, + "bucket": "mid", + "taxfree_metadata": { + "active_speaker": "Assistant", + "finetuned_response": { + "source_language": "en", + "target_language": "en", + "dialog_id": 412769058, + "generated_utts": [ + { + "role": "Assistant", + "text": "The follow-up playlist for your 80s night has been carefully curated with 25 more danceable tracks that maintain the same vibe as your original list. Here it is:\n\n## \ud83e\udea9 25 More Danceable '80s Tracks (Alt-Club Vibe, Fits Your Original Playlist)\n### \ud83d\udd25 Synthpop / New Wave / Dance-Rock:\n1. **Alphaville \u2013 Forever Young**\n2. **Depeche Mode \u2013 Enjoy the Silence**\n3. **Tears for Fears \u2013 Shout** *(club remix)*\n4. **OMD \u2013 If You Leave**\n5. **The Human League \u2013 Love Action (I Believe in Love)**\n6. **Men Without Hats \u2013 Safety Dance**\n7. **Culture Club \u2013 Time (Clock of the Heart)**\n8. **A Flock of Seagulls \u2013 I Ran (So Far Away)** *(extended 12\u201d version)*\n9. 
**The Police \u2013 Don't Stand So Close to Me**\n10. **Pet Shop Boys \u2013 It's a Sin**" + }, + { + "role": "Assistant", + "text": "### \ud83d\udc83 Funky Post-Punk / Art-Disco Bangers:\n11. **The Stranglers \u2013 Golden Brown**\n12. **Salt-N-Pepa \u2013 Push It** *(early hip hop infused with disco)*\n13. **Chic \u2013 Le Freak**\n14. **Rick James \u2013 Super Freak**\n15. **The Buggles \u2013 Video Killed the Radio Star** *(early MTV staple)*" + }, + { + "role": "Assistant", + "text": "### \ud83e\uddca Sleek & Cool Euro-Dance / Underground Synth:\n16. **Kraftwerk \u2013 The Model**\n17. **Japan \u2013 Ghosts** *(haunting, synthpop ballad)*\n18. **New Order \u2013 Blue Monday (Extended 12\u201d Version)** *(one of the longest running UK singles)*\n19. **Soft Cell \u2013 Tainted Love** *(classic remake)*\n20. **Yazoo \u2013 Only You**" + }, + { + "role": "Assistant", + "text": "### \ud83c\udf00 Optional Bonus Vibes (For future expansion)\n- **Thompson Twins \u2013 Hold Me Now** *(power ballad)*\n- **ABC \u2013 The Look of Love** *(disco-infused synthpop)*\n- **Simple Minds \u2013 Alive and Kicking** *(anthemic, uplifting)*" + } + ] + } + }, + "standard_metadata": { + "error": "parse_failed", + "raw": "{\n\"query_id\": \"4b6a7e10-c39d-481f-ae4a-79e3ba6c4cdd\",\n\"results\": [\n {\n \"answer\": \"Sure, I can provide another 25 songs in the same flavor that leans more towards a darker, sexier after-midni" + }, + "std_delta_vs_base": -33.33333333333333, + "tf_delta_vs_base": 100.0 + }, + { + "name": "NO thesis proposal.pdf", + "base_preds": 6, + "base_edges": 15, + "std_preds": 2, + "std_edges": 6, + "tf_preds": 3, + "tf_edges": 7, + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Artist", + "Career Progression", + "Identity", + "Personal Interests", + "Product Design" + ], + "frame_relationships": "The artist's career progression is intertwined with their identity and personal interests, shaping the subject matter and style of their product 
designs.", + "extraction_orientation": "Focus on relationships between the artist's professional evolution, personal identity, and the creation process of their product designs.", + "one_sentence_summary": "The document discusses a graduate thesis proposal for a 3D modeler, focusing on creating whimsical products with specific subject matters: critters, fish, and poodles." + }, + "standard_metadata": { + "language": "en", + "char_length": 602, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": false, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "personal", + "one_sentence_summary": "The document discusses an artist's graduate thesis proposal for a project centered around creating whimsical product designs featuring critters, fish, and poodles." 
+ }, + "std_delta_vs_base": -66.66666666666666, + "tf_delta_vs_base": -50.0 + }, + { + "name": "PWM.pdf", + "base_preds": 3, + "base_edges": 3, + "std_preds": 2, + "std_edges": 2, + "tf_preds": 2, + "tf_edges": 2, + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Pulse Width Modulation (PWM)", + "Analog to Digital Conversion", + "Arduino ADC" + ], + "frame_relationships": "The PWM and Analog to Digital Conversion frames are directly related as they both discuss the principles and applications of PWM in relation to analog-to-digital conversion, while the Arduino ADC frame provides specific context regarding the use of these concepts with an Arduino microcontroller.", + "extraction_orientation": "Look for relationships between PWM and its parameters (time, voltage levels) and their impact on Analog to Digital Conversion resolution and Arduino ADC functions", + "one_sentence_summary": "This document discusses the principles of Pulse Width Modulation, its implementation in Arduino ADC, and the relationship between PWM and analog-to-digital conversion." + }, + "standard_metadata": { + "language": "en", + "char_length": 427, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "The document discusses Pulse Width Modulation (PWM) and Analog to Digital Conversion in the context of Arduino." 
+ }, + "std_delta_vs_base": -33.33333333333333, + "tf_delta_vs_base": -33.33333333333333 + }, + { + "name": "Will_It_Print.pdf", + "base_preds": 2, + "base_edges": 2, + "std_preds": 2, + "std_edges": 2, + "tf_preds": 2, + "tf_edges": 2, + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Part Design", + "Manufacturing Process", + "Material Removal", + "Surface Finish", + "Support Structures", + "Overhang Features", + "Internal Cavities", + "Tolerances", + "Stress Concentration" + ], + "frame_relationships": "The document discusses the design and manufacturing of a part, focusing on its features, material removal, surface finish, support structures, overhangs, internal cavities, tolerances, and stress concentration.", + "extraction_orientation": "Focus on the relationships between the part's design, manufacturing process, and associated features such as overhangs, internal cavities, tolerances, and stress concentration.", + "one_sentence_summary": "This document provides instructions for reducing the number of printing and prototyping failures by Joran Booth, focusing on various aspects of part design and manufacturing." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 803, + "primary_format": "structured", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "technical", + "one_sentence_summary": "This document provides instructions for reducing the number of printing and prototyping failures by Joran Booth for each category of a part to be printed" + }, + "std_delta_vs_base": 0.0, + "tf_delta_vs_base": 0.0 + }, + { + "name": "Kim Kedem Ind Study F2025 Syllabus.docx", + "base_preds": 13, + "base_edges": 18, + "std_preds": 4, + "std_edges": 9, + "tf_preds": 10, + "tf_edges": 13, + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Education", + "Project", + "Software Development", + "3D Printing" + ], + "frame_relationships": "The Education frame serves as the context for the Project and Software Development frames, with the 3D Printing frame acting as a specific application of these skills.", + "extraction_orientation": "A knowledge graph extractor should focus on relationships between educational concepts, project goals, software development requirements, and 3D printing techniques.", + "one_sentence_summary": "This independent study outlines a project for the design and implementation of a custom 3D printing slicer in Rhino Grasshopper environment." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 1857, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document outlines an independent study focused on designing and implementing a custom 3D printing slicer in the Rhino Grasshopper environment." + }, + "std_delta_vs_base": -69.23076923076923, + "tf_delta_vs_base": -23.076923076923077 + }, + { + "name": "Aaron Nelson Graduate Transcript.pdf", + "base_preds": 5, + "base_edges": 6, + "std_preds": 4, + "std_edges": 4, + "tf_preds": 5, + "tf_edges": 5, + "bucket": "document", + "taxfree_metadata": {}, + "standard_metadata": { + "Name": "Aaron Nelson", + "Student ID": "00917133", + "University": "University of Massachusetts Dartmouth", + "Address": "285 Old Westport Road N Dartmouth, MA 027472356", + "Degree Awarded": "Master of Fine Arts", + "Confer Date": "05/23/2010", + "Degree GPA": "3.885", + "Program of Study": { + "Artisanry - Jewelry/Metals": [ + { + "Course": "Graduate Seminar I", + "Grade": "B", + "Credits": "3.00", + "Points": "9.000" + }, + { + "Course": "Instructional Development", + "Grade": "B+", + "Credits": "3.00", + "Points": "9.900" + }, + { + "Course": "Graduate Seminar II", + "Grade": "A-", + "Credits": "3.00", + "Points": "11.100" + }, + { + "Course": "Graduate Standard I: Metals/Jewelry", + "Grade": "A", + "Credits": "9.00", + "Points": "36.000" + }, + { + "Course": "Methods & Theory in Art History", + "Grade": "A", + "Credits": "3.00", + "Points": "12.000" + }, + { + "Course": "Graduate Standard II: Metals/Jewelry", + "Grade": null, + "Credits": "9.00", + "Points": 
null + }, + { + "Course": "Visual Thesis", + "Grade": null, + "Credits": "6.00", + "Points": null + }, + { + "Course": "Written Thesis", + "Grade": null, + "Credits": "3.00", + "Points": null + }, + { + "Course": "Independent Study (Arduino Microcontroller Basics)", + "Grade": "A", + "Credits": "6.00", + "Points": "24.000" + }, + { + "Course": "Independent Study (Furniture Failing)", + "Grade": "A", + "Credits": "3.00", + "Points": "12.000" + }, + { + "Course": "Visual Thesis (Continuation)", + "Grade": null, + "Credits": "6.00", + "Points": null + }, + { + "Course": "Writing Seminar", + "Grade": "A-", + "Credits": "3.00", + "Points": "11.100" + }, + { + "Course": "Thesis Report (Continuation)", + "Grade": null, + "Credits": "3.00", + "Points": null + }, + { + "Course": "Transcript Note: Lessons in Friction and Torque or: How I learned to Stop Worrying and Love the Information Age (Continuation)", + "Grade": null, + "Credits": null, + "Points": null + } + ] + }, + "Non Credit Courses": [ + { + "Course": "Metals Open Studio", + "Credits": null, + "Points": null + } + ] + }, + "std_delta_vs_base": -20.0, + "tf_delta_vs_base": 0.0 + } + ] +} \ No newline at end of file diff --git a/experiments/e1_8_results.json b/experiments/e1_8_results.json new file mode 100644 index 0000000..a96b49b --- /dev/null +++ b/experiments/e1_8_results.json @@ -0,0 +1,909 @@ +{ + "subsample_a": [ + { + "name": "Claude: Lubbock on everything album lyrics", + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Songwriting", + "Storytelling", + "Literary Analysis" + ], + "frame_relationships": "The Songwriting frame is the primary focus of this content, with the Storytelling and Literary Analysis frames providing context and analysis for understanding Terry Allen's writing style.", + "extraction_orientation": "Look for specific elements of Allen's writing style as outlined by Claude, including conversational deadpan, specificity as worldbuilding, humor next to heartbreak, the 
'report from the inside' voice, short, punchy, declarative sentences, and characters as shorthand for the place.", + "one_sentence_summary": "A discussion about Terry Allen's album Lubbock (on Everything) and its writing style, with guidance on how to emulate it for a creative project" + }, + "taxfree_orientation": "Active frames: Songwriting, Storytelling, Literary Analysis. Frame relationships: The Songwriting frame is the primary focus of this content, with the Storytelling and Literary Analysis frames providing context and analysis for understanding Terry Allen's writing style. Extraction focus: Look for specific elements of Allen's writing style as outlined by Claude, including conversational deadpan, specificity as worldbuilding, humor next to heartbreak, the 'report from the inside' voice, short, punchy, declarative sentences, and characters as shorthand for the place. Summary: A discussion about Terry Allen's album Lubbock (on Everything) and its writing style, with guidance on how to emulate it for a creative project", + "e14_prod_preds": 11, + "e14_cascade_preds": 4, + "e14_delta_preds": -7, + "e14_prod_edges": 17, + "e14_cascade_edges": 5, + "e14_delta_edges": -12 + }, + { + "name": "ChatGPT: Tulsa Concept Album Guide", + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Personal_Narrative", + "Place_Exploration", + "Autobiography" + ], + "frame_relationships": "The Personal_Narrative and Place_Exploration frames are intertwined, with the album being a personal account of the writer's experiences in Tulsa. 
The Autobiography frame is used to disguise the concept album as a traditional autobiography.", + "extraction_orientation": "The most relationship-rich content can be found in the vignettes and track descriptions, as they provide insights into the writer's personal experiences, emotions, and perceptions of Tulsa.", + "one_sentence_summary": "The document describes a planned concept album about the writer's personal experiences growing up in Tulsa, Oklahoma, inspired by Terry Allen's *Lubbock (on Everything)*, with a focus on exploring themes related to race, religion, nostalgia, and departure." + }, + "taxfree_orientation": "Active frames: Personal_Narrative, Place_Exploration, Autobiography. Frame relationships: The Personal_Narrative and Place_Exploration frames are intertwined, with the album being a personal account of the writer's experiences in Tulsa. The Autobiography frame is used to disguise the concept album as a traditional autobiography. Extraction focus: The most relationship-rich content can be found in the vignettes and track descriptions, as they provide insights into the writer's personal experiences, emotions, and perceptions of Tulsa. 
Summary: The document describes a planned concept album about the writer's personal experiences growing up in Tulsa, Oklahoma, inspired by Terry Allen's *Lubbock (on Everything)*, with a focus on exploring themes related to race, religion, nostalgia, and departure.", + "e14_prod_preds": 10, + "e14_cascade_preds": 4, + "e14_delta_preds": -6, + "e14_prod_edges": 19, + "e14_cascade_edges": 16, + "e14_delta_edges": -3 + }, + { + "name": "ChatGPT: Rhino 3D object flow", + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Digital Fabrication", + "Philosophical Concepts", + "Design Process" + ], + "frame_relationships": "The philosophical concepts provide a framework for understanding and describing digital fabrication processes.", + "extraction_orientation": "Focus on the mapping between the philosophical concepts and the digital fabrication processes", + "one_sentence_summary": "Discussion about how Deleuze's ideas map onto digital fabrication processes in architecture, design, and fabrication labs" + }, + "taxfree_orientation": "Active frames: Digital Fabrication, Philosophical Concepts, Design Process. Frame relationships: The philosophical concepts provide a framework for understanding and describing digital fabrication processes. 
Extraction focus: Focus on the mapping between the philosophical concepts and the digital fabrication processes Summary: Discussion about how Deleuze's ideas map onto digital fabrication processes in architecture, design, and fabrication labs", + "e14_prod_preds": 16, + "e14_cascade_preds": 11, + "e14_delta_preds": -5, + "e14_prod_edges": 31, + "e14_cascade_edges": 28, + "e14_delta_edges": -3 + }, + { + "name": "Claude: SUNY faculty conflict of interest policies", + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Educational Policy", + "Workplace Ethics", + "Conflict of Interest", + "Academia-Industry Collaboration" + ], + "frame_relationships": "The document primarily discusses the SUNY conflict of interest policy for faculty members, focusing on outside work and consulting. The system-level policy and specific campus handbooks are provided as resources.", + "extraction_orientation": "Look for details about the nature of the consulting firm, its relationship with the academic work, and compliance with state ethics laws and SUNY policies.", + "one_sentence_summary": "This document outlines SUNY conflict of interest policies for faculty members engaging in outside work or consulting." + }, + "taxfree_orientation": "Active frames: Educational Policy, Workplace Ethics, Conflict of Interest, Academia-Industry Collaboration. Frame relationships: The document primarily discusses the SUNY conflict of interest policy for faculty members, focusing on outside work and consulting. The system-level policy and specific campus handbooks are provided as resources. Extraction focus: Look for details about the nature of the consulting firm, its relationship with the academic work, and compliance with state ethics laws and SUNY policies. 
Summary: This document outlines SUNY conflict of interest policies for faculty members engaging in outside work or consulting.", + "e14_prod_preds": 7, + "e14_cascade_preds": 3, + "e14_delta_preds": -4, + "e14_prod_edges": 10, + "e14_cascade_edges": 3, + "e14_delta_edges": -7 + }, + { + "name": "Claude: Interview presentation research and preparation", + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Digital-Physical Interface", + "Emerging Technologies" + ], + "frame_relationships": "The Digital-Physical Interface frame centers the workflow between digital models and physical objects, while Emerging Technologies highlights the specific tools (AI, XR) used in this workflow.", + "extraction_orientation": "Look for instances where the workflow between digital models and physical objects is described, as well as mentions of AI or XR technologies being used in this context.", + "one_sentence_summary": "A discussion on the intersection of AI and XR in a design project, focusing on the digital-physical workflow and its implications for teaching." + }, + "taxfree_orientation": "Active frames: Digital-Physical Interface, Emerging Technologies. Frame relationships: The Digital-Physical Interface frame centers the workflow between digital models and physical objects, while Emerging Technologies highlights the specific tools (AI, XR) used in this workflow. Extraction focus: Look for instances where the workflow between digital models and physical objects is described, as well as mentions of AI or XR technologies being used in this context. 
Summary: A discussion on the intersection of AI and XR in a design project, focusing on the digital-physical workflow and its implications for teaching.", + "e14_prod_preds": 9, + "e14_cascade_preds": 5, + "e14_delta_preds": -4, + "e14_prod_edges": 10, + "e14_cascade_edges": 10, + "e14_delta_edges": 0 + }, + { + "name": "ChatGPT: Respect Individual Interests for Christmas", + "bucket": "low", + "taxfree_metadata": { + "active_frames": [ + "Gift Giving", + "Interests and Preferences", + "Holiday Season" + ], + "frame_relationships": "The frames of gift giving and interests/preferences are intertwined as the document discusses choosing gifts based on a recipient's unique hobbies and preferences during the holiday season.", + "extraction_orientation": "Focus on relational content that describes how specific gifts align with an individual's interests or preferences, and consider the context of the holiday season.", + "one_sentence_summary": "The document provides advice on choosing Christmas gifts based on an individual's unique interests and preferences." + }, + "taxfree_orientation": "Active frames: Gift Giving, Interests and Preferences, Holiday Season. Frame relationships: The frames of gift giving and interests/preferences are intertwined as the document discusses choosing gifts based on a recipient's unique hobbies and preferences during the holiday season. Extraction focus: Focus on relational content that describes how specific gifts align with an individual's interests or preferences, and consider the context of the holiday season. 
Summary: The document provides advice on choosing Christmas gifts based on an individual's unique interests and preferences.", + "e14_prod_preds": 3, + "e14_cascade_preds": 1, + "e14_delta_preds": -2, + "e14_prod_edges": 3, + "e14_cascade_edges": 1, + "e14_delta_edges": -2 + }, + { + "name": "University of North Texas Cover letter.pdf", + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Education", + "Job Application", + "Artwork Creation", + "Academia" + ], + "frame_relationships": "The 'Education' and 'Job Application' frames are the primary drivers of the document, with 'Artwork Creation' providing contextual details. The 'Academia' frame serves as a broader context for all other frames.", + "extraction_orientation": "Focus on relations between the applicant's educational background, job application, and artworks to understand their qualifications and fit for the position.", + "one_sentence_summary": "This document details an artist's application for a Senior Lecturer position in Computing in the Visual Arts at University of North Texas, highlighting their educational background, teaching experience, and artistic practice." + }, + "taxfree_orientation": "Active frames: Education, Job Application, Artwork Creation, Academia. Frame relationships: The 'Education' and 'Job Application' frames are the primary drivers of the document, with 'Artwork Creation' providing contextual details. The 'Academia' frame serves as a broader context for all other frames. Extraction focus: Focus on relations between the applicant's educational background, job application, and artworks to understand their qualifications and fit for the position. 
Summary: This document details an artist's application for a Senior Lecturer position in Computing in the Visual Arts at University of North Texas, highlighting their educational background, teaching experience, and artistic practice.", + "e14_prod_preds": 10, + "e14_cascade_preds": 8, + "e14_delta_preds": -2, + "e14_prod_edges": 13, + "e14_cascade_edges": 12, + "e14_delta_edges": -1 + }, + { + "name": "Claude: Finding ideal rural housing near University of Utah", + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Geography", + "Community", + "Housing", + "Transportation" + ], + "frame_relationships": "The geographical location and community characteristics inform the housing and transportation options in each area.", + "extraction_orientation": "Focus on the specific details about Morgan, Kamas Valley, and Heber City/Midway regarding their character, housing, commute specifics to U of U, demographics, and vegetation.", + "one_sentence_summary": "Document discusses potential rural living situations near University of Utah, comparing Morgan, Kamas Valley (Oakley, Kamas, Francis), and Heber City/Midway in terms of geography, community, housing, transportation, and demographics." + }, + "taxfree_orientation": "Active frames: Geography, Community, Housing, Transportation. Frame relationships: The geographical location and community characteristics inform the housing and transportation options in each area. Extraction focus: Focus on the specific details about Morgan, Kamas Valley, and Heber City/Midway regarding their character, housing, commute specifics to U of U, demographics, and vegetation. 
Summary: Document discusses potential rural living situations near University of Utah, comparing Morgan, Kamas Valley (Oakley, Kamas, Francis), and Heber City/Midway in terms of geography, community, housing, transportation, and demographics.", + "e14_prod_preds": 14, + "e14_cascade_preds": 13, + "e14_delta_preds": -1, + "e14_prod_edges": 28, + "e14_cascade_edges": 30, + "e14_delta_edges": 2 + }, + { + "name": "ChatGPT: SEC coaches with OSU ties", + "bucket": "high", + "taxfree_metadata": { + "error": { + "message": "An error occurred", + "details": [ + { + "title": "OEmbed request error", + "url": "https://api.twitter.com/1.1/oembed?url=https%3A%2F%2Fmobile.twitter.com%2Fstatus%2F1657087050422960128", + "message": "Failed to load external media.\nThis is likely not a stable URL for an oEmbed." + } + ] + } + }, + "taxfree_orientation": "Active frames: . Frame relationships: Extraction focus: Summary: ", + "e14_prod_preds": 5, + "e14_cascade_preds": 4, + "e14_delta_preds": -1, + "e14_prod_edges": 25, + "e14_cascade_edges": 24, + "e14_delta_edges": -1 + }, + { + "name": "Claude: Bonding ASA 3D printed parts", + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Bonding", + "Material_Selection", + "3D_Printed_Parts", + "Adhesives" + ], + "frame_relationships": "The frames 'Bonding' and 'Material_Selection' are active, with the latter providing context for the former by discussing suitable adhesives for bonding 3D printed parts.", + "extraction_orientation": "Focus on the properties, application areas, and relative strength of the discussed adhesives in relation to ASA 3D printed parts.", + "one_sentence_summary": "This document provides recommendations on various adhesives for bonding ASA 3D printed parts." + }, + "taxfree_orientation": "Active frames: Bonding, Material_Selection, 3D_Printed_Parts, Adhesives. 
Frame relationships: The frames 'Bonding' and 'Material_Selection' are active, with the latter providing context for the former by discussing suitable adhesives for bonding 3D printed parts. Extraction focus: Focus on the properties, application areas, and relative strength of the discussed adhesives in relation to ASA 3D printed parts. Summary: This document provides recommendations on various adhesives for bonding ASA 3D printed parts.", + "e14_prod_preds": 5, + "e14_cascade_preds": 4, + "e14_delta_preds": -1, + "e14_prod_edges": 10, + "e14_cascade_edges": 11, + "e14_delta_edges": 1 + }, + { + "name": "ChatGPT: Title: User request summary.", + "bucket": "low", + "taxfree_metadata": { + "active_frames": [ + "Event", + "Speech", + "Hosting", + "Communication" + ], + "frame_relationships": "The 'Event' frame is the main context, within which the 'Speech', 'Hosting', and 'Communication' frames are active.", + "extraction_orientation": "Look for the relationships between the host, the event, and the speech content in the introductory text of the document.", + "one_sentence_summary": "The document revolves around a conversation about creating an introductory speech for a design week event." + }, + "taxfree_orientation": "Active frames: Event, Speech, Hosting, Communication. Frame relationships: The 'Event' frame is the main context, within which the 'Speech', 'Hosting', and 'Communication' frames are active. Extraction focus: Look for the relationships between the host, the event, and the speech content in the introductory text of the document. 
Summary: The document revolves around a conversation about creating an introductory speech for a design week event.", + "e14_prod_preds": 3, + "e14_cascade_preds": 2, + "e14_delta_preds": -1, + "e14_prod_edges": 3, + "e14_cascade_edges": 2, + "e14_delta_edges": -1 + }, + { + "name": "ChatGPT: Scholarship Recommendation Letter Tips", + "bucket": "low", + "taxfree_metadata": { + "active_frames": [ + "Recommendation", + "Education", + "Professional Skills", + "Personal Characteristics", + "Achievements", + "Future Potential" + ], + "frame_relationships": "The frames interact to provide a comprehensive assessment of the candidate's qualifications for a scholarship, focusing on both their academic and personal qualities.", + "extraction_orientation": "Relational content to focus on includes relationships between the recommender and applicant, the applicant's achievements, contributions, and potential, as well as examples supporting the description of personal traits.", + "one_sentence_summary": "This document outlines guidelines for writing a scholarship recommendation letter, covering key areas such as introduction, purpose, academic and professional qualities, personal characteristics, achievements, future potential, conclusion, and letter formatting." + }, + "taxfree_orientation": "Active frames: Recommendation, Education, Professional Skills, Personal Characteristics, Achievements, Future Potential. Frame relationships: The frames interact to provide a comprehensive assessment of the candidate's qualifications for a scholarship, focusing on both their academic and personal qualities. Extraction focus: Relational content to focus on includes relationships between the recommender and applicant, the applicant's achievements, contributions, and potential, as well as examples supporting the description of personal traits. 
Summary: This document outlines guidelines for writing a scholarship recommendation letter, covering key areas such as introduction, purpose, academic and professional qualities, personal characteristics, achievements, future potential, conclusion, and letter formatting.", + "e14_prod_preds": 2, + "e14_cascade_preds": 1, + "e14_delta_preds": -1, + "e14_prod_edges": 2, + "e14_cascade_edges": 1, + "e14_delta_edges": -1 + } + ], + "subsample_b": [ + { + "name": "ChatGPT: Job application comparison", + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Job Application", + "Education & Qualifications Comparison", + "School Leadership Profile", + "Arts Education & Facilities Management" + ], + "frame_relationships": "The frames interact by comparing the applicant's qualifications to the job requirements, peer deans' profiles, and the mission of the School of the Arts at Western.", + "extraction_orientation": "Relational content a knowledge graph extractor should look for includes comparisons between the applicant's profile and job requirements, peer dean profiles, and the School of the Arts at Western's mission.", + "one_sentence_summary": "This document discusses a job application for a Dean position in the new School of the Arts at Western, comparing the applicant's qualifications to job requirements and peer deans' profiles while emphasizing their fit with the School's mission." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 1569, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": true, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion about a job application for the dean position at Western's School of the Arts, comparing it to other similar schools and analyzing the applicant's qualifications." + } + }, + { + "name": "ChatGPT: External review for tenure", + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Tenure Review", + "Academic Evaluation", + "Peer Review" + ], + "frame_relationships": "The Tenure Review frame is the central and overarching context in which the other frames (Academic Evaluation, Peer Review) are activated.", + "extraction_orientation": "Focus on the specific criteria, standards, and expectations related to tenure review, as well as the evaluation process itself.", + "one_sentence_summary": "This document provides guidelines for an external evaluator in a tenure review process, detailing what materials to expect, how to structure the evaluation letter, and what aspects to consider in assessing research/creative activity, teaching, service, and overall recommendation." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 1963, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document provides guidelines for an external reviewer to evaluate a faculty member's record for tenure." + } + }, + { + "name": "Claude: University of Utah interview teaching example", + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "Education", + "Teaching", + "Computational Design", + "Programming for Arts", + "3D Printing" + ], + "frame_relationships": "The document discusses a university course, its curriculum, and related projects; these topics are interconnected as they all revolve around the teaching of computational design in a university setting.", + "extraction_orientation": "Look for relationships between the course content, its objectives, and practical applications, including programming languages used, projects undertaken, and equipment utilized.", + "one_sentence_summary": "The document describes a university course offering on computational design for arts, covering topics such as programming in Processing/Java, 3D printing, advanced slicer techniques, Stratasys equipment usage, and certification options." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 1624, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion about a course teaching computational techniques for 3D design at the University of Utah" + } + }, + { + "name": "ChatGPT: Starting Dropship Gun Business", + "bucket": "high", + "taxfree_metadata": { + "active_frames": [ + "GunBusinessSetup", + "FFLCompliance", + "DistributorRelationshipBuilding", + "Ecommerce" + ], + "frame_relationships": "The 'GunBusinessSetup' frame involves navigating multiple layers of compliance (FFL, zoning, and state/local laws) while building relationships with distributors and setting up an e-commerce platform.", + "extraction_orientation": "Focus on the detailed workflows for obtaining FFL, setting up compliant e-commerce store, finding a distributor, and establishing B2B credit relationship with them.", + "one_sentence_summary": "A guide detailing how to start a dropship gun business in the US, including the legal requirements, necessary setup steps, and tips for building relationships with distributors." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 2764, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document provides a detailed guide on how to set up a dropship gun business in the U.S., covering licensing, setting up compliant premises, understanding regulations, dealing with payments and e-commerce platforms, and building relationships with distributors." + } + }, + { + "name": "ChatGPT: Analyze business plan", + "bucket": "high", + "taxfree_metadata": {}, + "standard_metadata": { + "assistant_name": "aaron", + "role": "assistant", + "version": "7.1.0-alpha", + "name": "Business Plan Analysis - Employees and Contractors Only", + "messages": [ + { + "role": "system", + "content": "Here's an analysis of the business plan focusing only on the employees and contractors expenses, comparing them to the provided OpEx bands." 
+ }, + { + "role": "assistant", + "content": "First, let me load the data from the excel file and format the columns.\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom caas_jupyter_tools import display_dataframe_to_user\npath = '/mnt/data/EAS Staffing.xlsx'\ndf = pd.read_excel(path, header=9)\ndf = df.dropna(axis=1, how='all')\ndf.columns = [str(c).strip().lower().replace('\n',' ').replace(' ', '_') for c in df.columns]\n" + }, + { + "role": "assistant", + "content": "Next, let's identify the month columns and fill any missing values with 0.\n\nmonth_cols = [c for c in df.columns if c.startswith('m') and c[1:].lstrip('-').isdigit()]\nmonth_cols = [c for c in month_cols if int(c[1:]) >= 0]\nmonth_cols = sorted(month_cols, key=lambda c: int(c[1:]))\nfor c in month_cols:\ndf[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)" + }, + { + "role": "assistant", + "content": "Now, let's filter the data to only include employees and contractors (exclude subtotals/notes).\n\nmask_emp_con = df['type'].astype(str).str.lower().str.contains('employee|contract', na=False)\nempcon = df.loc[mask_emp_con].copy()" + }, + { + "role": "assistant", + "content": "Next, let's compute the total staffing costs for each month.\n\ntotals = pd.DataFrame({ 'MonthIndex': [int(c[1:]) for c in month_cols], 'Staffing_Total': [empcon[c].sum() for c in month_cols] })\n" + }, + { + "role": "assistant", + "content": "Now, let's define the years 1 and 2 masks based on the total index.\ny1 = totals['MonthIndex'] <= 11\ny2 = (totals['MonthIndex'] >= 12) & (totals['MonthIndex'] <= 23)" + }, + { + "role": "assistant", + "content": "Finally, let's compare the total staffing costs to the provided OpEx bands for years 1 and 2.\n\nYear 1 (M1 - M11): Staffing_Total: $" + }, + { + "role": "assistant", + "content": "\tSum(totals.loc[y1, 'Staffing_Total']): ${:,.0f}\n\nYear 2 (M12 - M24): Staffing_Total: $" + }, + { + "role": "assistant", + "content": "\tSum(totals.loc[y2, 
'Staffing_Total']): ${:,.0f}" + } + ] + } + }, + { + "name": "ChatGPT: Outdoor Layering Explained", + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Clothing Layering", + "Outdoor Clothing", + "Sweat Management", + "Insulation", + "Protection", + "Weather" + ], + "frame_relationships": "The frames interact by defining layers in an outdoor clothing system for managing sweat, insulation, and protection from weather conditions.", + "extraction_orientation": "Focus on the relationships between layers and how they are mixed-and-matched depending on temperature, wind, precipitation, and activity level.", + "one_sentence_summary": "The document describes a 3-layer outdoor clothing system for managing sweat, insulation, and protection from wind, rain, and snow during various activities." + }, + "standard_metadata": { + "language": "en", + "char_length": 3761, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "The document explains the concept of layering outdoor clothing for different activities and weather conditions, including base layer, mid-layer, and outer layer, as well as hand and leg layers." + } + }, + { + "name": "ChatGPT: Limits in Calculus.", + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Mathematics", + "Calculus", + "Functions", + "Limits" + ], + "frame_relationships": "The 'Mathematics' and 'Calculus' frames are active. 
The 'Functions', 'Limits', and 'Calculus' frames are interconnected, as the document discusses the concept of limits in relation to functions within calculus.", + "extraction_orientation": "Look for relational content between functions, their inputs, and the behavior of those functions as they approach certain values (limits).", + "one_sentence_summary": "The document explains the concept of limits in calculus and provides real-world examples to illustrate its use." + }, + "standard_metadata": { + "language": "en", + "char_length": 1269, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Discussion on the concept of limits in calculus and their real-world applications" + } + }, + { + "name": "ChatGPT: Academic Program Director Role", + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "academic_program", + "administration", + "faculty_management", + "student_engagement", + "curriculum_development" + ], + "frame_relationships": "The 'academic program' frame underpins the entire document, with 'administration', 'faculty management', and 'student engagement' frames supporting it. 
The 'curriculum development' frame is a core function that intersects all other frames.", + "extraction_orientation": "Look for relationships between roles (e.g., program director to faculty, students, administration), responsibilities within each role, and interactions between these responsibilities", + "one_sentence_summary": "This document outlines the key duties of an academic program director in Design and Digital Fabrication programs, focusing on managing faculty, engaging with industry partners, developing curriculum, assessing student outcomes, and fostering student success." + }, + "standard_metadata": { + "language": "en", + "char_length": 4507, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "Document describes the role and responsibilities of an Academic Program Director for Design and Digital Fabrication programs" + } + }, + { + "name": "ChatGPT: Lonely Island Poop Skit", + "bucket": "mid", + "taxfree_metadata": { + "active_frames": [ + "Music", + "Humor" + ], + "frame_relationships": "The Music frame is used to deliver comedic content within the Humor frame.", + "extraction_orientation": "Look for lyrics or song details that provide a deeper understanding of the comedic narrative.", + "one_sentence_summary": "The document discusses a Lonely Island track, 'Perfect Saturday', which is a comedic musical number about a poop-related incident during a sexual encounter." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 1069, + "primary_format": "mixed", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": false, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "conversational", + "one_sentence_summary": "A conversation about finding a specific skit from Lonely Island's albums 'Turtleneck & Chain', 'The Wack Album', or 'Incredibad'" + } + }, + { + "name": "ChatGPT: Parse Tidal playlist", + "bucket": "mid", + "taxfree_metadata": { + "active_speaker": "Assistant", + "finetuned_response": { + "source_language": "en", + "target_language": "en", + "dialog_id": 412769058, + "generated_utts": [ + { + "role": "Assistant", + "text": "The follow-up playlist for your 80s night has been carefully curated with 25 more danceable tracks that maintain the same vibe as your original list. Here it is:\n\n## \ud83e\udea9 25 More Danceable '80s Tracks (Alt-Club Vibe, Fits Your Original Playlist)\n### \ud83d\udd25 Synthpop / New Wave / Dance-Rock:\n1. **Alphaville \u2013 Forever Young**\n2. **Depeche Mode \u2013 Enjoy the Silence**\n3. **Tears for Fears \u2013 Shout** *(club remix)*\n4. **OMD \u2013 If You Leave**\n5. **The Human League \u2013 Love Action (I Believe in Love)**\n6. **Men Without Hats \u2013 Safety Dance**\n7. **Culture Club \u2013 Time (Clock of the Heart)**\n8. **A Flock of Seagulls \u2013 I Ran (So Far Away)** *(extended 12\u201d version)*\n9. **The Police \u2013 Don't Stand So Close to Me**\n10. **Pet Shop Boys \u2013 It's a Sin**" + }, + { + "role": "Assistant", + "text": "### \ud83d\udc83 Funky Post-Punk / Art-Disco Bangers:\n11. **The Stranglers \u2013 Golden Brown**\n12. 
**Salt-N-Pepa \u2013 Push It** *(early hip hop infused with disco)*\n13. **Chic \u2013 Le Freak**\n14. **Rick James \u2013 Super Freak**\n15. **The Buggles \u2013 Video Killed the Radio Star** *(early MTV staple)*" + }, + { + "role": "Assistant", + "text": "### \ud83e\uddca Sleek & Cool Euro-Dance / Underground Synth:\n16. **Kraftwerk \u2013 The Model**\n17. **Japan \u2013 Ghosts** *(haunting, synthpop ballad)*\n18. **New Order \u2013 Blue Monday (Extended 12\u201d Version)** *(one of the longest running UK singles)*\n19. **Soft Cell \u2013 Tainted Love** *(classic remake)*\n20. **Yazoo \u2013 Only You**" + }, + { + "role": "Assistant", + "text": "### \ud83c\udf00 Optional Bonus Vibes (For future expansion)\n- **Thompson Twins \u2013 Hold Me Now** *(power ballad)*\n- **ABC \u2013 The Look of Love** *(disco-infused synthpop)*\n- **Simple Minds \u2013 Alive and Kicking** *(anthemic, uplifting)*" + } + ] + } + }, + "standard_metadata": { + "error": "parse_failed", + "raw": "{\n\"query_id\": \"4b6a7e10-c39d-481f-ae4a-79e3ba6c4cdd\",\n\"results\": [\n {\n \"answer\": \"Sure, I can provide another 25 songs in the same flavor that leans more towards a darker, sexier after-midni" + } + }, + { + "name": "NO thesis proposal.pdf", + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Artist", + "Career Progression", + "Identity", + "Personal Interests", + "Product Design" + ], + "frame_relationships": "The artist's career progression is intertwined with their identity and personal interests, shaping the subject matter and style of their product designs.", + "extraction_orientation": "Focus on relationships between the artist's professional evolution, personal identity, and the creation process of their product designs.", + "one_sentence_summary": "The document discusses a graduate thesis proposal for a 3D modeler, focusing on creating whimsical products with specific subject matters: critters, fish, and poodles." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 602, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": false, + "has_technical_terminology": false, + "has_first_person": true, + "has_quotations": false + }, + "domain_class": "personal", + "one_sentence_summary": "The document discusses an artist's graduate thesis proposal for a project centered around creating whimsical product designs featuring critters, fish, and poodles." + } + }, + { + "name": "PWM.pdf", + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Pulse Width Modulation (PWM)", + "Analog to Digital Conversion", + "Arduino ADC" + ], + "frame_relationships": "The PWM and Analog to Digital Conversion frames are directly related as they both discuss the principles and applications of PWM in relation to analog-to-digital conversion, while the Arduino ADC frame provides specific context regarding the use of these concepts with an Arduino microcontroller.", + "extraction_orientation": "Look for relationships between PWM and its parameters (time, voltage levels) and their impact on Analog to Digital Conversion resolution and Arduino ADC functions", + "one_sentence_summary": "This document discusses the principles of Pulse Width Modulation, its implementation in Arduino ADC, and the relationship between PWM and analog-to-digital conversion." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 427, + "primary_format": "prose", + "structural_signals": { + "has_headings": false, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "The document discusses Pulse Width Modulation (PWM) and Analog to Digital Conversion in the context of Arduino." + } + }, + { + "name": "Will_It_Print.pdf", + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Part Design", + "Manufacturing Process", + "Material Removal", + "Surface Finish", + "Support Structures", + "Overhang Features", + "Internal Cavities", + "Tolerances", + "Stress Concentration" + ], + "frame_relationships": "The document discusses the design and manufacturing of a part, focusing on its features, material removal, surface finish, support structures, overhangs, internal cavities, tolerances, and stress concentration.", + "extraction_orientation": "Focus on the relationships between the part's design, manufacturing process, and associated features such as overhangs, internal cavities, tolerances, and stress concentration.", + "one_sentence_summary": "This document provides instructions for reducing the number of printing and prototyping failures by Joran Booth, focusing on various aspects of part design and manufacturing." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 803, + "primary_format": "structured", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": false + }, + "content_signals": { + "has_named_people": false, + "has_institutional_language": false, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "technical", + "one_sentence_summary": "This document provides instructions for reducing the number of printing and prototyping failures by Joran Booth for each category of a part to be printed" + } + }, + { + "name": "Kim Kedem Ind Study F2025 Syllabus.docx", + "bucket": "document", + "taxfree_metadata": { + "active_frames": [ + "Education", + "Project", + "Software Development", + "3D Printing" + ], + "frame_relationships": "The Education frame serves as the context for the Project and Software Development frames, with the 3D Printing frame acting as a specific application of these skills.", + "extraction_orientation": "A knowledge graph extractor should focus on relationships between educational concepts, project goals, software development requirements, and 3D printing techniques.", + "one_sentence_summary": "This independent study outlines a project for the design and implementation of a custom 3D printing slicer in Rhino Grasshopper environment." 
+ }, + "standard_metadata": { + "language": "en", + "char_length": 1857, + "primary_format": "prose", + "structural_signals": { + "has_headings": true, + "has_bullet_lists": false, + "has_numbered_lists": false, + "has_tables": false, + "has_code_blocks": false, + "has_dates": true + }, + "content_signals": { + "has_named_people": true, + "has_institutional_language": true, + "has_technical_terminology": true, + "has_first_person": false, + "has_quotations": false + }, + "domain_class": "educational", + "one_sentence_summary": "This document outlines an independent study focused on designing and implementing a custom 3D printing slicer in the Rhino Grasshopper environment." + } + }, + { + "name": "Aaron Nelson Graduate Transcript.pdf", + "bucket": "document", + "taxfree_metadata": {}, + "standard_metadata": { + "Name": "Aaron Nelson", + "Student ID": "00917133", + "University": "University of Massachusetts Dartmouth", + "Address": "285 Old Westport Road N Dartmouth, MA 027472356", + "Degree Awarded": "Master of Fine Arts", + "Confer Date": "05/23/2010", + "Degree GPA": "3.885", + "Program of Study": { + "Artisanry - Jewelry/Metals": [ + { + "Course": "Graduate Seminar I", + "Grade": "B", + "Credits": "3.00", + "Points": "9.000" + }, + { + "Course": "Instructional Development", + "Grade": "B+", + "Credits": "3.00", + "Points": "9.900" + }, + { + "Course": "Graduate Seminar II", + "Grade": "A-", + "Credits": "3.00", + "Points": "11.100" + }, + { + "Course": "Graduate Standard I: Metals/Jewelry", + "Grade": "A", + "Credits": "9.00", + "Points": "36.000" + }, + { + "Course": "Methods & Theory in Art History", + "Grade": "A", + "Credits": "3.00", + "Points": "12.000" + }, + { + "Course": "Graduate Standard II: Metals/Jewelry", + "Grade": null, + "Credits": "9.00", + "Points": null + }, + { + "Course": "Visual Thesis", + "Grade": null, + "Credits": "6.00", + "Points": null + }, + { + "Course": "Written Thesis", + "Grade": null, + "Credits": "3.00", + "Points": null + }, 
+ { + "Course": "Independent Study (Arduino Microcontroller Basics)", + "Grade": "A", + "Credits": "6.00", + "Points": "24.000" + }, + { + "Course": "Independent Study (Furniture Failing)", + "Grade": "A", + "Credits": "3.00", + "Points": "12.000" + }, + { + "Course": "Visual Thesis (Continuation)", + "Grade": null, + "Credits": "6.00", + "Points": null + }, + { + "Course": "Writing Seminar", + "Grade": "A-", + "Credits": "3.00", + "Points": "11.100" + }, + { + "Course": "Thesis Report (Continuation)", + "Grade": null, + "Credits": "3.00", + "Points": null + }, + { + "Course": "Transcript Note: Lessons in Friction and Torque or: How I learned to Stop Worrying and Love the Information Age (Continuation)", + "Grade": null, + "Credits": null, + "Points": null + } + ] + }, + "Non Credit Courses": [ + { + "Course": "Metals Open Studio", + "Credits": null, + "Points": null + } + ] + } + } + ] +} \ No newline at end of file diff --git a/experiments/e4_aaron_ratings.json b/experiments/e4_aaron_ratings.json new file mode 100644 index 0000000..cf2a0cc --- /dev/null +++ b/experiments/e4_aaron_ratings.json @@ -0,0 +1,122 @@ +[ + { + "name": "Claude: Modern website redesign with portfolio and contact form", + "category": "recent_conversation", + "score": 2, + "rationale": "i was just talking about my website nothing emotionally charge" + }, + { + "name": "Claude: University of Utah interview teaching example", + "category": "recent_conversation", + "score": 4, + "rationale": "This is about the utah job which was recent" + }, + { + "name": "Claude: Preparing for dinner with Jim Agutter", + "category": "recent_conversation", + "score": 5, + "rationale": "this was dinner about my recent interview that i was nervous for" + }, + { + "name": "Claude: Realtor requirements for home buying in New York", + "category": "recent_conversation", + "score": 4, + "rationale": "this is something that is an axienty of mine recently" + }, + { + "name": "Claude: Memory", + "category": 
"recent_conversation", + "score": 5, + "rationale": "This is a lot of bigraphical info about me and that seems to surface in dreams" + }, + { + "name": "ChatGPT: Website architecture options", + "category": "mid_conversation", + "score": 2, + "rationale": "more tehcnical discussion about he website" + }, + { + "name": "ChatGPT: Meat and longevity tips", + "category": "mid_conversation", + "score": 1, + "rationale": "I think this was ajoke convo i had about monkey meat, but who knows mayeb there is something there subconciouslly" + }, + { + "name": "ChatGPT: Funk and jazz list", + "category": "mid_conversation", + "score": 2, + "rationale": "Jsut a music recommendation playlist" + }, + { + "name": "ChatGPT: Variants for different SKUs", + "category": "mid_conversation", + "score": 2, + "rationale": "technical discussion about something to do with teh mossygear site" + }, + { + "name": "ChatGPT: Hip hop weed songs", + "category": "mid_conversation", + "score": 2, + "rationale": "more music recomendations" + }, + { + "name": "ChatGPT: Anvil Jeep with Wheels", + "category": "early_conversation", + "score": 3, + "rationale": "this was a image i generated to see what the jeep woudl look like with white wwagon wheels" + }, + { + "name": "ChatGPT: Octopus Beer Line Art", + "category": "early_conversation", + "score": 3, + "rationale": "this was some art i generated to demo gcode generation for class" + }, + { + "name": "ChatGPT: Monty Python's Moving On", + "category": "early_conversation", + "score": 2, + "rationale": "This was some silly convo i had for some reason" + }, + { + "name": "ChatGPT: Generate Female Brunette Image", + "category": "early_conversation", + "score": 2, + "rationale": "this was somethign where i was uploading an image of myself and seeing who chat gpt though i should be with, im batting out of my league in real life" + }, + { + "name": "ChatGPT: New chat", + "category": "early_conversation", + "score": 2, + "rationale": "technical discussion abotu 
python" + }, + { + "name": "Bosch Ind Study SP2022 Syllabus.docx", + "category": "document", + "score": 1, + "rationale": "na iS proposal, mayshould be a 2" + }, + { + "name": "Mod02_Industries_and_Applications.pdf", + "category": "document", + "score": 2, + "rationale": "Technical materila for a course" + }, + { + "name": "CAA Workshop.pdf", + "category": "document", + "score": 2, + "rationale": "description for a workshoop i gave a number of years ago" + }, + { + "name": "GRAD_ARF_24-25.pdf", + "category": "document", + "score": 2, + "rationale": "maybe should be a 1, this is an appointment form" + }, + { + "name": "Thesis Request.pdf", + "category": "document", + "score": 2, + "rationale": "mayeba 1 this is another form" + } +] \ No newline at end of file diff --git a/experiments/e4_comparison.json b/experiments/e4_comparison.json new file mode 100644 index 0000000..645d8a2 --- /dev/null +++ b/experiments/e4_comparison.json @@ -0,0 +1,208 @@ +{ + "records": [ + { + "name": "Claude: Modern website redesign with portfolio and contact form", + "aaron": 2, + "haiku": 5, + "diff": 3, + "exact": false, + "within1": false, + "direction": false, + "category": "recent_conversation" + }, + { + "name": "Claude: University of Utah interview teaching example", + "aaron": 4, + "haiku": 4, + "diff": 0, + "exact": true, + "within1": true, + "direction": true, + "category": "recent_conversation" + }, + { + "name": "Claude: Preparing for dinner with Jim Agutter", + "aaron": 5, + "haiku": 4, + "diff": 1, + "exact": false, + "within1": true, + "direction": true, + "category": "recent_conversation" + }, + { + "name": "Claude: Realtor requirements for home buying in New York", + "aaron": 4, + "haiku": 3, + "diff": 1, + "exact": false, + "within1": true, + "direction": true, + "category": "recent_conversation" + }, + { + "name": "Claude: Memory", + "aaron": 5, + "haiku": 4, + "diff": 1, + "exact": false, + "within1": true, + "direction": true, + "category": "recent_conversation" + }, 
+ { + "name": "ChatGPT: Website architecture options", + "aaron": 2, + "haiku": 4, + "diff": 2, + "exact": false, + "within1": false, + "direction": false, + "category": "mid_conversation" + }, + { + "name": "ChatGPT: Meat and longevity tips", + "aaron": 1, + "haiku": 2, + "diff": 1, + "exact": false, + "within1": true, + "direction": true, + "category": "mid_conversation" + }, + { + "name": "ChatGPT: Funk and jazz list", + "aaron": 2, + "haiku": 3, + "diff": 1, + "exact": false, + "within1": true, + "direction": false, + "category": "mid_conversation" + }, + { + "name": "ChatGPT: Variants for different SKUs", + "aaron": 2, + "haiku": 5, + "diff": 3, + "exact": false, + "within1": false, + "direction": false, + "category": "mid_conversation" + }, + { + "name": "ChatGPT: Hip hop weed songs", + "aaron": 2, + "haiku": 1, + "diff": 1, + "exact": false, + "within1": true, + "direction": true, + "category": "mid_conversation" + }, + { + "name": "ChatGPT: Anvil Jeep with Wheels", + "aaron": 3, + "haiku": 2, + "diff": 1, + "exact": false, + "within1": true, + "direction": false, + "category": "early_conversation" + }, + { + "name": "ChatGPT: Octopus Beer Line Art", + "aaron": 3, + "haiku": 3, + "diff": 0, + "exact": true, + "within1": true, + "direction": true, + "category": "early_conversation" + }, + { + "name": "ChatGPT: Monty Python's Moving On", + "aaron": 2, + "haiku": 2, + "diff": 0, + "exact": true, + "within1": true, + "direction": true, + "category": "early_conversation" + }, + { + "name": "ChatGPT: Generate Female Brunette Image", + "aaron": 2, + "haiku": 2, + "diff": 0, + "exact": true, + "within1": true, + "direction": true, + "category": "early_conversation" + }, + { + "name": "ChatGPT: New chat", + "aaron": 2, + "haiku": 3, + "diff": 1, + "exact": false, + "within1": true, + "direction": false, + "category": "early_conversation" + }, + { + "name": "Bosch Ind Study SP2022 Syllabus.docx", + "aaron": 1, + "haiku": 2, + "diff": 1, + "exact": false, + "within1": 
true, + "direction": true, + "category": "document" + }, + { + "name": "Mod02_Industries_and_Applications.pdf", + "aaron": 2, + "haiku": 3, + "diff": 1, + "exact": false, + "within1": true, + "direction": false, + "category": "document" + }, + { + "name": "CAA Workshop.pdf", + "aaron": 2, + "haiku": 4, + "diff": 2, + "exact": false, + "within1": false, + "direction": false, + "category": "document" + }, + { + "name": "GRAD_ARF_24-25.pdf", + "aaron": 2, + "haiku": 1, + "diff": 1, + "exact": false, + "within1": true, + "direction": true, + "category": "document" + }, + { + "name": "Thesis Request.pdf", + "aaron": 2, + "haiku": 1, + "diff": 1, + "exact": false, + "within1": true, + "direction": true, + "category": "document" + } + ], + "exact": 4, + "within1": 16, + "direction": 12, + "total": 20 +} \ No newline at end of file diff --git a/experiments/e4_llm_ratings.json b/experiments/e4_llm_ratings.json new file mode 100644 index 0000000..de74fa0 --- /dev/null +++ b/experiments/e4_llm_ratings.json @@ -0,0 +1,158 @@ +[ + { + "name": "Claude: Modern website redesign with portfolio and contact form", + "category": "recent_conversation", + "score": null, + "error": "Expecting value: line 1 column 1 (char 0)" + }, + { + "name": "Claude: University of Utah interview teaching example", + "category": "recent_conversation", + "score": null, + "error": "Expecting value: line 1 column 1 (char 0)" + }, + { + "name": "Claude: Preparing for dinner with Jim Agutter", + "category": "recent_conversation", + "score": null, + "error": "Expecting value: line 1 column 1 (char 0)" + }, + { + "name": "Claude: Modern website redesign with portfolio and contact form", + "category": "recent_conversation", + "score": null, + "error": "Expecting value: line 1 column 1 (char 0)" + }, + { + "name": "Claude: University of Utah interview teaching example", + "category": "recent_conversation", + "score": null, + "error": "Expecting value: line 1 column 1 (char 0)" + }, + { + "name": "Claude: 
Preparing for dinner with Jim Agutter", + "category": "recent_conversation", + "score": null, + "error": "Expecting value: line 1 column 1 (char 0)" + }, + { + "name": "Claude: Modern website redesign with portfolio and contact form", + "category": "recent_conversation", + "score": 5, + "rationale": "Aaron experiences a genuine moment of design literacy (recognizing Cargo's ubiquity as system constraint, not aesthetic choice) combined with professional identity tension\u2014whether his website should signal computational/fabrication expertise distinctly or accept a convention that works\u2014with active Claude collaboration on resolution, making this a salient crossroads for how he presents himself professionally." + }, + { + "name": "Claude: University of Utah interview teaching example", + "category": "recent_conversation", + "score": 4, + "rationale": "Aaron is actively preparing teaching materials and interview content, and Claude's validation that his Twitter Printer project is relevant to his presentation demonstrates a moment of clarification about the broader significance of his work\u2014worth remembering as he positions his practice and teaching philosophy." + }, + { + "name": "Claude: Preparing for dinner with Jim Agutter", + "category": "recent_conversation", + "score": 4, + "rationale": "Strategic professional moment with high-stakes interview preparation details and interpersonal intelligence about a key decision-maker, but the conversation is cut off before completion and represents tactical rather than recurring concern." + }, + { + "name": "Claude: Realtor requirements for home buying in New York", + "category": "recent_conversation", + "score": 3, + "rationale": "Practical research on a real estate transaction Aaron is actively considering, with useful state-specific legal distinctions, but lacking emotional weight or unresolved creative/professional concerns that would warrant higher salience." 
+ }, + { + "name": "Claude: Memory", + "category": "recent_conversation", + "score": 4, + "rationale": "This is comprehensive institutional and personal context that Claude has synthesized about Aaron, documenting his current career inflection point, teaching innovation, emotional geography, and creative practice\u2014useful reference material but meta-documentation rather than a discrete salient episode." + }, + { + "name": "ChatGPT: Website architecture options", + "category": "mid_conversation", + "score": 4, + "rationale": "Salient working material: Aaron is actively exploring a concrete product-focused problem (modern e-commerce site with non-technical CMS access) that intersects his core competencies in design, fabrication tools, and program building, and the response captures decision-critical architectural patterns he'll likely reference or iterate on." + }, + { + "name": "ChatGPT: Meat and longevity tips", + "category": "mid_conversation", + "score": 2, + "rationale": "Routine health information gathering with no evident personal stakes, unresolved tensions, or creative/practice implications for Aaron's work." + }, + { + "name": "ChatGPT: Funk and jazz list", + "category": "mid_conversation", + "score": 3, + "rationale": "Practical music curation task with clear execution\u2014useful reference material for Aaron's listening but lacking the conceptual depth, unresolved questions, or creative friction that would warrant higher salience." + }, + { + "name": "ChatGPT: Variants for different SKUs", + "category": "mid_conversation", + "score": 5, + "rationale": "Aaron is confronting a core product architecture decision (SKU management for size variants) that directly impacts his e-commerce system design, with concrete implementation guidance that resolves an unresolved blocker he explicitly stated." 
+ }, + { + "name": "ChatGPT: Hip hop weed songs", + "category": "mid_conversation", + "score": 1, + "rationale": "Routine music playlist generation with no personal stakes, unresolved questions, emotional weight, or design/fabrication relevance to Aaron's core practice." + }, + { + "name": "ChatGPT: Anvil Jeep with Wheels", + "category": "early_conversation", + "score": 2, + "rationale": "A routine exploratory conversation about vehicle aesthetics and image generation with minor iterative refinement, lacking deeper implications for Aaron's design thinking or unresolved creative concerns." + }, + { + "name": "ChatGPT: Octopus Beer Line Art", + "category": "early_conversation", + "score": 3, + "rationale": "A working iteration on visual design refinement\u2014Aaron is exploring the balance between complexity and clarity in vector illustration, which is directly relevant to his fabrication and design practice, but lacks the emotional weight or unresolved tension that would elevate it to higher salience." + }, + { + "name": "ChatGPT: Monty Python's Moving On", + "category": "early_conversation", + "score": 2, + "rationale": "Standard reference material about Monty Python sketches that doesn't reveal Aaron's specific interests, creative directions, or unresolved questions worth surfacing later." + }, + { + "name": "ChatGPT: Generate Female Brunette Image", + "category": "early_conversation", + "score": 2, + "rationale": "Routine operational use of image generation tools for creating portrait variations with different demographic characteristics\u2014procedural and instrumental rather than revealing of Aaron's design thinking, creative challenges, or substantive concerns." 
+ }, + { + "name": "ChatGPT: New chat", + "category": "early_conversation", + "score": 3, + "rationale": "Routine technical problem-solving in Aaron's core domain (computational design/Grasshopper scripting), but the conversation is incomplete, exploratory, and lacks resolution or deeper insight that would warrant resurface." + }, + { + "name": "Bosch Ind Study SP2022 Syllabus.docx", + "category": "document", + "score": 2, + "rationale": "This is a course syllabus document containing standard institutional information (schedule, grading policies, learning outcomes) that serves as reference material rather than an episode with emotional weight, unresolved questions, or recurring professional significance." + }, + { + "name": "Mod02_Industries_and_Applications.pdf", + "category": "document", + "score": 3, + "rationale": "Routine instructional material on Stratasys applications across industries; provides useful reference taxonomy for Aaron's fabrication practice but lacks specific insights, unresolved questions, or novel connections to his work." + }, + { + "name": "CAA Workshop.pdf", + "category": "document", + "score": 4, + "rationale": "Pedagogical documentation of core technical tools (Rhino, Grasshopper, parametric design) that Aaron actively teaches and uses in his practice as a digital design researcher, with a concrete demo project that demonstrates his approach to form-finding and fabrication." + }, + { + "name": "GRAD_ARF_24-25.pdf", + "category": "document", + "score": 1, + "rationale": "This is a boilerplate administrative form with no personalized content, creative substance, or relevance to Aaron's practice as a computational designer or fabrication researcher." + }, + { + "name": "Thesis Request.pdf", + "category": "document", + "score": 1, + "rationale": "This is a boilerplate administrative form with no personal context, decision point, or relevance to Aaron's creative practice or research direction." 
+ } +] \ No newline at end of file diff --git a/experiments/tier1_migration_results.json b/experiments/tier1_migration_results.json new file mode 100644 index 0000000..6e2e8d1 --- /dev/null +++ b/experiments/tier1_migration_results.json @@ -0,0 +1,1271 @@ +{ + "started_at": "2026-04-28T19:45:27", + "completed_at": "2026-04-28T22:32:20", + "total_elapsed_s": 5541.6, + "total_elapsed_hours": 1.54, + "n_batches": 126, + "successful_episodes": 250, + "failed_episodes": 0, + "total_ingested_now": 300, + "estimated_total_cost": 30.92, + "batch_results": [ + { + "ok": true, + "status_code": 200, + "elapsed_s": 14.99, + "error": null, + "sources": [ + "Aaron AI: So, I've been working on the RNAI project, and the way I've ..." + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 36.83, + "error": null, + "sources": [ + "Aaron AI: What should I be the most excited about right now?", + "Aaron AI: I think they were just being nice to me at the u of u interv..." + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 60.94, + "error": null, + "sources": [ + "Aaron AI: I'm working on you", + "Claude: Setting up a custom OpenClaw instance" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 62.41, + "error": null, + "sources": [ + "Claude: Evaluating tenure prospects at R1 universities", + "Claude: Finding ideal rural housing near University of Utah" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 46.11, + "error": null, + "sources": [ + "Claude: Yellow hex sticker meaning", + "Claude: Take home pay comparison: Hudson Valley vs Salt Lake City" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 41.66, + "error": null, + "sources": [ + "Claude: Preparing for dinner with Jim Agutter", + "Claude: Recalling a design philosophy discussion" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 24.45, + "error": null, + "sources": [ + "Claude: Interview presentation research and preparation" + ] + }, + { + "ok": true, 
+ "status_code": 200, + "elapsed_s": 29.65, + "error": null, + "sources": [ + "Claude: Salt Lake City versus Hudson Valley comparison", + "Claude: Bonding ASA 3D printed parts" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 56.68, + "error": null, + "sources": [ + "Claude: Great Salt Lake decline and future outlook", + "Claude: Tulsa vs Salt Lake City living comparison" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 49.27, + "error": null, + "sources": [ + "Claude: SUNY school closure risk and New Paltz", + "Claude: Weighing Utah versus Oklahoma" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 43.44, + "error": null, + "sources": [ + "Claude: Modern website redesign with portfolio and contact form", + "Claude: Realtor requirements for home buying in New York" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 53.61, + "error": null, + "sources": [ + "Claude: Law enforcement career options", + "Claude: I filling out my annual report..." 
+ ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 51.95, + "error": null, + "sources": [ + "Claude: Lubbock on everything album lyrics", + "Claude: University of Utah interview teaching example" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 33.03, + "error": null, + "sources": [ + "Claude: SUNY faculty conflict of interest policies", + "Claude: Drafting an internship agreement" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 32.6, + "error": null, + "sources": [ + "Claude: Internship agreement writing help", + "Claude: AI upgrade and context memory" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 45.66, + "error": null, + "sources": [ + "Claude: Importing chat history from ChatGPT", + "ChatGPT: RMA armor discount codes" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 27.65, + "error": null, + "sources": [ + "ChatGPT: Sanity CMS overview", + "ChatGPT: Scholarship Letter Example" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 47.87, + "error": null, + "sources": [ + "ChatGPT: Scholarship Recommendation Letter Tips", + "ChatGPT: SEC coaches with OSU ties" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 16.01, + "error": null, + "sources": [ + "ChatGPT: Sink Sprayer Fitting Name", + "ChatGPT: Site layout planning" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 7.53, + "error": null, + "sources": [ + "ChatGPT: Skeleton Dunking Basketball" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 62.4, + "error": null, + "sources": [ + "ChatGPT: Song recommendations for vibe" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 127.15, + "error": null, + "sources": [ + "ChatGPT: Soulful protest playlist", + "ChatGPT: Soul music playlist ideas" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 40.95, + "error": null, + "sources": [ + "ChatGPT: Stable Diffusion Clarification", + "ChatGPT: Starting Dropship Gun 
Business" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 38.45, + "error": null, + "sources": [ + "ChatGPT: Studio Series Galvatron review", + "ChatGPT: Summarize information request" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 39.14, + "error": null, + "sources": [ + "ChatGPT: Summary of paper", + "ChatGPT: Supreme Court Marijuana Case" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 35.06, + "error": null, + "sources": [ + "ChatGPT: Tattoo Design Idea", + "ChatGPT: Teaching eval summary" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 14.02, + "error": null, + "sources": [ + "ChatGPT: Tenant rights after purchase" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 15.39, + "error": null, + "sources": [ + "ChatGPT: Testing Vector Alignment", + "ChatGPT: Title: User request summary." + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 37.45, + "error": null, + "sources": [ + "ChatGPT: Truncated Octahedron Algorithm", + "ChatGPT: Tulsa Concept Album Guide" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 80.0, + "error": null, + "sources": [ + "ChatGPT: Variants for different SKUs", + "ChatGPT: Vibes description" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 32.29, + "error": null, + "sources": [ + "ChatGPT: Video understanding limitations", + "ChatGPT: Vinyl print cut costs" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 59.69, + "error": null, + "sources": [ + "ChatGPT: Website architecture options", + "ChatGPT: Website styling changes" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 35.85, + "error": null, + "sources": [ + "ChatGPT: Write Job Cover Letter", + "ChatGPT: Political Breakdown Origins" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 43.36, + "error": null, + "sources": [ + "Cheetah-TDS.pdf", + "Circuit Intro (1).pdf" + ] + }, + { + "ok": true, + "status_code": 200, + 
"elapsed_s": 43.75, + "error": null, + "sources": [ + "Circuit Intro (1).pptx", + "Circuit Intro.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 11.94, + "error": null, + "sources": [ + "Circuit Intro.pptx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 45.42, + "error": null, + "sources": [ + "Circuits II (1).pdf", + "Circuits II.pdf", + "Circuits II.pptx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 30.7, + "error": null, + "sources": [ + "CitiBank - April.pdf", + "CitiStatemen_05-16.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 61.05, + "error": null, + "sources": [ + "Claude: Memory" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 8.02, + "error": null, + "sources": [ + "CHECKLIST DSI 2020.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 31.25, + "error": null, + "sources": [ + "ChatGPT: PowerPoint creation help", + "ChatGPT: Program response drafting" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 18.15, + "error": null, + "sources": [ + "ChatGPT: Provide DXF file" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 12.18, + "error": null, + "sources": [ + "ChatGPT: Push changes to repo", + "ChatGPT: Python __name__ Main Explanation" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 24.48, + "error": null, + "sources": [ + "ChatGPT: Python script for Grasshopper", + "ChatGPT: Rectangle Edge Offset Algorithm" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 21.24, + "error": null, + "sources": [ + "ChatGPT: Refactor Balance Logic.", + "ChatGPT: Regex for inserting letters" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 37.39, + "error": null, + "sources": [ + "ChatGPT: Remington 700 5R Gen 1", + "ChatGPT: Research Statement Restructure" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 31.48, + "error": null, + "sources": [ + "ChatGPT: Respect Individual 
Interests for Christmas", + "ChatGPT: Resume formatting and review" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 69.66, + "error": null, + "sources": [ + "ChatGPT: Revolution music" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 6.17, + "error": null, + "sources": [ + "ChatGPT: Rewriting without lists" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 76.69, + "error": null, + "sources": [ + "ChatGPT: Rhino 3D object flow", + "ChatGPT: Rhino Grasshopper Intro" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 87.13, + "error": null, + "sources": [ + "ChatGPT: Rick Ross vibe analysis", + "Undergraduate Work.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 23.13, + "error": null, + "sources": [ + "Undergraduate Work r2.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 34.73, + "error": null, + "sources": [ + "University of North Texas Cover letter.pdf", + "Unofficial Transcript.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 58.0, + "error": null, + "sources": [ + "U of A Applictaion.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 49.79, + "error": null, + "sources": [ + "UofAArt Cover Letter.docx", + "UofAArt Cover Letter.pdf", + "Useless Box code flow chart.pdf", + "Useless Box flow chart.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 743.38, + "error": null, + "sources": [ + "Utah MDD - Aaron Nelson - Copy.pptx", + "Vanessa Bosch RBDS Scholarship Recommendation.docx", + "Vanessa Bosch WYA Scholarship Recommendation.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 24.89, + "error": null, + "sources": [ + "Vernon Byron Ind Study F2024 Syllabus.docx", + "Vernon Byron Ind Study F2024 Syllabus.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 9.64, + "error": null, + "sources": [ + "Vevor 30L Ultrasonic.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + 
"elapsed_s": 23.41, + "error": null, + "sources": [ + "Vevor Pressure pot.pdf", + "Vevor Vaccum Chamber.pdf", + "Voltage Divider.pdf", + "VT Cover.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 1.75, + "error": null, + "sources": [ + "VU.LID.2.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 31.76, + "error": null, + "sources": [ + "Wearable Marquees.pptx", + "Wearable Marquees uw4.pptx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 18.28, + "error": null, + "sources": [ + "Week 10 Code Blocks.txt" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 16.13, + "error": null, + "sources": [ + "Week 10 Code Blocks Updated.txt", + "Week 3 Code Blocks.txt" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 16.04, + "error": null, + "sources": [ + "Week 7 Code Blocks - Images.txt" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 41.29, + "error": null, + "sources": [ + "weird bios.docx", + "What is Computational Media.pptx", + "Will_It_Print.pdf", + "Working Job Posting - Lab Tech.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 50.32, + "error": null, + "sources": [ + "wrong.pptx", + "Youngstown State University.docx", + "Susan Arduino IS.docx", + "Syracuse Cover.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 20.73, + "error": null, + "sources": [ + "TA Appointment Request Form - 20-21 - fillable.pdf", + "TAGA Appointment Request Form - 20-21 - fillable.pdf", + "TA Performance Program.docx", + "TA Performance Program.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 31.17, + "error": null, + "sources": [ + "Taylor Varricchio Ind Study Sp 2019 Syllabus.docx", + "Teaching Philosophy.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 7.66, + "error": null, + "sources": [ + "Teaching Philosophy.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 31.8, + "error": null, + 
"sources": [ + "Team_4_Abstract_F21.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 33.18, + "error": null, + "sources": [ + "Team Rocket Final Presentation.pdf", + "Team Rocket Final Presentation.pptx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 35.0, + "error": null, + "sources": [ + "Teching Evals Rev2.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 5.21, + "error": null, + "sources": [ + "test-watcher.md.md", + "text_parsing.txt" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 36.08, + "error": null, + "sources": [ + "The Bounds of Cognition -- Frederick Adams and Kenneth Aizawa.pdf", + "The E-myth revisited _ why most small businesses don't work -- Michael E_ Gerber.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 61.74, + "error": null, + "sources": [ + "The Extended Mind _ The Power of Thinking Outside the Brain -- Annie Murphy Paul.pdf", + "The Nature and Art of Workmanship -- David Pye.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 33.79, + "error": null, + "sources": [ + "The Phenomenon of Life_ Nature of Order, Book 1_ An Essay -- Christopher W_ Alexander.pdf", + "The Poetics of Space -- Gaston Bachelard translated from the French by Maria Jolas -- First Edition, 1994.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 67.47, + "error": null, + "sources": [ + "The role of active inference in conscious awareness -- Jonathan Edward Robinson & Andrew W_ Corcoran & Christopher.pdf", + "thesis documentation .pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 12.35, + "error": null, + "sources": [ + "Thesis Paper Guidlines.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 31.76, + "error": null, + "sources": [ + "Thesis proposal.docx", + "Thesis proposal.pdf", + "Thesis Request.pdf", + "ThesisWriting10Things.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 10.8, + 
"error": null, + "sources": [ + "ThesisWritingWeek1.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 42.24, + "error": null, + "sources": [ + "The world beyond your head _ on becoming an individual in an -- Crawford, Matthew B.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 13.38, + "error": null, + "sources": [ + "This is the text that I am placing in here.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 17.44, + "error": null, + "sources": [ + "Tiger Statement.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 29.22, + "error": null, + "sources": [ + "tinkercad bad.pptx", + "tinkercad_Intro3D.pptx", + "TMS_2022_Abstract_JustinBoswell.docx", + "TMS_2022_Abstract_JustinBoswell.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 16.91, + "error": null, + "sources": [ + "Todd Faulls Ind Study F2024 Syllabus.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 21.33, + "error": null, + "sources": [ + "Todd Faulls Ind Study F2024 Syllabus.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 28.03, + "error": null, + "sources": [ + "TR Performance Program.docx", + "Twitter Printer Statement.docx", + "UARTS2 Cover.docx", + "UARTS Cover Letter.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 64.76, + "error": null, + "sources": [ + "ulysses.txt", + "nelsonaAR20 .docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 68.99, + "error": null, + "sources": [ + "nelsonaAR20.pdf", + "Nelson_Aaron_ACAD.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 7.69, + "error": null, + "sources": [ + "Nelson_Aaron-Artist_Statement.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 42.8, + "error": null, + "sources": [ + "Nelson_Aaron-Intent_Letter.docx", + "Nelson_Aaron-Intent_Letter.pdf", + "Nelson_Aaron-Media.docx", + "Nelson_Aaron-Media.pdf" + ] + }, + { + "ok": true, + "status_code": 
200, + "elapsed_s": 12.12, + "error": null, + "sources": [ + "Nelson_Aaron-References.docx", + "Nelson_Aaron-References.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 53.81, + "error": null, + "sources": [ + "Nelson_Aaron-Resume.pdf", + "Nelson_Aaron_RTPCC_Response.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 19.92, + "error": null, + "sources": [ + "Nelson_Aaron_RTPCC_ResponseEDIT.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 6.14, + "error": null, + "sources": [ + "Nelson_Aaron-Sample_Course.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 64.37, + "error": null, + "sources": [ + "Nelson_Aaron_UArts.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 14.15, + "error": null, + "sources": [ + "NelsonA_Dell_Monitors.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 53.32, + "error": null, + "sources": [ + "nelsonaDSI20.pdf", + "nelson_application.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 60.07, + "error": null, + "sources": [ + "NelsonA_Surface_Pro.pdf", + "Nelson CV.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 3.2, + "error": null, + "sources": [ + "Nelson_PD_Purchase_Requisition Amazon.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 16.98, + "error": null, + "sources": [ + "Nelson_PD_Purchase_Requisition Foredom.pdf", + "nelson_personalwork.pdf", + "nelson_studentwork.pdf", + "New Code Blocks.txt" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 32.95, + "error": null, + "sources": [ + "New Mexico Cover Letter.docx", + "NewPaltz_DDF_Cover_Letter.docx", + "New Paltz DDMF.pptx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 29.25, + "error": null, + "sources": [ + "New Text Document.txt", + "next-experiment-proposal-base-class-validation.md" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 23.06, + "error": null, + 
"sources": [ + "Nic Oconnor Field Work F2023 Syllabus.docx", + "Nic Oconnor Field Work F2023 Syllabus.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 28.18, + "error": null, + "sources": [ + "Nic Oconnor Ind Study F2023 Syllabus.docx", + "Nic Oconnor Ind Study F2023 Syllabus.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 21.0, + "error": null, + "sources": [ + "Nic Oconnor Ind Study S2024 Syllabus.docx", + "Nic Oconnor Ind Study S2024 Syllabus.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 26.46, + "error": null, + "sources": [ + "nic thesis request with both signatures.pdf", + "NinjaFlex-TDS.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 19.48, + "error": null, + "sources": [ + "Northern Arizona Cover letter.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 27.38, + "error": null, + "sources": [ + "notes.txt", + "NO thesis proposal.docx", + "NO thesis proposal.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 40.34, + "error": null, + "sources": [ + "NOVO complete.pdf", + "Novo report.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 26.37, + "error": null, + "sources": [ + "Novo report.pdf", + "NUGENT GRAD_ARF_24-25.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 28.51, + "error": null, + "sources": [ + "NYSMTP 3D PrintingMINIcourseSPRING2015.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 17.76, + "error": null, + "sources": [ + "Oakton Community College Cover.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 18.15, + "error": null, + "sources": [ + "Objects and Arrays no HW.pptx", + "Objects and Arrays.pptx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 48.47, + "error": null, + "sources": [ + "Objects.txt", + "OBrother.txt" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 18.0, + "error": null, + "sources": [ + "Oklahoma 
State Cover Letter.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 24.39, + "error": null, + "sources": [ + "Oklahoma State Diversity Statement.docx", + "Oliva Lecture Intro.docx", + "Olivia.pdf", + "On Pages one and two there is mentioned that.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 12.79, + "error": null, + "sources": [ + "Outline for 3D Printed Materials for Foundry Casting.docx", + "Outreach and Education.pdf", + "Paddle.txt" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 43.26, + "error": null, + "sources": [ + "paper-track-positioning-note.md" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 34.7, + "error": null, + "sources": [ + "Parametric Representation of a Curve.pptx", + "Parametric Surfaces.pptx", + "Part Translucency - EN FDM Best Practice.pdf", + "People Who Can Talk.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 34.27, + "error": null, + "sources": [ + "Perazza Independent Study 38-031 1_21_2022_Signed.pdf", + "Personal Work 2019.pdf", + "Personal Work 2019.pptx", + "Personal Work.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 14.66, + "error": null, + "sources": [ + "Personal Work.pptx", + "Petroski_Final_ppt.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 5.75, + "error": null, + "sources": [ + "Philosophy of Teaching.docx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 31.6, + "error": null, + "sources": [ + "Philosophy of the expansion of the DDF minor.docx", + "Piezo.pdf", + "Piezo.pptx", + "Pitch Presentation - Design Intents.pdf" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 20.62, + "error": null, + "sources": [ + "Pixel Array intro.txt", + "Polar Coordinates (1).pdf", + "Polar Coordinates.pdf", + "Polar Coordinates.pptx" + ] + }, + { + "ok": true, + "status_code": 200, + "elapsed_s": 10.67, + "error": null, + "sources": [ + "Pong Math.pdf", + "Pong 
Math.pptx", + "Portfolio1.pdf", + "Portfolio2.pdf" + ] + } + ] +} \ No newline at end of file diff --git a/experiments/tier1_migration_state.json b/experiments/tier1_migration_state.json new file mode 100644 index 0000000..1ab239a --- /dev/null +++ b/experiments/tier1_migration_state.json @@ -0,0 +1,272 @@ +{ + "ingested": [ + "Aaron AI: I think they were just being nice to me at the u of u interv...", + "Aaron AI: I'm working on you", + "Aaron AI: So, I've been working on the RNAI project, and the way I've ...", + "Aaron AI: What should I be the most excited about right now?", + "CHECKLIST DSI 2020.pdf", + "ChatGPT: Monty Python's Moving On", + "ChatGPT: Mortars Tear Gas Usage", + "ChatGPT: Move to Rocky Mountains", + "ChatGPT: Movie Quote Clarification", + "ChatGPT: Name Information Request", + "ChatGPT: New chat", + "ChatGPT: No-subscription security cameras", + "ChatGPT: Octopus Beer Line Art", + "ChatGPT: Outdoor Layering Explained", + "ChatGPT: Political Breakdown Origins", + "ChatGPT: PowerPoint creation help", + "ChatGPT: Program response drafting", + "ChatGPT: Provide DXF file", + "ChatGPT: Push changes to repo", + "ChatGPT: Python __name__ Main Explanation", + "ChatGPT: Python script for Grasshopper", + "ChatGPT: RMA armor discount codes", + "ChatGPT: Rectangle Edge Offset Algorithm", + "ChatGPT: Refactor Balance Logic.", + "ChatGPT: Regex for inserting letters", + "ChatGPT: Remington 700 5R Gen 1", + "ChatGPT: Research Statement Restructure", + "ChatGPT: Respect Individual Interests for Christmas", + "ChatGPT: Resume formatting and review", + "ChatGPT: Revolution music", + "ChatGPT: Rewriting without lists", + "ChatGPT: Rhino 3D object flow", + "ChatGPT: Rhino Grasshopper Intro", + "ChatGPT: Rick Ross vibe analysis", + "ChatGPT: SEC coaches with OSU ties", + "ChatGPT: Sanity CMS overview", + "ChatGPT: Scholarship Letter Example", + "ChatGPT: Scholarship Recommendation Letter Tips", + "ChatGPT: Sink Sprayer Fitting Name", + "ChatGPT: Site layout 
planning", + "ChatGPT: Skeleton Dunking Basketball", + "ChatGPT: Song recommendations for vibe", + "ChatGPT: Soul music playlist ideas", + "ChatGPT: Soulful protest playlist", + "ChatGPT: Stable Diffusion Clarification", + "ChatGPT: Starting Dropship Gun Business", + "ChatGPT: Studio Series Galvatron review", + "ChatGPT: Summarize information request", + "ChatGPT: Summary of paper", + "ChatGPT: Supreme Court Marijuana Case", + "ChatGPT: Tattoo Design Idea", + "ChatGPT: Teaching eval summary", + "ChatGPT: Tenant rights after purchase", + "ChatGPT: Testing Vector Alignment", + "ChatGPT: Title: User request summary.", + "ChatGPT: Truncated Octahedron Algorithm", + "ChatGPT: Tulsa Concept Album Guide", + "ChatGPT: Variants for different SKUs", + "ChatGPT: Vibes description", + "ChatGPT: Video understanding limitations", + "ChatGPT: Vinyl print cut costs", + "ChatGPT: Website architecture options", + "ChatGPT: Website styling changes", + "ChatGPT: Write Job Cover Letter", + "ChatGPT: npm install vulnerabilities explained", + "Cheetah-TDS.pdf", + "Circuit Intro (1).pdf", + "Circuit Intro (1).pptx", + "Circuit Intro.pdf", + "Circuit Intro.pptx", + "Circuits II (1).pdf", + "Circuits II.pdf", + "Circuits II.pptx", + "CitiBank - April.pdf", + "CitiStatemen_05-16.pdf", + "Claude: AI upgrade and context memory", + "Claude: Art jewelry discourse and contemporary trends", + "Claude: Bonding ASA 3D printed parts", + "Claude: Drafting an internship agreement", + "Claude: Evaluating tenure prospects at R1 universities", + "Claude: Finding ideal rural housing near University of Utah", + "Claude: Great Salt Lake decline and future outlook", + "Claude: I filling out my annual report...", + "Claude: Importing chat history from ChatGPT", + "Claude: Internship agreement writing help", + "Claude: Interview presentation research and preparation", + "Claude: Law enforcement career options", + "Claude: Lubbock on everything album lyrics", + "Claude: Memory", + "Claude: Modern website 
redesign with portfolio and contact form", + "Claude: Preparing for dinner with Jim Agutter", + "Claude: Realtor requirements for home buying in New York", + "Claude: Recalling a design philosophy discussion", + "Claude: SUNY faculty conflict of interest policies", + "Claude: SUNY school closure risk and New Paltz", + "Claude: Salt Lake City versus Hudson Valley comparison", + "Claude: Setting up a custom OpenClaw instance", + "Claude: Take home pay comparison: Hudson Valley vs Salt Lake City", + "Claude: Tulsa vs Salt Lake City living comparison", + "Claude: University of Utah interview teaching example", + "Claude: Weighing Utah versus Oklahoma", + "Claude: Yellow hex sticker meaning", + "NO thesis proposal.docx", + "NO thesis proposal.pdf", + "NOVO complete.pdf", + "NUGENT GRAD_ARF_24-25.pdf", + "NYSMTP 3D PrintingMINIcourseSPRING2015.pdf", + "Nelson CV.pdf", + "NelsonA_Dell_Monitors.pdf", + "NelsonA_Surface_Pro.pdf", + "Nelson_Aaron-Artist_Statement.pdf", + "Nelson_Aaron-Intent_Letter.docx", + "Nelson_Aaron-Intent_Letter.pdf", + "Nelson_Aaron-Media.docx", + "Nelson_Aaron-Media.pdf", + "Nelson_Aaron-References.docx", + "Nelson_Aaron-References.pdf", + "Nelson_Aaron-Resume.pdf", + "Nelson_Aaron-Sample_Course.pdf", + "Nelson_Aaron_ACAD.pdf", + "Nelson_Aaron_RTPCC_Response.docx", + "Nelson_Aaron_RTPCC_ResponseEDIT.docx", + "Nelson_Aaron_UArts.pdf", + "Nelson_PD_Purchase_Requisition Amazon.pdf", + "Nelson_PD_Purchase_Requisition Foredom.pdf", + "New Code Blocks.txt", + "New Mexico Cover Letter.docx", + "New Paltz DDMF.pptx", + "New Text Document.txt", + "NewPaltz_DDF_Cover_Letter.docx", + "Nic Oconnor Field Work F2023 Syllabus.docx", + "Nic Oconnor Field Work F2023 Syllabus.pdf", + "Nic Oconnor Ind Study F2023 Syllabus.docx", + "Nic Oconnor Ind Study F2023 Syllabus.pdf", + "Nic Oconnor Ind Study S2024 Syllabus.docx", + "Nic Oconnor Ind Study S2024 Syllabus.pdf", + "NinjaFlex-TDS.pdf", + "Northern Arizona Cover letter.pdf", + "Novo report.docx", + "Novo report.pdf", 
+ "OBrother.txt", + "Oakton Community College Cover.docx", + "Objects and Arrays no HW.pptx", + "Objects and Arrays.pptx", + "Objects.txt", + "Oklahoma State Cover Letter.docx", + "Oklahoma State Diversity Statement.docx", + "Oliva Lecture Intro.docx", + "Olivia.pdf", + "On Pages one and two there is mentioned that.docx", + "Outline for 3D Printed Materials for Foundry Casting.docx", + "Outreach and Education.pdf", + "Paddle.txt", + "Parametric Representation of a Curve.pptx", + "Parametric Surfaces.pptx", + "Part Translucency - EN FDM Best Practice.pdf", + "People Who Can Talk.docx", + "Perazza Independent Study 38-031 1_21_2022_Signed.pdf", + "Personal Work 2019.pdf", + "Personal Work 2019.pptx", + "Personal Work.pdf", + "Personal Work.pptx", + "Petroski_Final_ppt.pdf", + "Philosophy of Teaching.docx", + "Philosophy of the expansion of the DDF minor.docx", + "Piezo.pdf", + "Piezo.pptx", + "Pitch Presentation - Design Intents.pdf", + "Pixel Array intro.txt", + "Polar Coordinates (1).pdf", + "Polar Coordinates.pdf", + "Polar Coordinates.pptx", + "Pong Math.pdf", + "Pong Math.pptx", + "Portfolio1.pdf", + "Portfolio2.pdf", + "RodiMarkFinalProject.pdf", + "Ronald Rael.docx", + "SDS-000015_26Sep21_American English_AGHS_Ecoworks Cleaning Agent Pouch A.pdf", + "Susan Arduino IS.docx", + "Syracuse Cover.docx", + "TA Appointment Request Form - 20-21 - fillable.pdf", + "TA Performance Program.docx", + "TA Performance Program.pdf", + "TAGA Appointment Request Form - 20-21 - fillable.pdf", + "TMS_2022_Abstract_JustinBoswell.docx", + "TMS_2022_Abstract_JustinBoswell.pdf", + "TR Performance Program.docx", + "Taylor Varricchio Ind Study Sp 2019 Syllabus.docx", + "Teaching Philosophy.docx", + "Teaching Philosophy.pdf", + "Team Rocket Final Presentation.pdf", + "Team Rocket Final Presentation.pptx", + "Team_4_Abstract_F21.pdf", + "Teching Evals Rev2.pdf", + "The Bounds of Cognition -- Frederick Adams and Kenneth Aizawa.pdf", + "The E-myth revisited _ why most small businesses 
don't work -- Michael E_ Gerber.pdf", + "The Extended Mind _ The Power of Thinking Outside the Brain -- Annie Murphy Paul.pdf", + "The Nature and Art of Workmanship -- David Pye.pdf", + "The Phenomenon of Life_ Nature of Order, Book 1_ An Essay -- Christopher W_ Alexander.pdf", + "The Poetics of Space -- Gaston Bachelard translated from the French by Maria Jolas -- First Edition, 1994.pdf", + "The role of active inference in conscious awareness -- Jonathan Edward Robinson & Andrew W_ Corcoran & Christopher.pdf", + "The world beyond your head _ on becoming an individual in an -- Crawford, Matthew B.pdf", + "Thesis Paper Guidlines.pdf", + "Thesis Request.pdf", + "Thesis proposal.docx", + "Thesis proposal.pdf", + "ThesisWriting10Things.docx", + "ThesisWritingWeek1.docx", + "This is the text that I am placing in here.docx", + "Tiger Statement.docx", + "Todd Faulls Ind Study F2024 Syllabus.docx", + "Todd Faulls Ind Study F2024 Syllabus.pdf", + "Twitter Printer Statement.docx", + "U of A Applictaion.pdf", + "UARTS Cover Letter.docx", + "UARTS2 Cover.docx", + "Undergraduate Work r2.pdf", + "Undergraduate Work.pdf", + "University of North Texas Cover letter.pdf", + "Unofficial Transcript.pdf", + "UofAArt Cover Letter.docx", + "UofAArt Cover Letter.pdf", + "Useless Box code flow chart.pdf", + "Useless Box flow chart.pdf", + "Utah MDD - Aaron Nelson - Copy.pptx", + "VT Cover.docx", + "VU.LID.2.pdf", + "Vanessa Bosch RBDS Scholarship Recommendation.docx", + "Vanessa Bosch WYA Scholarship Recommendation.docx", + "Vernon Byron Ind Study F2024 Syllabus.docx", + "Vernon Byron Ind Study F2024 Syllabus.pdf", + "Vevor 30L Ultrasonic.pdf", + "Vevor Pressure pot.pdf", + "Vevor Vaccum Chamber.pdf", + "Voltage Divider.pdf", + "Wearable Marquees uw4.pptx", + "Wearable Marquees.pptx", + "Week 10 Code Blocks Updated.txt", + "Week 10 Code Blocks.txt", + "Week 3 Code Blocks.txt", + "Week 7 Code Blocks - Images.txt", + "What is Computational Media.pptx", + "Will_It_Print.pdf", + "Working Job 
Posting - Lab Tech.docx", + "Youngstown State University.docx", + "nelson_application.pdf", + "nelson_personalwork.pdf", + "nelson_studentwork.pdf", + "nelsonaAR20 .docx", + "nelsonaAR20.pdf", + "nelsonaDSI20.pdf", + "next-experiment-proposal-base-class-validation.md", + "nic thesis request with both signatures.pdf", + "notes.txt", + "paper-track-positioning-note.md", + "test-watcher.md.md", + "text_parsing.txt", + "thesis documentation .pdf", + "tinkercad bad.pptx", + "tinkercad_Intro3D.pptx", + "ulysses.txt", + "weird bios.docx", + "wrong.pptx" + ], + "started_at": "2026-04-27T00:00:00", + "last_updated": "2026-04-30T01:46:11", + "total_cost_estimate": 32.362, + "count": 264 +} \ No newline at end of file diff --git a/scripts/ingest.py b/scripts/ingest.py index b476443..2d5a726 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -11,7 +11,7 @@ from docx import Document from pypdf import PdfReader from pptx import Presentation -load_dotenv(Path.home() / "aaronai" / ".env") +load_dotenv(Path.home() / "aaronai" / ".env", override=True) print("Loading embedding model...") embedder = SentenceTransformer("all-MiniLM-L6-v2") @@ -63,11 +63,34 @@ def make_id(filepath, chunk_index): path_hash = hashlib.md5(str(filepath).encode()).hexdigest()[:8] return f"{path_hash}_{chunk_index}" +def enqueue_stage2(source, full_text): + """Enqueue document for Stage 2 (Mistral orientation) → Stage 3 (Graphiti ingest). + TEMPORARY: this queue feed will be removed when pgvector is decommissioned + and the watcher calls Stage 2 directly. 
+ """ + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + INSERT INTO stage_2_queue (source, full_text, char_length) + VALUES (%s, %s, %s) + ON CONFLICT (source) DO UPDATE SET + full_text = EXCLUDED.full_text, + char_length = EXCLUDED.char_length, + enqueued_at = NOW(), + completed_at = NULL, + failed_at = NULL, + attempts = 0 + """, (source, full_text[:50000], len(full_text))) + pg.commit() + pg.close() + except Exception as e: + print(f" Stage 2 queue insert failed (non-fatal): {e}") + def ingest_file(filepath): path = Path(filepath) suffix = path.suffix.lower() - # Skip temp files if path.name.startswith("~$") or path.name.startswith("."): return 0 @@ -98,6 +121,7 @@ def ingest_file(filepath): "folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent)) } for _ in chunks] + # STAGE 1: Write to pgvector (TEMPORARY — remove when chat agent migrates to Graphiti) pg = get_pg() cur = pg.cursor() for chunk_id, chunk, embedding, meta in zip(ids, chunks, embeddings, metadatas): @@ -111,12 +135,16 @@ def ingest_file(filepath): metadata = EXCLUDED.metadata """, ( chunk_id, chunk, embedding, - meta.get('source'), 'document', None, + meta.get("source"), "document", None, json.dumps(meta) )) pg.commit() pg.close() print(f" Indexed {len(chunks)} chunks: {path.name}") + + # Enqueue for Stage 2 → Stage 3 (Graphiti pipeline) + enqueue_stage2(path.name, text) + return len(chunks) except Exception as e: diff --git a/scripts/stage2_worker.py b/scripts/stage2_worker.py new file mode 100644 index 0000000..3630e05 --- /dev/null +++ b/scripts/stage2_worker.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +""" +Stage 2 Worker — Taxonomy-Free Mistral Orientation +Polls stage_2_queue, runs Mistral taxonomy-free pass, enqueues Stage 3. 
+Runs as systemd service: aaronai-stage2.service + +Routing: +- char_length < 2000 → skip Stage 3, mark complete (sparse content, cascade no benefit) +- char_length >= 2000 → enqueue Stage 3 with orientation metadata +""" + +import os, json, time, subprocess, logging, requests +from pathlib import Path +from datetime import datetime +from dotenv import load_dotenv +import psycopg2 + +load_dotenv(Path.home() / "aaronai" / ".env", override=True) + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [stage2] %(levelname)s %(message)s", + handlers=[ + logging.StreamHandler(), + logging.FileHandler("/var/log/aaronai/stage2.log", mode="a"), + ] +) +log = logging.getLogger("stage2") + +PG_DSN = os.getenv("PG_DSN") +OLLAMA_URL = "http://localhost:11434" +HEARTBEAT_FILE = Path("/var/log/aaronai/stage2-heartbeat") +CHAR_LENGTH_THRESHOLD = 2000 +REQUEST_TIMEOUT = 300 +RETRY_ATTEMPTS = 2 +POLL_INTERVAL = 5 +WORKER_VERSION = "2.0" + +TAXFREE_PROMPT = ( + "You are a metadata extraction system. Given a document, describe its content " + "shape for use as orientation context in a knowledge graph extraction pass.\n\n" + "Do not summarize content. Do not extract entities. Do not assign a single category label.\n\n" + "Instead, describe:\n" + "- What domains or frames are active in this content (there may be several simultaneously)\n" + "- How those frames relate to each other in this specific document\n" + "- What kind of relational content a knowledge graph extractor should look for\n\n" + "Output JSON only. 
No prose, no explanation, no markdown.\n\n" + "Schema:\n" + '{"active_frames": ["", ""], ' + '"frame_relationships": "", ' + '"extraction_orientation": "", ' + '"one_sentence_summary": ""}\n\n' + "Document:\n" +) + + +def get_pg(): + return psycopg2.connect(PG_DSN) + + +def write_heartbeat(): + try: + HEARTBEAT_FILE.parent.mkdir(parents=True, exist_ok=True) + HEARTBEAT_FILE.write_text(datetime.now().isoformat()) + except Exception: + pass + + +def recover_wedge(): + log.warning("Mistral wedge detected — restarting Ollama") + subprocess.run(["sudo", "systemctl", "restart", "ollama"], capture_output=True) + time.sleep(30) + for _ in range(3): + try: + r = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10) + if r.status_code == 200: + log.info("Ollama recovered") + return True + except Exception: + time.sleep(5) + log.error("Ollama recovery failed") + return False + + +def run_mistral(doc_text): + payload = { + "model": "mistral:latest", + "prompt": TAXFREE_PROMPT + doc_text[:12000], + "stream": False, + "format": "json", + } + resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=REQUEST_TIMEOUT) + resp.raise_for_status() + raw = resp.json().get("response", "{}") + try: + return json.loads(raw) + except Exception: + return {"error": "parse_failed", "raw": raw[:200]} + + +def build_orientation(meta): + frames = ", ".join(meta.get("active_frames", [])) + rel = meta.get("frame_relationships", "") + orient = meta.get("extraction_orientation", "") + summary = meta.get("one_sentence_summary", "") + return f"Active frames: {frames}. 
def enqueue_stage3(pg, source, full_text, orientation, metadata):
    """Insert or refresh one document in ``stage_3_queue`` and commit.

    On a ``source`` conflict the existing row is overwritten and its queue
    state is reset (``enqueued_at`` set to now, ``completed_at`` and
    ``failed_at`` cleared, ``attempts`` set to 0).

    Args:
        pg: open psycopg2 connection; committed by this function.
        source: document identifier used as the conflict key.
        full_text: full document text.
        orientation: orientation string for the Stage 3 extractor.
        metadata: JSON-serialisable dict stored in ``stage2_metadata``.
    """
    with pg.cursor() as cur:
        cur.execute("""
            INSERT INTO stage_3_queue (source, full_text, orientation, stage2_metadata)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (source) DO UPDATE SET
                full_text = EXCLUDED.full_text,
                orientation = EXCLUDED.orientation,
                stage2_metadata = EXCLUDED.stage2_metadata,
                enqueued_at = NOW(),
                completed_at = NULL,
                failed_at = NULL,
                attempts = 0
        """, (source, full_text, orientation, json.dumps(metadata)))
    pg.commit()


def process_one(row):
    """Run Stage 2 for a single ``stage_2_queue`` row.

    Marks the row started (incrementing ``attempts``), then either completes
    it directly when the document is shorter than CHAR_LENGTH_THRESHOLD, or
    runs the Mistral pass and enqueues the document for Stage 3.

    Args:
        row: ``(id, source, full_text, char_length)`` tuple from the queue.

    Returns:
        True if the row was completed, False if the Mistral call failed.
        On a timeout the row is left pending for a later retry unless its
        attempts are exhausted, in which case it is marked failed. Any other
        Mistral error marks the row failed immediately.
    """
    row_id, source, full_text, char_length = row
    log.info(f"Processing: {source} ({char_length} chars)")

    pg = get_pg()
    try:
        cur = pg.cursor()

        # Mark started
        cur.execute("UPDATE stage_2_queue SET started_at = NOW(), attempts = attempts + 1 WHERE id = %s", (row_id,))
        pg.commit()

        # Routing: sparse content skips Stage 3
        if char_length < CHAR_LENGTH_THRESHOLD:
            log.info(f" Skipping Stage 3 (char_length={char_length} < {CHAR_LENGTH_THRESHOLD})")
            cur.execute("UPDATE stage_2_queue SET completed_at = NOW() WHERE id = %s", (row_id,))
            pg.commit()
            return True

        # Run Mistral
        log.info(" Running Mistral taxonomy-free pass...")
        try:
            meta = run_mistral(full_text)
        except requests.exceptions.Timeout:
            log.warning(f" Mistral timeout on {source}")
            # The row stays pending so it can be retried, but once the attempt
            # budget is spent nothing would ever pick it up again; record that
            # as a failure instead of leaving it in limbo.
            cur.execute(
                "UPDATE stage_2_queue SET failed_at = NOW(), failure_reason = %s "
                "WHERE id = %s AND attempts >= %s",
                ("timeout: retry attempts exhausted", row_id, RETRY_ATTEMPTS + 1),
            )
            pg.commit()
            return False
        except Exception as e:
            log.error(f" Mistral error on {source}: {e}")
            cur.execute("UPDATE stage_2_queue SET failed_at = NOW(), failure_reason = %s WHERE id = %s",
                        (str(e)[:500], row_id))
            pg.commit()
            return False

        if not isinstance(meta, dict):
            # Everything below treats meta as a dict.
            meta = {"error": "parse_failed"}

        # NOTE(review): a {"error": "parse_failed"} result is still enqueued
        # for Stage 3 with an empty orientation — confirm this is intended.
        frames = meta.get("active_frames", [])
        log.info(f" Frames: {frames}")

        orientation = build_orientation(meta)
        meta["_model"] = "mistral:latest"
        meta["_worker_version"] = WORKER_VERSION
        meta["_generated_at"] = datetime.now().isoformat()
        meta["char_length"] = char_length

        # Enqueue Stage 3
        enqueue_stage3(pg, source, full_text, orientation, meta)
        cur.execute("UPDATE stage_2_queue SET completed_at = NOW() WHERE id = %s", (row_id,))
        pg.commit()
        log.info(f" Enqueued Stage 3: {source}")
        return True
    finally:
        # Always release the connection, including when a query above raises.
        pg.close()
def _fetch_next_row():
    """Return the oldest eligible ``stage_2_queue`` row, or None if none is due.

    A row is eligible when it is neither completed nor failed, has fewer than
    ``RETRY_ATTEMPTS + 1`` attempts, and was either never started or started
    more than 10 minutes ago.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()
        cur.execute("""
            SELECT id, source, full_text, char_length
            FROM stage_2_queue
            WHERE completed_at IS NULL
              AND failed_at IS NULL
              AND (started_at IS NULL OR started_at < NOW() - INTERVAL '10 minutes')
              AND attempts < %s
            ORDER BY enqueued_at ASC
            LIMIT 1
        """, (RETRY_ATTEMPTS + 1,))
        return cur.fetchone()
    finally:
        # Close even when the query raises, so a failing database does not
        # leave a connection open on every poll.
        pg.close()


def run():
    """Poll ``stage_2_queue`` forever and process one row at a time.

    Writes a heartbeat on every iteration. Sleeps POLL_INTERVAL seconds when
    the queue is empty, 1 second after a success and 10 seconds after a
    failure. After two consecutive failures it calls ``recover_wedge()`` and
    resets the failure counter only if recovery succeeded. Any exception in
    an iteration is logged with its traceback and followed by a 10 second
    pause. Never returns.
    """
    log.info(f"Stage 2 worker starting (v{WORKER_VERSION})")
    consecutive_failures = 0

    while True:
        write_heartbeat()

        try:
            row = _fetch_next_row()

            if not row:
                consecutive_failures = 0
                time.sleep(POLL_INTERVAL)
                continue

            success = process_one(row)

            if not success:
                consecutive_failures += 1
                if consecutive_failures >= 2:
                    log.warning("Multiple consecutive failures — checking for Mistral wedge")
                    recovered = recover_wedge()
                    if recovered:
                        consecutive_failures = 0
                time.sleep(10)
            else:
                consecutive_failures = 0
                time.sleep(1)

        except Exception as e:
            # log.exception keeps the traceback, which the plain error
            # message alone does not.
            log.exception(f"Worker loop error: {e}")
            time.sleep(10)


if __name__ == "__main__":
    run()
def load_state():
    """Load resume state from STATE_FILE.

    Returns:
        ``(ingested, started_at, total_cost_estimate)`` where ``ingested`` is
        a set of source names. When the state file does not exist, returns
        ``(set(), None, 0)``.
    """
    if STATE_FILE.exists():
        data = json.loads(STATE_FILE.read_text())
        return set(data.get("ingested", [])), data.get("started_at"), data.get("total_cost_estimate", 0)
    return set(), None, 0


def save_state(ingested, started_at, total_cost_estimate):
    """Persist resume state to STATE_FILE as indented JSON.

    The file is written to a sibling ``.tmp`` path and then renamed over
    STATE_FILE, so an interruption mid-write cannot leave a truncated state
    file behind for the next resume.

    Args:
        ingested: iterable of source names already ingested (stored sorted).
        started_at: timestamp string of the run's start.
        total_cost_estimate: running cost estimate (rounded to 4 decimals).
    """
    payload = json.dumps({
        "ingested": sorted(ingested),
        "started_at": started_at,
        "last_updated": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "total_cost_estimate": round(total_cost_estimate, 4),
        "count": len(ingested),
    }, indent=2)
    tmp_path = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    tmp_path.write_text(payload)
    tmp_path.replace(STATE_FILE)


def fetch_tier1_sources(cur, max_sources, exclude_set):
    """Return the tier-1 sources that still need to be ingested.

    Tier 1 is the first ``max_sources`` sources with non-empty text, ordered
    by most recent ``created_at`` (or by highest ``id`` when the
    ``embeddings`` table has no ``created_at`` column). Sources listed in
    ``exclude_set`` still count toward the tier but are not returned, so a
    resumed or repeated run never reaches beyond the original tier.

    Args:
        cur: open database cursor on the pgvector database.
        max_sources: size of the tier.
        exclude_set: source names that were already ingested.

    Returns:
        List of ``(source, full_doc, most_recent)`` tuples, where
        ``full_doc`` is the source's chunks joined with blank lines in ``id``
        order and ``most_recent`` is None when there is no ``created_at``.
    """
    cur.execute("""
        SELECT column_name FROM information_schema.columns
        WHERE table_name = 'embeddings'
    """)
    columns = {r[0] for r in cur.fetchall()}

    if "created_at" in columns:
        order_clause = "MAX(created_at) DESC NULLS LAST"
        recent_expr = "MAX(created_at)"
    else:
        order_clause = "MAX(id) DESC"
        recent_expr = "NULL"

    # Only the two fixed SQL fragments chosen above are interpolated; the
    # limit is passed as a bound parameter. Twice the tier size is fetched to
    # leave headroom for sources whose aggregated text is empty.
    cur.execute(f"""
        SELECT source,
               STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc,
               {recent_expr} AS most_recent
        FROM embeddings
        GROUP BY source
        ORDER BY {order_clause}
        LIMIT %s
    """, (max_sources * 2,))

    selected = []
    tier_size = 0
    for source, doc, recent in cur.fetchall():
        if not doc:
            continue
        if tier_size >= max_sources:
            break
        tier_size += 1
        if source not in exclude_set:
            selected.append((source, doc, recent))
    return selected
def submit_batch(batch):
    """POST one batch of documents to the Graphiti sidecar's bulk endpoint.

    Each document is truncated to 12000 characters and sent with a fixed
    ``timestamp`` of ``2026-04-28T00:00:00``.

    Args:
        batch: list of ``(source, doc)`` tuples.

    Returns:
        Dict with keys ``ok``, ``status_code``, ``elapsed_s``, ``error`` (None
        on success, otherwise up to 500 characters of response or exception
        text) and ``sources``. Never raises: request failures are reported
        through ``ok=False``.
    """
    source_names = [s for s, _ in batch]
    payload = {
        "episodes": [
            {
                "name": source,
                "content": doc[:12000],
                "source_description": "tier1_migration",
                "timestamp": "2026-04-28T00:00:00",
            }
            for source, doc in batch
        ]
    }
    t0 = time.time()
    try:
        r = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=payload, timeout=900)
        elapsed = time.time() - t0
        return {
            "ok": r.ok,
            "status_code": r.status_code,
            "elapsed_s": round(elapsed, 2),
            "error": None if r.ok else r.text[:500],
            "sources": source_names,
        }
    except Exception as e:
        return {
            "ok": False,
            "status_code": None,
            "elapsed_s": round(time.time() - t0, 2),
            "error": str(e)[:500],
            "sources": source_names,
        }


def chunk_for_batches(sources, base_batch_size, long_threshold, long_batch_size):
    """Group sources into batches, using smaller batches for long documents.

    Input order is preserved. A batch only ever contains documents of one
    kind: "long" (``len(doc) >= long_threshold``, at most ``long_batch_size``
    per batch) or regular (at most ``base_batch_size`` per batch). A change
    of kind closes the current batch early.

    Args:
        sources: iterable of ``(source, doc, most_recent)`` tuples; the third
            element is ignored.
        base_batch_size: batch size for regular documents.
        long_threshold: character count at which a document counts as long.
        long_batch_size: batch size for long documents.

    Yields:
        Lists of ``(source, doc)`` tuples.
    """
    batch = []
    current_size_target = base_batch_size

    for source, doc, _ in sources:
        target_size = long_batch_size if len(doc) >= long_threshold else base_batch_size

        if batch and target_size != current_size_target:
            yield batch
            batch = []

        if not batch:
            # A fresh batch takes its size limit from its first document.
            # Without this, a long document opening a batch would inherit the
            # previous limit and could be grouped with regular documents.
            current_size_target = target_size

        batch.append((source, doc))
        if len(batch) >= current_size_target:
            yield batch
            batch = []

    if batch:
        yield batch


def main():
    """Command-line entry point for the tier 1 migration.

    Flags: ``--dry-run`` prints the planned batches and estimates without
    submitting anything, ``--max-sources`` sets the tier size, ``--reset``
    deletes the state file after confirmation.

    Checks the sidecar's ``/health`` endpoint, loads resume state, fetches
    the remaining tier-1 sources, submits them batch by batch (saving state
    after every batch) and finally writes a summary to RESULTS_FILE, which
    is overwritten on every run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--max-sources", type=int, default=MAX_SOURCES)
    parser.add_argument("--reset", action="store_true")
    args = parser.parse_args()

    if args.reset and STATE_FILE.exists():
        confirm = input(f"Delete state file {STATE_FILE}? [y/N] ")
        if confirm.lower() == "y":
            STATE_FILE.unlink()
            print("State file deleted. Resuming from scratch.")
        else:
            print("Aborted.")
            return

    print("=" * 70)
    print("Tier 1 Graphiti Migration")
    print("=" * 70)

    try:
        r = requests.get(f"{GRAPHITI_URL}/health", timeout=10)
        if not r.ok:
            print(f"ERROR: sidecar /health returned {r.status_code}")
            return
        print(f"Sidecar: {r.json()}")
    except Exception as e:
        print(f"ERROR: sidecar unreachable: {e}")
        return

    ingested, started_at, prior_cost = load_state()
    if started_at:
        print(f"Resuming run started at {started_at}")
        print(f" {len(ingested)} sources already ingested")
        print(f" Estimated cost so far: ${prior_cost:.2f}")
    else:
        started_at = time.strftime("%Y-%m-%dT%H:%M:%S")
        print(f"Fresh run starting at {started_at}")

    print()
    print(f"Fetching tier 1 sources from pgvector (max={args.max_sources})...")
    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor()
        sources = fetch_tier1_sources(cur, args.max_sources, ingested)
        cur.close()
    finally:
        # Release the connection even if the fetch raises.
        conn.close()

    print(f" {len(sources)} sources to ingest (excluding {len(ingested)} already done)")

    if not sources:
        print()
        print("Nothing to do — all tier 1 sources already ingested.")
        return

    short = sum(1 for _, d, _ in sources if len(d) < 1000)
    medium = sum(1 for _, d, _ in sources if 1000 <= len(d) < 5000)
    long_ = sum(1 for _, d, _ in sources if len(d) >= 5000)
    print(f" Distribution: short={short} medium={medium} long={long_}")
    print()

    batches = list(chunk_for_batches(sources, BATCH_SIZE, LONG_DOC_THRESHOLD, LONG_DOC_BATCH_SIZE))
    print(f" Will submit {len(batches)} batches (delay {BATCH_DELAY_S}s between)")
    print()

    # Per-episode cost figure used for both the dry-run estimate and the
    # running total below.
    cost_per_episode = 0.103

    if args.dry_run:
        n_episodes = sum(len(b) for b in batches)
        print("DRY RUN - first 10 batches:")
        for i, batch in enumerate(batches[:10], 1):
            print(f" [{i}] n={len(batch)} sources={[s[:40] for s, _ in batch]}")
        print(f" ... and {max(0, len(batches) - 10)} more batches")
        print()
        print("Estimated cost: ${:.2f}".format(cost_per_episode * n_episodes))
        # Runtime estimate: 8 seconds per episode plus the inter-batch delay.
        print("Estimated runtime: {:.1f} hours".format(
            (n_episodes * 8 + len(batches) * BATCH_DELAY_S) / 3600
        ))
        return

    total_start = time.time()
    batch_results = []
    successful_episodes = 0
    failed_episodes = 0
    estimated_cost = prior_cost

    for i, batch in enumerate(batches, 1):
        avg_chars = int(sum(len(d) for _, d in batch) / len(batch))
        bucket = "long" if avg_chars >= LONG_DOC_THRESHOLD else ("medium" if avg_chars >= 1000 else "short")
        print(f"[{i:3d}/{len(batches)}] [{bucket:6s}] n={len(batch)} avg={avg_chars:6d}c", end=" ", flush=True)

        result = submit_batch(batch)
        batch_results.append(result)

        if result["ok"]:
            print(f" 200 {result['elapsed_s']}s")
            for source, _ in batch:
                ingested.add(source)
            successful_episodes += len(batch)
            estimated_cost += cost_per_episode * len(batch)
            save_state(ingested, started_at, estimated_cost)
        else:
            err_text = result["error"] or ""
            print(f" FAIL: {err_text[:80]}")
            failed_episodes += len(batch)
            save_state(ingested, started_at, estimated_cost)

            # Extra pause chosen from the error text. The "rate" check is a
            # plain substring match on the lower-cased message.
            if "Max pending queries" in err_text:
                print(" FalkorDB queue overflow - pausing 30s")
                time.sleep(30)
            elif "timed out" in err_text.lower():
                print(" Query timeout - pausing 15s")
                time.sleep(15)
            elif "rate" in err_text.lower() or "429" in err_text:
                print(" Rate limited - pausing 60s")
                time.sleep(60)

        if i < len(batches):
            time.sleep(BATCH_DELAY_S)

    total_elapsed = time.time() - total_start

    summary = {
        "started_at": started_at,
        "completed_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "total_elapsed_s": round(total_elapsed, 1),
        "total_elapsed_hours": round(total_elapsed / 3600, 2),
        "n_batches": len(batches),
        "successful_episodes": successful_episodes,
        "failed_episodes": failed_episodes,
        "total_ingested_now": len(ingested),
        "estimated_total_cost": round(estimated_cost, 2),
        "batch_results": batch_results,
    }
    RESULTS_FILE.write_text(json.dumps(summary, indent=2))

    print()
    print("=" * 70)
    print("TIER 1 MIGRATION COMPLETE")
    print("=" * 70)
    print(f"Successful episodes: {successful_episodes}/{successful_episodes + failed_episodes}")
    print(f"Failed episodes: {failed_episodes}")
    print(f"Total ingested now: {len(ingested)}")
    print(f"Wall-clock: {total_elapsed/3600:.2f} hours")
    print(f"Estimated cost: ${estimated_cost:.2f}")
    print()
    print(f"State file: {STATE_FILE}")
    print(f"Results file: {RESULTS_FILE}")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print()
        print("Interrupted. State saved. Re-run to resume.")
        sys.exit(130)