From b6fe350ab2909590a41d3252f1a594b377db6cd5 Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Tue, 28 Apr 2026 02:47:41 +0000 Subject: [PATCH] experiments: add consistency test and briefing generator results + scripts --- briefing_test_results.json | 1673 +++++++++++++++++++++ consistency_test_results.json | 1539 +++++++++++++++++++ experiments/briefing_test_results.json | 1673 +++++++++++++++++++++ experiments/consistency_test_results.json | 1539 +++++++++++++++++++ scripts/briefing_test.py | 313 ++++ scripts/consistency_test.py | 248 +++ 6 files changed, 6985 insertions(+) create mode 100644 briefing_test_results.json create mode 100644 consistency_test_results.json create mode 100644 experiments/briefing_test_results.json create mode 100644 experiments/consistency_test_results.json create mode 100644 scripts/briefing_test.py create mode 100644 scripts/consistency_test.py diff --git a/briefing_test_results.json b/briefing_test_results.json new file mode 100644 index 0000000..7a870d4 --- /dev/null +++ b/briefing_test_results.json @@ -0,0 +1,1673 @@ +{ + "meta": { + "model": "mistral", + "sample_size": 50, + "started": "2026-04-28T02:09:30.069385", + "completed": "2026-04-28T02:24:39.485763", + "total_elapsed_seconds": 909.7, + "avg_seconds_per_doc": 18.2 + }, + "documents": [ + { + "id": "5ee0b3bb_0", + "source": "00_Syllabus.docx", + "content_hash": "848c971c", + "content_length": 2273, + "status": "SUCCESS", + "elapsed_seconds": 35.6, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 568, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + 
"noise_reduction_pct": 0.0, + "total_reduction_pct": 26.0 + } + }, + { + "id": "4451e0d5_1", + "source": "01_ALL_Overview of AM and 3DP_v3.pptx", + "content_hash": "1e3ff98f", + "content_length": 2167, + "status": "SUCCESS", + "elapsed_seconds": 21.2, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 542, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.0 + } + }, + { + "id": "0619cec0_0", + "source": "01_NURBS Curves.docx", + "content_hash": "2ac1bb56", + "content_length": 1401, + "status": "SUCCESS", + "elapsed_seconds": 19.7, + "briefing": { + "document_type": "code", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 350, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 36.3 + } + }, + { + "id": "d0a3917e_0", + "source": "02_2D Geometry.docx", + "content_hash": "5d53f099", + "content_length": 188, + "status": "SUCCESS", + "elapsed_seconds": 10.4, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": 
false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 47, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 81.0 + } + }, + { + "id": "89fed291_0", + "source": "02_Point of Curves - AARON.docx", + "content_hash": "864be8ed", + "content_length": 2116, + "status": "SUCCESS", + "elapsed_seconds": 20.0, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 529, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.4 + } + }, + { + "id": "2a15be8d_0", + "source": "02_Point of Curves.docx", + "content_hash": "4b683753", + "content_length": 1338, + "status": "SUCCESS", + "elapsed_seconds": 16.2, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullets", + "code_blocks" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 334, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 37.4 + } + }, + { + "id": "a2a7a8d3_2", + "source": 
"02_PPT_ALL_AM_Technologies_for_3DP_v3.pptx", + "content_hash": "1c260a03", + "content_length": 1675, + "status": "SUCCESS", + "elapsed_seconds": 21.7, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 419, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 32.3 + } + }, + { + "id": "2b3b1c34_0", + "source": "03_2D Transformation and Deformation.docx", + "content_hash": "306dc581", + "content_length": 418, + "status": "SUCCESS", + "elapsed_seconds": 11.2, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 104, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 65.7 + } + }, + { + "id": "b425985b_0", + "source": "03_2D Transformation, Deformation, and Editing-AARON.docx", + "content_hash": "9f9c422a", + "content_length": 541, + "status": "SUCCESS", + "elapsed_seconds": 12.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + 
"likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 135, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 59.7 + } + }, + { + "id": "1c78c79f_0", + "source": "03_Editing Geometry.docx", + "content_hash": "9491f6cc", + "content_length": 171, + "status": "SUCCESS", + "elapsed_seconds": 10.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 43, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 82.4 + } + }, + { + "id": "6453f3a8_7", + "source": "04_ALL_Materials and Their Properties_v3.pptx", + "content_hash": "0c319651", + "content_length": 380, + "status": "SUCCESS", + "elapsed_seconds": 12.3, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 95, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 67.8 + } + }, + { + "id": "1fd396d4_0", + "source": "04_Annotations.docx", + "content_hash": "9b3e57cd", + "content_length": 
737, + "status": "SUCCESS", + "elapsed_seconds": 14.9, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 184, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 9, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 54.4 + } + }, + { + "id": "38d1cf0d_0", + "source": "05_Entering the third dimension.docx", + "content_hash": "8f56202a", + "content_length": 2175, + "status": "SUCCESS", + "elapsed_seconds": 19.9, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists", + "code_blocks" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 544, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 26.9 + } + }, + { + "id": "cfd1ee43_0", + "source": "05_Making things solid.docx", + "content_hash": "12634c4c", + "content_length": 692, + "status": "SUCCESS", + "elapsed_seconds": 13.4, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + 
"bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 173, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 53.6 + } + }, + { + "id": "31c729e4_8", + "source": "05_PPT_ALL_Machine Technology and Specifications_v3.pptx", + "content_hash": "bf8daf4b", + "content_length": 2886, + "status": "SUCCESS", + "elapsed_seconds": 20.0, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 722, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 21.7 + } + }, + { + "id": "a8d4d8a4_0", + "source": "06_3D_Editing.docx", + "content_hash": "887133dc", + "content_length": 157, + "status": "SUCCESS", + "elapsed_seconds": 9.9, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 39, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 83.6 + } + }, + { + "id": "c0d0659e_0", + "source": "06_Gumball.docx", + "content_hash": "c46dbc48", + "content_length": 1980, + "status": "SUCCESS", + "elapsed_seconds": 20.1, + 
"briefing": { + "document_type": "code", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 495, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 28.8 + } + }, + { + "id": "8f9d093e_4", + "source": "06_PPT_ALL_Design Considerations_From CAD to CAM_v3.pptx", + "content_hash": "0432ffd0", + "content_length": 2884, + "status": "SUCCESS", + "elapsed_seconds": 23.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 721, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 21.7 + } + }, + { + "id": "4e7db487_0", + "source": "07_Cube Assignment_2018f.docx", + "content_hash": "af2f5bab", + "content_length": 1316, + "status": "SUCCESS", + "elapsed_seconds": 18.9, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [ + "formatting_artifacts" + ], + 
"extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 329, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 16, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 40.9 + } + }, + { + "id": "1a5b6da3_0", + "source": "07_Make2D.docx", + "content_hash": "d71c1df4", + "content_length": 834, + "status": "SUCCESS", + "elapsed_seconds": 14.6, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 208, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 49.0 + } + }, + { + "id": "97ba28bd_5", + "source": "07_PPT_ALL_Fabrication Considerations_v3.pptx", + "content_hash": "7ffb6f57", + "content_length": 710, + "status": "SUCCESS", + "elapsed_seconds": 14.6, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "tables" + ], + "noise_signals": [ + "repeated_headers" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 178, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 9, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 55.3 + } + }, + { + "id": "0892b9fa_2", + "source": "08_PPT_ALL_PostProcessing for FDM and PolyJet_v3.pptx", + "content_hash": "9fb49737", + "content_length": 2939, + "status": "SUCCESS", + 
"elapsed_seconds": 19.2, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 735, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 21.4 + } + }, + { + "id": "9446c72b_0", + "source": "08_Printing_Technicals.docx", + "content_hash": "365e53a6", + "content_length": 1310, + "status": "SUCCESS", + "elapsed_seconds": 18.7, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullets", + "images" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 328, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 16, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 41.0 + } + }, + { + "id": "44b7a630_0", + "source": "09_Tolerance Test Part.docx", + "content_hash": "f6a14d20", + "content_length": 817, + "status": "SUCCESS", + "elapsed_seconds": 14.8, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [ + 
"formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 204, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 10, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 52.0 + } + }, + { + "id": "4a74de83_0", + "source": "09_Tolerance Test Part.pdf", + "content_hash": "7f3106a9", + "content_length": 1049, + "status": "SUCCESS", + "elapsed_seconds": 15.6, + "briefing": { + "document_type": "notes", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 262, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 43.3 + } + }, + { + "id": "aa807935_0", + "source": "10 Good Things about Aaron for DSI.docx", + "content_hash": "ff5081e9", + "content_length": 1126, + "status": "SUCCESS", + "elapsed_seconds": 17.7, + "briefing": { + "document_type": "unknown", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists" + ], + "noise_signals": [ + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 282, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 28, + "noise_reduction_pct": 10.0, + "total_reduction_pct": 47.4 + } + }, + { + "id": "90248749_1", + "source": "10_Moving Parts.docx", + "content_hash": "3c2e28b9", + "content_length": 218, + "status": "SUCCESS", + 
"elapsed_seconds": 10.2, + "briefing": { + "document_type": "code", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "no structure signals" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 54, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 78.6 + } + }, + { + "id": "958e5aac_0", + "source": "1119345.pdf", + "content_hash": "af7da5db", + "content_length": 1745, + "status": "SUCCESS", + "elapsed_seconds": 30.4, + "briefing": { + "document_type": "unknown", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists", + "numbered_lists", + "tables" + ], + "noise_signals": [ + "repeated_headers", + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 436, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 65, + "noise_reduction_pct": 15.0, + "total_reduction_pct": 41.7 + } + }, + { + "id": "cbb41390_1", + "source": "2016 - DDF 205 - CAD I Syllabus.pdf", + "content_hash": "466b3184", + "content_length": 2161, + "status": "SUCCESS", + "elapsed_seconds": 20.2, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + 
"noise_signals": [ + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 540, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 54, + "noise_reduction_pct": 10.0, + "total_reduction_pct": 34.3 + } + }, + { + "id": "86543785_1", + "source": "2017 - DDF 220 - Intro to Computational Media Syllabus.pdf", + "content_hash": "3ad85610", + "content_length": 2224, + "status": "SUCCESS", + "elapsed_seconds": 20.0, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 556, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 26.5 + } + }, + { + "id": "fc7941cf_1", + "source": "2018 - DDF 205 - CAD I Syllabus.pdf", + "content_hash": "2632e62b", + "content_length": 2618, + "status": "SUCCESS", + "elapsed_seconds": 20.8, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [ + "repeated_headers", + "page_numbers", + "formatting_artifacts", + "boilerplate" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 654, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 131, + "noise_reduction_pct": 20.0, + "total_reduction_pct": 38.7 + } + }, + { + "id": 
"56b61c68_3", + "source": "2019-2020 Research and Creative Projects Awards Guidelines.FINAL.pdf", + "content_hash": "1a4e890b", + "content_length": 2228, + "status": "SUCCESS", + "elapsed_seconds": 20.6, + "briefing": { + "document_type": "form", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 557, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 28, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 30.1 + } + }, + { + "id": "9ed5c43e_2", + "source": "2019 - DDF 305 - Materials Syllabus.pdf", + "content_hash": "c0521ba2", + "content_length": 1842, + "status": "SUCCESS", + "elapsed_seconds": 22.0, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists", + "numbered_lists", + "tables" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 460, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 30.3 + } + }, + { + "id": "2f02bfa5_2", + "source": "2020 - DDF 220 - Intro to Computational Media Syllabus.pdf", + "content_hash": "fe3ca5be", + "content_length": 2580, + "status": "SUCCESS", + "elapsed_seconds": 20.5, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + 
"has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists", + "tables" + ], + "noise_signals": [ + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 645, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 64, + "noise_reduction_pct": 10.0, + "total_reduction_pct": 31.3 + } + }, + { + "id": "c0cd3599_3", + "source": "2021 - DDF 320 - Design Intents Syllabus.pdf", + "content_hash": "588d34a3", + "content_length": 1560, + "status": "SUCCESS", + "elapsed_seconds": 23.8, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists", + "numbered_lists" + ], + "noise_signals": [ + "repeated_headers", + "page_numbers" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 390, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 39, + "noise_reduction_pct": 10.0, + "total_reduction_pct": 40.5 + } + }, + { + "id": "9ea5656f_2", + "source": "2023 Faculty Report Aaron Nelson.docx", + "content_hash": "fd68d021", + "content_length": 2698, + "status": "SUCCESS", + "elapsed_seconds": 19.6, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, 
+ "token_reduction_estimate": { + "original_tokens_approx": 674, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 22.9 + } + }, + { + "id": "33aae3e5_2", + "source": "2023 Faculty Report Template.docx", + "content_hash": "c2d50031", + "content_length": 2100, + "status": "SUCCESS", + "elapsed_seconds": 18.3, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 525, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.6 + } + }, + { + "id": "bf155f9f_0", + "source": "2026-04-26-22-44-voice.md", + "content_hash": "41cc3d28", + "content_length": 165, + "status": "SUCCESS", + "elapsed_seconds": 10.9, + "briefing": { + "document_type": "unknown", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [ + "**type:**", + "**modality:**", + "**status:**" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 41, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 6, + "noise_reduction_pct": 15.0, + "total_reduction_pct": 85.5 + } + }, + { + "id": "5c9f5ad5_0", + "source": "2026-04-26-22-52-voice.md", + "content_hash": "0ed1efba", + "content_length": 171, + "status": "SUCCESS", + "elapsed_seconds": 10.8, + "briefing": { + "document_type": 
"chat_log", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists" + ], + "noise_signals": [ + "repeated_headers", + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 43, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 6, + "noise_reduction_pct": 15.0, + "total_reduction_pct": 85.0 + } + }, + { + "id": "8bc956ad_0", + "source": "2026-04-26-23-04-voice.md", + "content_hash": "c455ef44", + "content_length": 931, + "status": "SUCCESS", + "elapsed_seconds": 16.0, + "briefing": { + "document_type": "chat_log", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 233, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 12, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 48.9 + } + }, + { + "id": "af176130_0", + "source": "2026-04-26-lucid-1.md", + "content_hash": "d9c51a1c", + "content_length": 2444, + "status": "SUCCESS", + "elapsed_seconds": 20.2, + "briefing": { + "document_type": "chat_log", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bulleted_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + 
"token_reduction_estimate": { + "original_tokens_approx": 611, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 24.7 + } + }, + { + "id": "52114711_0", + "source": "2026-04-26-lucid.md", + "content_hash": "4c5fb648", + "content_length": 2437, + "status": "SUCCESS", + "elapsed_seconds": 18.4, + "briefing": { + "document_type": "chat_log", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 609, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 24.7 + } + }, + { + "id": "1bf832a0_0", + "source": "2026-04-26-nrem-1.md", + "content_hash": "1ad1e9c1", + "content_length": 1586, + "status": "SUCCESS", + "elapsed_seconds": 21.4, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [ + "repeated_headers" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 396, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 20, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 36.9 + } + }, + { + "id": "a16d6571_0", + "source": "2026-04-26-nrem.md", + "content_hash": "1714ccc0", + "content_length": 1638, + "status": "SUCCESS", + "elapsed_seconds": 19.4, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": 
"high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 410, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 32.8 + } + }, + { + "id": "b696802f_0", + "source": "2026-04-27-04-34-image.md", + "content_hash": "3cce200d", + "content_length": 2027, + "status": "SUCCESS", + "elapsed_seconds": 20.3, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 507, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 28.3 + } + }, + { + "id": "6bc36d6f_0", + "source": "2026-04-27-04-36-image.md", + "content_hash": "29717d0c", + "content_length": 1755, + "status": "SUCCESS", + "elapsed_seconds": 20.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 439, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + 
"noise_reduction_pct": 0.0, + "total_reduction_pct": 31.3 + } + }, + { + "id": "8b7ed0da_0", + "source": "2026-04-27-04-41-image.md", + "content_hash": "47a1f451", + "content_length": 2148, + "status": "SUCCESS", + "elapsed_seconds": 20.2, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "visual description", + "content" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 537, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.1 + } + }, + { + "id": "700d4582_0", + "source": "2026-04-27-06-21-image.md", + "content_hash": "b143e6fc", + "content_length": 1643, + "status": "SUCCESS", + "elapsed_seconds": 19.7, + "briefing": { + "document_type": "notes", + "primary_language": "en", + "density": "low", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [], + "noise_signals": [ + "page_numbers" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 411, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 21, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 36.1 + } + }, + { + "id": "31317444_0", + "source": "2026-04-27-19-04-image.md", + "content_hash": "8bd62d02", + "content_length": 1767, + "status": "SUCCESS", + "elapsed_seconds": 19.3, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + 
"has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 442, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 31.2 + } + }, + { + "id": "bc4bffcd_0", + "source": "2026-04-27-20-18-image.md", + "content_hash": "c33f8f22", + "content_length": 1856, + "status": "SUCCESS", + "elapsed_seconds": 20.3, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 464, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 23, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 33.6 + } + } + ], + "summary": { + "total": 50, + "success": 50, + "failed": 0, + "success_rate": 100.0, + "extraction_priority_breakdown": { + "full": 39, + "partial": 11, + "skip": 0 + }, + "avg_token_reduction_pct": 42.3, + "total_elapsed_seconds": 909.7, + "avg_seconds_per_doc": 18.2, + "projected_50_doc_minutes": 15.2, + "approach_viable": true + } +} \ No newline at end of file diff --git a/consistency_test_results.json b/consistency_test_results.json new file mode 100644 index 0000000..88d8e06 --- /dev/null +++ b/consistency_test_results.json @@ -0,0 +1,1539 @@ +{ + "meta": { + "model": "mistral", + "passes": 3, + "sample_size": 50, + "started": "2026-04-28T01:39:19.750205", + "completed": "2026-04-28T01:56:58.176009" + }, + "documents": [ + { + "id": 
"claude_bbe0172d-3087-4238-a51c-7dca6c0b6f28_1326", + "source": "Claude: Setting up a custom OpenClaw instance", + "content_hash": "0ff1319e", + "content_length": 700, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.6, + 2.6, + 2.5 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [ + "30 days" + ], + "document_type": "conversation" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"30 days\"],\n \"document_type\": \"conversation\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"30 days\"],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"30 days\"],\n \"document_type\": \"conversation\"\n}" + ] + }, + { + "id": "1a545c4e_432", + "source": "ulysses.txt", + "content_hash": "5d6d9cf1", + "content_length": 2807, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 18.1, + 3.6, + 5.2 + ], + "extraction_sample": { + "people": [ + "bloom", + "corny kelleher", + "first watch", + "second watch" + ], + "organizations": [ + "behan's", + "jammet's" + ], + "locations": [ + "gardiner street", + "nighttown", + "scaffolding" + ], + "dates": [], + "document_type": "literature" + }, + "raw_samples": [ + "{\n \"people\": [\"Bloom\", \"First Watch\", \"Second Watch\", \"Corny Kelleher\"],\n \"organizations\": [\"Jammet's\", \"Behan's\"],\n \"locations\": [\"Nighttown\", \"Gardiner street\", \"scaffolding\"],\n \"document_type\":", + "{\n \"people\": [\"BLOOM\", \"CORNY KELLEHER\", \"Father\"],\n \"locations\": [\"nighttown\", \"Gardiner street\"],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Bloom\", \"Corny Kelleher\", \"First Watch\", \"Second Watch\", \"Father\"],\n \"organizations\": [],\n \"locations\": [\"Gardiner street\", \"nighttown\", \"Jammet's\"],\n \"dates\": [],\n 
\"document_type\"" + ] + }, + { + "id": "claude_bbe0172d-3087-4238-a51c-7dca6c0b6f28_430", + "source": "Claude: Setting up a custom OpenClaw instance", + "content_hash": "37dc9e1c", + "content_length": 2937, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 19.0, + 2.2, + 2.2 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "aaronnelson.studio", + "nextcloud" + ], + "locations": [ + "nextcloud.aaronnelson.studio" + ], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Nextcloud\", \"aaronnelson.studio\"],\n \"locations\": [\"nextcloud.aaronnelson.studio\"],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"locations\": [\"nextcloud.aaronnelson.studio\"],\n \"document_type\": \"text\"\n}", + "{\n \"locations\": [\"nextcloud.aaronnelson.studio\"],\n \"document_type\": \"plain text\"\n}" + ] + }, + { + "id": "0e18a972_12", + "source": "RH50-TM-L1-EN-20140902.pdf", + "content_hash": "dd1e2907", + "content_length": 2709, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.8, + 2.7, + 3.2 + ], + "extraction_sample": { + "people": [ + "robert mcneel" + ], + "organizations": [ + "robert mcneel & associates" + ], + "locations": [], + "dates": [], + "document_type": "" + }, + "raw_samples": [ + "{\n \"people\": [\"Robert McNeel\"],\n \"organizations\": [\"Robert McNeel & Associates\"]\n}", + "{\n \"people\": [\"Robert McNeel\"],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"document_type\": \"instructions\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Instruction manual\"\n}" + ] + }, + { + "id": "1a545c4e_197", + "source": "ulysses.txt", + "content_hash": "d911e3fd", + "content_length": 2741, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 16.4, + 5.9, + 5.8 + ], + "extraction_sample": 
{ + "people": [ + "girlgold", + "hugh machugh", + "kennedy", + "lenehan", + "miss douce", + "mr dedalus", + "stephen" + ], + "organizations": [ + "mooney's en ville", + "mooney's sur mer" + ], + "locations": [], + "dates": [], + "document_type": "fiction" + }, + "raw_samples": [ + "{\n \"people\": [\"Kennedy\", \"girlgold\", \"Miss Douce\", \"Stephen\", \"Lenehan\", \"Mr Dedalus\", \"Hugh MacHugh\"],\n \"organizations\": [\"Mooney's EN VILLE\", \"Mooney's SUR MER\"],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Kennedy\", \"Miss Douce\", \"Lenehan\", \"Stephen\", \"Mr Dedalus\", \"Hugh MacHugh\"],\n \"organizations\": [\"Mooney's EN VILLE\", \"Mooney's SUR MER\"],\n \"locations\": [],\n \"dates\": [],\n \"document", + "{\n \"people\": [\"Kennedy\", \"Mr Dedalus\", \"Lenehan\", \"Stephen\", \"Hugh MacHugh\"],\n \"organizations\": [],\n \"locations\": [\"Mooney's EN VILLE\", \"Mooney's SUR MER\"],\n \"dates\": [],\n \"document_type\": \"ficti" + ] + }, + { + "id": "88602096_2", + "source": "Kingston Grads Ind Study SP2023 Syllabus.pdf", + "content_hash": "43b2ec27", + "content_length": 2525, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.7, + 7.2, + 7.0 + ], + "extraction_sample": { + "people": [ + "members", + "students" + ], + "organizations": [ + "dean of your school", + "department chair", + "office of human resources, diversity & inclusion (hrdi)", + "suny new paltz", + "title ix office" + ], + "locations": [ + "campus buildings", + "new paltz", + "student union, room 100 south" + ], + "dates": [], + "document_type": "policy statement" + }, + "raw_samples": [ + "{\n \"people\": [\"members\", \"students\"],\n \"organizations\": [\"SUNY New Paltz\", \"Title IX Office\", \"department chair\", \"dean of your school\", \"Office of Human Resources, Diversity & Inclusion (HRDI)\"],\n ", + "{\n \"people\": [\"members\", \"students\"],\n \"organizations\": [\"SUNY New Paltz\", \"Title IX Office\", \"department chair 
and/or the dean of your school\", \"Office of Human Resources, Diversity & Inclusion (HR", + "{\n \"people\": [\"members\", \"students\"],\n \"organizations\": [\"SUNY New Paltz\", \"Title IX Office\", \"department chair\", \"dean of your school\", \"Office of Human Resources, Diversity & Inclusion (HRDI)\"],\n " + ] + }, + { + "id": "7785818c_1", + "source": "The Extended Mind _ The Power of Thinking Outside the Brain -- Annie Murphy Paul.pdf", + "content_hash": "48faec1d", + "content_length": 2891, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 12.5, + 3.4, + 3.3 + ], + "extraction_sample": { + "people": [ + "fr\u00e9d\u00e9ric gros", + "nietzsche" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Nietzsche\", \"Fr\u00e9d\u00e9ric Gros\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Nietzsche\", \"Fr\u00e9d\u00e9ric Gros\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Nietzsche\", \"Fr\u00e9d\u00e9ric Gros\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "1a545c4e_3", + "source": "ulysses.txt", + "content_hash": "73ad0e8d", + "content_length": 2733, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.8, + 3.8, + 3.7 + ], + "extraction_sample": { + "people": [ + "buck mulligan", + "stephen", + "ursula" + ], + "organizations": [], + "locations": [ + "irish art", + "island" + ], + "dates": [], + "document_type": "fiction" + }, + "raw_samples": [ + "{\n \"people\": [\"Stephen\", \"Buck Mulligan\", \"Ursula\"],\n \"organizations\": [],\n \"locations\": [\"Irish art\", \"Island\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Stephen\", \"Buck Mulligan\", 
\"Ursula\"],\n \"organizations\": [],\n \"locations\": [\"Irish art\", \"the island\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Stephen\", \"Buck Mulligan\", \"Ursula\"],\n \"organizations\": [],\n \"locations\": [\"Irish island\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}" + ] + }, + { + "id": "1a545c4e_46", + "source": "ulysses.txt", + "content_hash": "f42b21d0", + "content_length": 2833, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.3, + 3.5, + 3.4 + ], + "extraction_sample": { + "people": [ + "eleven", + "she" + ], + "organizations": [], + "locations": [ + "dolphin's barn" + ], + "dates": [], + "document_type": "book title (assumed from 'ruby: the pride of the ring')" + }, + "raw_samples": [ + "{\n \"people\": [\"Eleven\", \"She\"],\n \"organizations\": [],\n \"locations\": [\"Dolphin's Barn\"],\n \"dates\": [],\n \"document_type\": \"Book Title (assumed from 'RUBY: THE PRIDE OF THE RING')\"\n}", + "{\n \"people\": [\"Eleven\", \"She\"],\n \"organizations\": [],\n \"locations\": [\"Dolphin's Barn\"],\n \"dates\": [],\n \"document_type\": \"Book\"\n}", + "{\n \"people\": [\"Eleven\", \"She\"],\n \"organizations\": [],\n \"locations\": [\"Dolphin's Barn\"],\n \"dates\": [],\n \"document_type\": \"book\"\n}" + ] + }, + { + "id": "9d366a6a_2", + "source": "Circuit Intro.pptx", + "content_hash": "5467e94e", + "content_length": 2946, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 11.7, + 2.4, + 2.5 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n 
\"dates\": [],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "49e3545d_33", + "source": "RH50-TM-L1-EN-20140902.pdf", + "content_hash": "6e608b0e", + "content_length": 2928, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 12.0, + 3.2, + 3.3 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "robert mcneel & associates" + ], + "locations": [], + "dates": [], + "document_type": "model" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"model\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Cplanes.3dm\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Cplanes.3dm\"\n}" + ] + }, + { + "id": "fc378df0_203", + "source": "ulysses.txt", + "content_hash": "eb17907a", + "content_length": 2945, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.6, + 4.7, + 4.9 + ], + "extraction_sample": { + "people": [ + "ben dollard", + "george lidwell", + "lydia douce" + ], + "organizations": [ + "the independent" + ], + "locations": [], + "dates": [], + "document_type": "fiction" + }, + "raw_samples": [ + "{\n \"people\": [\"Ben Dollard\", \"Lydia Douce\", \"George Lidwell\"],\n \"organizations\": [\"The INDEPENDENT\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Fiction\"\n}", + "{\n \"people\": [\"Bloom\", \"Molly\", \"Ben Dollard\", \"Miss Douce\", \"Lydia Douce\", \"George Lidwell\"],\n \"organizations\": [\"INDEPENDENT\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Bloom\", \"Molly\", \"Ben Dollard\", \"Miss Douce\", \"Lydia Douce\", \"George Lidwell\"],\n \"organizations\": [\"INDEPENDENT\"],\n \"locations\": [],\n \"dates\": 
[],\n \"document_type\": \"newspaper arti" + ] + }, + { + "id": "1a545c4e_12", + "source": "ulysses.txt", + "content_hash": "2d5907e9", + "content_length": 2800, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.3, + 3.5, + 4.3 + ], + "extraction_sample": { + "people": [ + "buck mulligan", + "haines", + "stephen" + ], + "organizations": [], + "locations": [ + "bay", + "elsinore", + "sea" + ], + "dates": [], + "document_type": "literature" + }, + "raw_samples": [ + "{\n \"people\": [\"Stephen\", \"Haines\", \"Buck Mulligan\"],\n \"locations\": [\"Elsinore\", \"sea\", \"bay\"],\n \"document_type\": \"literature\"\n}", + "{\n \"people\": [\"Stephen\", \"Haines\", \"Buck Mulligan\"],\n \"organizations\": [],\n \"locations\": [\"Elsinore\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Stephen\", \"Haines\", \"Buck Mulligan\"],\n \"organizations\": [],\n \"locations\": [\"Elsinore\", \"Sea\", \"Bay\", \"Muglins\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}" + ] + }, + { + "id": "e6101efc_1", + "source": "3D Printing \u2013 Designing for Additive2.pptx", + "content_hash": "7bb3f96b", + "content_length": 639, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 11.1, + 2.9, + 2.3 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "hvamc" + ], + "locations": [], + "dates": [ + "2017" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"document_type\": \"article\",\n \"locations\": [],\n \"people\": [],\n \"organizations\": [\"HVAMC\"],\n \"dates\": [\"2017\"]\n}", + "{\n \"document_type\": \"article\",\n \"organizations\": [\"HVAMC\"],\n \"locations\": [],\n \"dates\": [\"2017\"],\n \"people\": []\n}", + "{\n \"organizations\": [\"HVAMC\"],\n \"dates\": [\"2017\"],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "fc378df0_79", + "source": "ulysses.txt", + "content_hash": "f9333d86", + "content_length": 2691, + "status": 
"INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.3, + 4.5, + 3.5 + ], + "extraction_sample": { + "people": [ + "dedalus", + "father coffey" + ], + "organizations": [], + "locations": [ + "saint werburgh's" + ], + "dates": [], + "document_type": "novel" + }, + "raw_samples": [ + "{\n \"people\": [\"Father Coffey\", \"Dedalus\"],\n \"organizations\": [],\n \"locations\": [\"Saint Werburgh's\"],\n \"dates\": [],\n \"document_type\": \"novel\"\n}", + "{\n \"people\": [\"Father Coffey\", \"Dedalus\", \"Molly\", \"Mervyn Browne\"],\n \"organizations\": [],\n \"locations\": [\"Saint Werburgh's\"],\n \"dates\": [],\n \"document_type\": \"novel\"\n}", + "{\n \"people\": [\"Father Coffey\", \"Dedalus\"],\n \"organizations\": [],\n \"locations\": [\"Saint Werburgh's\"],\n \"dates\": [],\n \"document_type\": \"novel\"\n}" + ] + }, + { + "id": "25df23c5_0", + "source": "MIC Work Instructions.docx", + "content_hash": "542035c0", + "content_length": 2192, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.3, + 2.9, + 3.3 + ], + "extraction_sample": { + "people": [ + "users" + ], + "organizations": [ + "mic", + "newpaltz" + ], + "locations": [ + "print center" + ], + "dates": [], + "document_type": "email" + }, + "raw_samples": [ + "{\n \"people\": [\"users\"],\n \"organizations\": [\"Newpaltz\", \"MIC\"],\n \"locations\": [\"print center\"],\n \"document_type\": \"email\"\n}", + "{\n \"people\": [\"users\"],\n \"organizations\": [\"Newpaltz.edu\", \"MIC\"],\n \"locations\": [\"print center\"],\n \"document_type\": \"email\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Newpaltz\", \"MIC\"],\n \"locations\": [\"print center\"],\n \"dates\": [],\n \"document_type\": \"email\"\n}" + ] + }, + { + "id": "7369fb88_0", + "source": "Selected FDM Materials - EN Chemical Compatibility (2).pdf", + "content_hash": "9577291b", + "content_length": 2976, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 13.5, + 
3.0, + 2.9 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "stratasys" + ], + "locations": [], + "dates": [], + "document_type": "data sheet" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Stratasys\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Data Sheet\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Stratasys\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Data Sheet\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Stratasys\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Data Sheet\"\n}" + ] + }, + { + "id": "a230f2c3_593", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "b9bb7299", + "content_length": 1373, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 23.8, + 7.0, + 6.1 + ], + "extraction_sample": { + "people": [ + "carlo ginzburg", + "john hutchin", + "leo wieseltier" + ], + "organizations": [], + "locations": [ + "greenwood" + ], + "dates": [ + "1774", + "1808", + "1990", + "aug. 5, 1991" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Carlo Ginzburg\", \"John Hutchin\", \"Leo Wieseltier\"],\n \"organizations\": [],\n \"locations\": [\"Greenwood\"],\n \"dates\": [\"1990\", \"Aug. 5, 1991\", \"1774\", \"1808\"],\n \"document_type\": \"articl", + "{\n \"people\": [\"Cambell\", \"American Scholar\", \"Carlo Ginzburg\", \"Leo Weiseltier\"],\n \"organizations\": [],\n \"locations\": [\"Greenwood\"],\n \"dates\": [\"1990\", \"Aug. 
5, 1991\", \"1774\", \"1808\"],\n \"document", + "{\n \"people\": [\"Carlo Ginzburg\", \"John Hutchin\", \"Leon Weiseltier\"],\n \"organizations\": [],\n \"locations\": [\"Greenwood\", \"Dorset\", \"London\"],\n \"dates\": [\"1990\", \"1774\", \"1808\"],\n \"document_type\": \"a" + ] + }, + { + "id": "91ccefdd_35", + "source": "Cognition in the Wild (A Bradford Book) -- Hutchins, Edwin.pdf", + "content_hash": "02c3beb2", + "content_length": 2722, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.5, + 3.0, + 3.0 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [ + "region around the ship", + "shore" + ], + "dates": [], + "document_type": "book" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [\"region around the ship\", \"shore\"],\n \"dates\": [],\n \"document_type\": \"book\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [\"region around the ship\", \"ship\"],\n \"dates\": [],\n \"document_type\": \"book\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [\"region around the ship\", \"shore\"],\n \"dates\": [],\n \"document_type\": \"book\"\n}" + ] + }, + { + "id": "a230f2c3_310", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "310615d5", + "content_length": 2808, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.6, + 4.8, + 4.8 + ], + "extraction_sample": { + "people": [ + "keymis", + "ralegh" + ], + "organizations": [], + "locations": [ + "england", + "greenwich", + "newfoundland", + "plymouth", + "thames estuary", + "westminster" + ], + "dates": [ + "29" + ], + "document_type": "letter" + }, + "raw_samples": [ + "{\n \"people\": [\"Ralegh\", \"Keymis\"],\n \"organizations\": [],\n \"locations\": [\"Greenwich\", \"Westminster\", \"Plymouth\", \"Thames Estuary\", \"England\", \"Newfoundland\"],\n \"dates\": [\"29\"],\n \"document_type\": \"", + "{\n 
\"people\": [\"Ralegh\", \"Keymis\", \"Queen\"],\n \"locations\": [\"Keymis\", \"cabin\", \"Newfoundland\", \"Plymouth\", \"Thames estuary\", \"France\", \"Greenwich\", \"Westminster\"],\n \"document_type\": \"letter\"\n}", + "{\n \"people\": [\"Sir Walter Ralegh\", \"Keymis\"],\n \"locations\": [\"miserable Keymis\", \"cabin\", \"Newfoundland\", \"Plymouth\", \"Thames estuary\", \"Greenwich\", \"Westminster\"],\n \"document_type\": \"historical do" + ] + }, + { + "id": "claude_bbe0172d-3087-4238-a51c-7dca6c0b6f28_280", + "source": "Claude: Setting up a custom OpenClaw instance", + "content_hash": "995c8e57", + "content_length": 1454, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.6, + 3.0, + 3.5 + ], + "extraction_sample": { + "people": [ + "aaron" + ], + "organizations": [ + "aaronnelson.studio" + ], + "locations": [], + "dates": [ + "thu, 23 apr 2026" + ], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [\"aaron\"],\n \"organizations\": [\"aaronnelson.studio\"],\n \"dates\": [\"Thu, 23 Apr 2026\"],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [\"aaronnelson\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [\"aaron\"],\n \"organizations\": [\"aaronnelson.studio\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "91ccefdd_125", + "source": "Cognition in the Wild (A Bradford Book) -- Hutchins, Edwin.pdf", + "content_hash": "8a85596b", + "content_length": 2893, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 10.9, + 2.8, + 2.6 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n 
\"locations\": [],\n \"dates\": [],\n \"document_type\": \"manual\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"instruction\"\n}" + ] + }, + { + "id": "074bb5f9_85", + "source": "Mystery train _ images of America in rock 'n' roll music -- Marcus, Greil.pdf", + "content_hash": "ac830289", + "content_length": 2702, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.1, + 3.8, + 4.0 + ], + "extraction_sample": { + "people": [ + "staggerlee", + "superflake", + "taxi man" + ], + "organizations": [], + "locations": [ + "harlem" + ], + "dates": [], + "document_type": "movie_review" + }, + "raw_samples": [ + "{\n \"people\": [\"Superflake\", \"Taxi Man\", \"Staggerlee\"],\n \"organizations\": [],\n \"locations\": [\"Harlem\"],\n \"dates\": [],\n \"document_type\": \"movie_review\"\n}", + "{\n \"people\": [\"Taxi Man\", \"Staggerlee\", \"Superflake\", \"Franciosa\"],\n \"organizations\": [],\n \"locations\": [\"Harlem\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Superflake\", \"Taxi Man\", \"Staggerlee\", \"Fats Domino\"],\n \"organizations\": [],\n \"locations\": [\"Harlem\"],\n \"dates\": [],\n \"document_type\": \"movie script\"\n}" + ] + }, + { + "id": "9cf798f8_74", + "source": "Shop Class as Soulcraft An inquiry into the value of the -- Crawford, Matthew.pdf", + "content_hash": "22d20263", + "content_length": 2704, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.6, + 2.4, + 2.5 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"locations\": [],\n 
\"people\": [],\n \"organizations\": [],\n \"dates\": [],\n \"document_type\": \"story\"\n}" + ] + }, + { + "id": "a230f2c3_84", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "39832bd0", + "content_length": 2725, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.6, + 1.7, + 1.9 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [ + "germania" + ], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"locations\": [\"Germania\"],\n \"document_type\": \"article\"\n}", + "{\n \"locations\": [\"Germania\"],\n \"document_type\": \"Historical Text\"\n}", + "{\n \"locations\": [\"Germania\", \"Latinized Gaul\"],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "91ccefdd_282", + "source": "Cognition in the Wild (A Bradford Book) -- Hutchins, Edwin.pdf", + "content_hash": "a27a5292", + "content_length": 2868, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 10.3, + 2.6, + 2.6 + ], + "extraction_sample": { + "people": [ + "quartermaster chief" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"quartermaster chief\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"quartermaster chief\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"quartermaster chief\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "fc378df0_53", + "source": "ulysses.txt", + "content_hash": "62d1a05e", + "content_length": 2797, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.7, + 5.3, + 4.1 + ], + "extraction_sample": { + "people": [ + "griffith", + "henry flower", + "maud gonne" + ], + "organizations": [ + "p. 
o.", + "royal dublin fusiliers" + ], + "locations": [ + "city", + "westland row" + ], + "dates": [], + "document_type": "letter" + }, + "raw_samples": [ + "{\n \"people\": [\"Henry Flower\", \"Maud Gonne\", \"Griffith\"],\n \"organizations\": [\"P. O.\", \"Royal Dublin fusiliers\"],\n \"locations\": [\"Westland Row\", \"City\"],\n \"document_type\": \"letter\"\n}", + "{\n \"people\": [\"Henry Flower\", \"Maud Gonne\", \"old Tweedy\"],\n \"organizations\": [\"P.O.\", \"Royal Dublin fusiliers\", \"King's own\"],\n \"locations\": [\"Westland Row\", \"City\", \"O'Connell street\"],\n \"documen", + "{\n \"people\": [\"Henry Flower\", \"Maud Gonne\", \"Griffith\"],\n \"organizations\": [],\n \"locations\": [\"City\", \"Westland Row\"],\n \"dates\": [],\n \"document_type\": \"letter\"\n}" + ] + }, + { + "id": "074bb5f9_27", + "source": "Mystery train _ images of America in rock 'n' roll music -- Marcus, Greil.pdf", + "content_hash": "075f007a", + "content_length": 2752, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.2, + 4.1, + 4.2 + ], + "extraction_sample": { + "people": [ + "michael jackson", + "mick jagger", + "muddy waters", + "robert johnson" + ], + "organizations": [], + "locations": [], + "dates": [ + "thirties", + "two hundred years ago" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Robert Johnson\", \"Muddy Waters\", \"Mick Jagger\", \"Michael Jackson\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"two hundred years ago\", \"thirties\"],\n \"document_type\": \"arti", + "{\n \"people\": [\"Robert Johnson\", \"Muddy Waters\", \"Mick Jagger\", \"Michael Jackson\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"two hundred years ago\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"man\", \"Robert Johnson\", \"Son House\", \"Skip James\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"more than two hundred years ago\", \"the 
thirties\"],\n \"document_type\": \"articl" + ] + }, + { + "id": "7ec60c0a_2", + "source": "Slide Slam 2022 Bios.docx", + "content_hash": "20fbe52c", + "content_length": 1435, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.5, + 3.0, + 3.1 + ], + "extraction_sample": { + "people": [ + "alex", + "briana rascoe" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Alex\", \"Briana Rascoe\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Alex\", \"Briana Rascoe\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Alex\", \"Briana Rascoe\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"news article\"\n}" + ] + }, + { + "id": "fec527fe_1", + "source": "Lecture 2 Update.pptx", + "content_hash": "cb8c74a3", + "content_length": 2704, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 17.8, + 3.2, + 2.7 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [ + "(300, 300)", + "(640, 384)" + ], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"locations\": [\"(300, 300)\", \"(640, 384)\"],\n \"document_type\": \"text\"\n}", + "{\n \"locations\": [\"(300, 300)\", \"(640, 384)\"],\n \"document_type\": \"text\"\n}", + "{\n \"locations\": [\"(300, 300)\", \"(640, 384)\"],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "1a545c4e_389", + "source": "ulysses.txt", + "content_hash": "2aeec086", + "content_length": 2874, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 19.9, + 4.2, + 4.0 + ], + "extraction_sample": { + "people": [ + "best", + "he", + "john eglinton", + "mananaun maclir" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": 
"fiction" + }, + "raw_samples": [ + "{\n \"people\": [\"He\", \"Best\", \"John Eglinton\", \"Mananaun MacLir\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Beer\", \"Bishop\", \"He is our friend\", \"Mananan MacLir\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"He\", \"BEST\", \"John Eglinton\", \"Mananau MacLir\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"literature\"\n}" + ] + }, + { + "id": "76c315b5_1", + "source": "CAD I Syllabus-SP19_Revised.docx", + "content_hash": "71d1c925", + "content_length": 2983, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.7, + 6.6, + 9.5 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "new paltz", + "sojourner truth library" + ], + "locations": [ + "http://lib.newpaltz.edu/assistance/plag.html", + "https://www.newpaltz.edu/emergency/policy.html" + ], + "dates": [ + "april 2nd" + ], + "document_type": "academic policy document" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"New Paltz\", \"Sojourner Truth Library\"],\n \"locations\": [\"https://www.newpaltz.edu/emergency/policy.html\", \"http://lib.newpaltz.edu/assistance/plag.html\"],\n \"dat", + "{\n \"people\": [],\n \"organizations\": [\"New Paltz\", \"Sojourner Truth Library\"],\n \"locations\": [\"https://www.newpaltz.edu/emergency/policy.html\", \"http://lib.newpaltz.edu/assistance/plag.html\"],\n \"dat", + "Services Office at New Paltz as soon as possible. The office can be reached at 845-257-3106 or via email at [disability@newpaltz.edu](mailto:disability@newpaltz.edu). 
Accommodations must be approved i" + ] + }, + { + "id": "074bb5f9_442", + "source": "Mystery train _ images of America in rock 'n' roll music -- Marcus, Greil.pdf", + "content_hash": "197e4a86", + "content_length": 1303, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 17.9, + 7.9, + 6.3 + ], + "extraction_sample": { + "people": [ + "bruce springsteen", + "elvis presley", + "greil marcus", + "lyle lovett", + "patty loveless", + "randy newman" + ], + "organizations": [], + "locations": [], + "dates": [ + "16", + "20", + "232", + "3", + "307" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Elvis Presley\", \"Patty Loveless\", \"Randy Newman\", \"Lyle Lovett\", \"Greil Marcus\", \"Bruce Springsteen\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"16\", \"232\", \"307\", \"3\", \"2", + "{\n \"people\": [\"Elvis Presley\", \"Patty Loveless\", \"Randy Newman\", \"Lyle Lovett\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"20 years before the 1975\", \"16\", \"232\", \"307\", \"3 11\", \"34\"],\n ", + "{\n \"people\": [\"Elvis Presley\", \"Patty Loveless\", \"Randy Newman\", \"Lyle Lovett\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"16\", \"337\", \"232\", \"307\", \"3\", \"20 years before\", \"1975\"],\n \"d" + ] + }, + { + "id": "a230f2c3_599", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "4cf471ff", + "content_length": 2271, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 24.8, + 8.3, + 8.4 + ], + "extraction_sample": { + "people": [ + "alexander hunter", + "batty langley", + "charnock", + "e. p. 
thompson", + "roger fisher", + "sir henry wood" + ], + "organizations": [ + "house of commons committee of inquiry", + "royal society of arts" + ], + "locations": [ + "london" + ], + "dates": [ + "1728", + "1771", + "1772", + "1789", + "1800-2", + "1912", + "1975" + ], + "document_type": "book" + }, + "raw_samples": [ + "{\n \"people\": [\"Charnock\", \"Batty Langley\", \"E. P. Thompson\", \"Roger Fisher\", \"Sir Henry Wood\", \"Alexander Hunter\"],\n \"organizations\": [\"House of Commons committee of inquiry\", \"Royal Society of Arts", + "{\n \"people\": [\"Charnock\", \"Batty Langley\", \"E. P. Thompson\", \"Roger Fisher\", \"Sir Henry Wood\", \"Alexander Hunter\"],\n \"organizations\": [\"House of Commons committee of inquiry\", \"Royal Society of Arts", + "{\n \"people\": [\"Charnock\", \"Batty Langley\", \"E. P. Thompson\", \"Roger Fisher\", \"Sir Henry Wood\", \"Alexander Hunter\"],\n \"organizations\": [\"The Royal Society of Arts\", \"House of Commons committee of inq" + ] + }, + { + "id": "074bb5f9_90", + "source": "Mystery train _ images of America in rock 'n' roll music -- Marcus, Greil.pdf", + "content_hash": "7e625551", + "content_length": 2748, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 14.0, + 2.8, + 2.8 + ], + "extraction_sample": { + "people": [ + "richard avedon", + "rosie stone", + "sly" + ], + "organizations": [], + "locations": [], + "dates": [ + "1973" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Sly\", \"Richard Avedon\", \"Rosie Stone\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"1973\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Sly\", \"Richard Avedon\", \"Rosie Stone\"],\n \"dates\": [\"1973\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Sly\", \"Richard Avedon\", \"Rosie Stone\"],\n \"dates\": [\"1973\"],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "fc378df0_355", + "source": "ulysses.txt", + 
"content_hash": "5a4d6725", + "content_length": 2883, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 20.4, + 4.4, + 4.9 + ], + "extraction_sample": { + "people": [ + "bloom", + "bridie kelly", + "mrs marion", + "sweny" + ], + "organizations": [], + "locations": [ + "east", + "south" + ], + "dates": [], + "document_type": "literature" + }, + "raw_samples": [ + "{\n \"people\": [\"Mrs Marion\", \"Bloom\", \"Sweny\", \"Bridie Kelly\"],\n \"organizations\": [],\n \"locations\": [\"South\", \"East\"],\n \"dates\": [],\n \"document_type\": \"literature\"\n}", + "{\n \"people\": [\"Mrs Marion\", \"Bloom\", \"Sweny\", \"Marion\"],\n \"organizations\": [],\n \"locations\": [\"South\", \"East\", \"Hatch street\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Mrs Marion\", \"Bloom\", \"Marion\", \"Sweny\", \"Bawd\", \"Bridie Kelly\"],\n \"organizations\": [],\n \"locations\": [\"South\", \"East\"],\n \"dates\": [],\n \"document_type\": \"Fiction\"\n}" + ] + }, + { + "id": "211f2199_71", + "source": "The E-myth revisited _ why most small businesses don't work -- Michael E_ Gerber.pdf", + "content_hash": "11665e8e", + "content_length": 2809, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.3, + 2.6, + 2.9 + ], + "extraction_sample": { + "people": [ + "sarah" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "letter" + }, + "raw_samples": [ + "{\n \"people\": [\"Sarah\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"letter\"\n}", + "{\n \"people\": [\"Sarah\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"business\"\n}", + "{\n \"people\": [\"Sarah\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "3ab67806_0", + "source": "CMakeLists.txt", + "content_hash": "15a52de3", + "content_length": 1020, + 
"status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 7.9, + 0.9, + 0.9 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "cmake script" + }, + "raw_samples": [ + "{\n \"document_type\": \"cmake script\"\n}", + "{\n \"document_type\": \"cmake script\"\n}", + "{\n \"document_type\": \"cmake\"\n}" + ] + }, + { + "id": "eab75ec0_0", + "source": "CMakeLists.txt", + "content_hash": "d3d4f035", + "content_length": 1881, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 13.9, + 1.0, + 1.0 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "cmake file" + }, + "raw_samples": [ + "{\n \"document_type\": \"CMake file\"\n}", + "{\n \"document_type\": \"CMake file\"\n}", + "{\n \"document_type\": \"CMake file\"\n}" + ] + }, + { + "id": "293288df_31", + "source": "RhinoPythonPrimerRev3.pdf", + "content_hash": "6400418d", + "content_length": 2742, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.2, + 0.9, + 2.0 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [ + "(0,0,0)" + ], + "dates": [], + "document_type": "code" + }, + "raw_samples": [ + "{\n \"document_type\": \"code\",\n \"locations\": [\"(0,0,0)\"]\n}", + "{\n \"document_type\": \"code\"\n}", + "{\n \"document_type\": \"code\",\n \"locations\": [\"(0,0,0)\"]\n}" + ] + }, + { + "id": "f6a8d07a_18", + "source": "Arduino Microcontroller Basics.pdf", + "content_hash": "b4c3f332", + "content_length": 2669, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.3, + 2.6, + 2.9 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "arduino code" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": 
\"Arduino Code\"\n}", + "{\n \"locations\": [],\n \"organizations\": [\"Arduino\"],\n \"people\": [],\n \"dates\": [],\n \"document_type\": \"program\"\n}", + "{\n \"document_type\": \"programming code\",\n \"locations\": [],\n \"people\": [],\n \"organizations\": [],\n \"dates\": []\n}" + ] + }, + { + "id": "05b9d711_116", + "source": "Company of One -- Paul Jarvis.pdf", + "content_hash": "6e7a4314", + "content_length": 2945, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 12.3, + 3.1, + 2.6 + ], + "extraction_sample": { + "people": [ + "derek" + ], + "organizations": [ + "cdbaby" + ], + "locations": [], + "dates": [ + "2008" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Derek\"],\n \"organizations\": [\"CDBaby\"],\n \"locations\": [],\n \"dates\": [\"2008\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Derek\"],\n \"organizations\": [\"CDBaby\"],\n \"locations\": [],\n \"dates\": [\"2008\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Derek\"],\n \"organizations\": [\"CDBaby\"],\n \"dates\": [\"2008\"],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "a8366d89_185", + "source": "Hackers and Painters_ Big Ideas from the Computer Age -- Graham, Paul.pdf", + "content_hash": "40a71745", + "content_length": 2878, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.1, + 3.6, + 4.3 + ], + "extraction_sample": { + "people": [ + "harvard", + "princeton" + ], + "organizations": [ + "national center for education statistics" + ], + "locations": [], + "dates": [ + "1800", + "1970" + ], + "document_type": "book chapter" + }, + "raw_samples": [ + "{\n \"people\": [\"Princeton\", \"Harvard\"],\n \"organizations\": [\"National Center for Education Statistics\"],\n \"locations\": [],\n \"dates\": [\"1970\", \"1800\"],\n \"document_type\": \"book chapter\"\n}", + "{\n \"people\": [\"Prince-ton\", \"Harvard\"],\n \"organizations\": [],\n 
\"locations\": [],\n \"dates\": [\"1970\", \"1800\"],\n \"document_type\": \"book\"\n}", + "{\n \"people\": [\"Prince-ton\", \"Harvard\"],\n \"organizations\": [\"National Center for Education Statistics\"],\n \"locations\": [],\n \"dates\": [\"1800\", \"1970\"],\n \"document_type\": \"book\"\n}" + ] + }, + { + "id": "87abf773_21", + "source": "GrasshopperPrimer_V3-3_EN.pdf", + "content_hash": "beae43ed", + "content_length": 2746, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 12.1, + 3.0, + 2.2 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "grasshopper" + ], + "locations": [], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"instruction\"\n}", + "{\n \"locations\": [\"Graph Mapper\", \"Dropdown List mode\"],\n \"document_type\": \"instruction manual\"\n}" + ] + }, + { + "id": "05812e17_18", + "source": "EXPERIENCE MACHINE _ how our minds predict and shape reality -- Andy Clark.pdf", + "content_hash": "ddabef58", + "content_length": 2920, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 9.8, + 2.8, + 2.0 + ], + "extraction_sample": { + "people": [ + "andy clark" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"ANDY CLARK\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"ANDY CLARK\"],\n \"dates\": [],\n \"organizations\": [],\n \"locations\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [\"ANDY CLARK\"],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "f548556b_2", + "source": "AARON_NELSON-DIGITAL_HAPTIC_2020.pdf", + "content_hash": "aae21623", + "content_length": 2076, + "status": 
"INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 10.7, + 2.9, + 2.8 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "grasshopper", + "history of design" + ], + "locations": [], + "dates": [], + "document_type": "course_description" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\", \"History of Design\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"course_description\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Course Outline\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Course Description\"\n}" + ] + }, + { + "id": "211f2199_13", + "source": "The E-myth revisited _ why most small businesses don't work -- Michael E_ Gerber.pdf", + "content_hash": "0356a33c", + "content_length": 2754, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.4, + 1.7, + 1.6 + ], + "extraction_sample": { + "people": [ + "the fat guy", + "you" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "story" + }, + "raw_samples": [ + "{\n \"people\": [\"You\", \"The Fat Guy\"],\n \"document_type\": \"story\"\n}", + "{\n \"people\": [\"You\", \"The Fat Guy\"],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"You\", \"The Fat Guy\"],\n \"document_type\": \"story\"\n}" + ] + }, + { + "id": "cd3d1914_76", + "source": "The world beyond your head _ on becoming an individual in an -- Crawford, Matthew B.pdf", + "content_hash": "e9b45cb5", + "content_length": 2944, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.2, + 3.3, + 2.9 + ], + "extraction_sample": { + "people": [ + "nicholas schiill" + ], + "organizations": [], + "locations": [ + "las vegas" + ], + "dates": [], + "document_type": "book review" + }, + "raw_samples": [ + "{\n \"people\": 
[\"Nicholas Schiill\"],\n \"organizations\": [],\n \"locations\": [\"Las Vegas\"],\n \"dates\": [],\n \"document_type\": \"book review\"\n}", + "{\n \"people\": [\"Nick Yablonsky\", \"Schiill\"],\n \"organizations\": [],\n \"locations\": [\"Las Vegas\"],\n \"dates\": [],\n \"document_type\": \"book review\"\n}", + "{\n \"people\": [\"Schiill\"],\n \"organizations\": [],\n \"locations\": [\"Las Vegas\"],\n \"dates\": [],\n \"document_type\": \"book review\"\n}" + ] + }, + { + "id": "fc378df0_366", + "source": "ulysses.txt", + "content_hash": "293108da", + "content_length": 2798, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.6, + 4.2, + 4.1 + ], + "extraction_sample": { + "people": [ + "bloom", + "j. j. o'molloy" + ], + "organizations": [], + "locations": [ + "the land of the pharaoh" + ], + "dates": [], + "document_type": "dialogue" + }, + "raw_samples": [ + "{\n \"people\": [\"J. J. O'Molloy\", \"Bloom\"],\n \"organizations\": [],\n \"locations\": [\"the land of the Pharaoh\"],\n \"dates\": [],\n \"document_type\": \"dialogue\"\n}", + "{\n \"people\": [\"J. J. O'Molloy\", \"My client\", \"Bloom\"],\n \"organizations\": [],\n \"locations\": [\"the land of the Pharaoh\"],\n \"dates\": [],\n \"document_type\": \"legal-transcript\"\n}", + "{\n \"people\": [\"J. J. 
O'Molloy\", \"Bloom\"],\n \"organizations\": [],\n \"locations\": [\"the land of the Pharaoh\"],\n \"dates\": [],\n \"document_type\": \"legal_transcript\"\n}" + ] + }, + { + "id": "a230f2c3_488", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "4799c645", + "content_length": 2612, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.1, + 4.8, + 5.2 + ], + "extraction_sample": { + "people": [ + "john ruskin" + ], + "organizations": [], + "locations": [ + "arcadia", + "euphrates", + "london", + "reat mountains" + ], + "dates": [ + "1894", + "not specified" + ], + "document_type": "book" + }, + "raw_samples": [ + "{\n \"people\": [\"John Ruskin\"],\n \"organizations\": [],\n \"locations\": [\"Reat Mountains\", \"Euphrates\", \"Arcadia\", \"London\"],\n \"dates\": [\"Not specified\", \"1894\"],\n \"document_type\": \"Book\"\n}", + "{\n \"people\": [\"John Ruskin\", \"Georgies\"],\n \"organizations\": [],\n \"locations\": [\"Reat Mountains\", \"deep Euphrates\", \"Arcaadia\", \"London\"],\n \"dates\": [\"Not specified\", \"1894\"],\n \"document_type\": \"b", + "{\n \"people\": [\"John Ruskin\", \"\"],\n \"organizations\": [\"Modern Painters\"],\n \"locations\": [\"reat mountains\", \"deep Euphrates\", \"Arcaedia\", \"London\", \"Parliament Hill\"],\n \"dates\": [\"not specified\", \"1" + ] + } + ], + "summary": { + "total": 50, + "consistent": 9, + "inconsistent": 41, + "failed": 0, + "timeout": 0, + "consistency_rate": 18.0, + "cascade_viable": false + } +} \ No newline at end of file diff --git a/experiments/briefing_test_results.json b/experiments/briefing_test_results.json new file mode 100644 index 0000000..7a870d4 --- /dev/null +++ b/experiments/briefing_test_results.json @@ -0,0 +1,1673 @@ +{ + "meta": { + "model": "mistral", + "sample_size": 50, + "started": "2026-04-28T02:09:30.069385", + "completed": "2026-04-28T02:24:39.485763", + "total_elapsed_seconds": 909.7, + "avg_seconds_per_doc": 18.2 + }, + 
"documents": [ + { + "id": "5ee0b3bb_0", + "source": "00_Syllabus.docx", + "content_hash": "848c971c", + "content_length": 2273, + "status": "SUCCESS", + "elapsed_seconds": 35.6, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 568, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 26.0 + } + }, + { + "id": "4451e0d5_1", + "source": "01_ALL_Overview of AM and 3DP_v3.pptx", + "content_hash": "1e3ff98f", + "content_length": 2167, + "status": "SUCCESS", + "elapsed_seconds": 21.2, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 542, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.0 + } + }, + { + "id": "0619cec0_0", + "source": "01_NURBS Curves.docx", + "content_hash": "2ac1bb56", + "content_length": 1401, + "status": "SUCCESS", + "elapsed_seconds": 19.7, + "briefing": { + "document_type": "code", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + 
"likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 350, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 36.3 + } + }, + { + "id": "d0a3917e_0", + "source": "02_2D Geometry.docx", + "content_hash": "5d53f099", + "content_length": 188, + "status": "SUCCESS", + "elapsed_seconds": 10.4, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 47, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 81.0 + } + }, + { + "id": "89fed291_0", + "source": "02_Point of Curves - AARON.docx", + "content_hash": "864be8ed", + "content_length": 2116, + "status": "SUCCESS", + "elapsed_seconds": 20.0, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 529, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.4 + } + }, + { + "id": "2a15be8d_0", + "source": "02_Point of Curves.docx", + "content_hash": "4b683753", + 
"content_length": 1338, + "status": "SUCCESS", + "elapsed_seconds": 16.2, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullets", + "code_blocks" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 334, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 37.4 + } + }, + { + "id": "a2a7a8d3_2", + "source": "02_PPT_ALL_AM_Technologies_for_3DP_v3.pptx", + "content_hash": "1c260a03", + "content_length": 1675, + "status": "SUCCESS", + "elapsed_seconds": 21.7, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 419, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 32.3 + } + }, + { + "id": "2b3b1c34_0", + "source": "03_2D Transformation and Deformation.docx", + "content_hash": "306dc581", + "content_length": 418, + "status": "SUCCESS", + "elapsed_seconds": 11.2, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + 
"noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 104, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 65.7 + } + }, + { + "id": "b425985b_0", + "source": "03_2D Transformation, Deformation, and Editing-AARON.docx", + "content_hash": "9f9c422a", + "content_length": 541, + "status": "SUCCESS", + "elapsed_seconds": 12.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 135, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 59.7 + } + }, + { + "id": "1c78c79f_0", + "source": "03_Editing Geometry.docx", + "content_hash": "9491f6cc", + "content_length": 171, + "status": "SUCCESS", + "elapsed_seconds": 10.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 43, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 82.4 + } + }, + { + "id": "6453f3a8_7", + "source": "04_ALL_Materials and Their Properties_v3.pptx", + "content_hash": "0c319651", + "content_length": 380, + "status": "SUCCESS", + 
"elapsed_seconds": 12.3, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 95, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 67.8 + } + }, + { + "id": "1fd396d4_0", + "source": "04_Annotations.docx", + "content_hash": "9b3e57cd", + "content_length": 737, + "status": "SUCCESS", + "elapsed_seconds": 14.9, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 184, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 9, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 54.4 + } + }, + { + "id": "38d1cf0d_0", + "source": "05_Entering the third dimension.docx", + "content_hash": "8f56202a", + "content_length": 2175, + "status": "SUCCESS", + "elapsed_seconds": 19.9, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists", + "code_blocks" + ], + "noise_signals": [], + 
"extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 544, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 26.9 + } + }, + { + "id": "cfd1ee43_0", + "source": "05_Making things solid.docx", + "content_hash": "12634c4c", + "content_length": 692, + "status": "SUCCESS", + "elapsed_seconds": 13.4, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 173, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 53.6 + } + }, + { + "id": "31c729e4_8", + "source": "05_PPT_ALL_Machine Technology and Specifications_v3.pptx", + "content_hash": "bf8daf4b", + "content_length": 2886, + "status": "SUCCESS", + "elapsed_seconds": 20.0, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 722, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 21.7 + } + }, + { + "id": "a8d4d8a4_0", + "source": "06_3D_Editing.docx", + "content_hash": "887133dc", + "content_length": 157, + "status": "SUCCESS", + "elapsed_seconds": 9.9, + "briefing": { + 
"document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 39, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 83.6 + } + }, + { + "id": "c0d0659e_0", + "source": "06_Gumball.docx", + "content_hash": "c46dbc48", + "content_length": 1980, + "status": "SUCCESS", + "elapsed_seconds": 20.1, + "briefing": { + "document_type": "code", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 495, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 28.8 + } + }, + { + "id": "8f9d093e_4", + "source": "06_PPT_ALL_Design Considerations_From CAD to CAM_v3.pptx", + "content_hash": "0432ffd0", + "content_length": 2884, + "status": "SUCCESS", + "elapsed_seconds": 23.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 
721, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 21.7 + } + }, + { + "id": "4e7db487_0", + "source": "07_Cube Assignment_2018f.docx", + "content_hash": "af2f5bab", + "content_length": 1316, + "status": "SUCCESS", + "elapsed_seconds": 18.9, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 329, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 16, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 40.9 + } + }, + { + "id": "1a5b6da3_0", + "source": "07_Make2D.docx", + "content_hash": "d71c1df4", + "content_length": 834, + "status": "SUCCESS", + "elapsed_seconds": 14.6, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 208, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 49.0 + } + }, + { + "id": "97ba28bd_5", + "source": "07_PPT_ALL_Fabrication Considerations_v3.pptx", + "content_hash": "7ffb6f57", + "content_length": 710, + "status": "SUCCESS", + "elapsed_seconds": 14.6, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": 
"high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "tables" + ], + "noise_signals": [ + "repeated_headers" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 178, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 9, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 55.3 + } + }, + { + "id": "0892b9fa_2", + "source": "08_PPT_ALL_PostProcessing for FDM and PolyJet_v3.pptx", + "content_hash": "9fb49737", + "content_length": 2939, + "status": "SUCCESS", + "elapsed_seconds": 19.2, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 735, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 21.4 + } + }, + { + "id": "9446c72b_0", + "source": "08_Printing_Technicals.docx", + "content_hash": "365e53a6", + "content_length": 1310, + "status": "SUCCESS", + "elapsed_seconds": 18.7, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullets", + "images" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + 
"original_tokens_approx": 328, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 16, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 41.0 + } + }, + { + "id": "44b7a630_0", + "source": "09_Tolerance Test Part.docx", + "content_hash": "f6a14d20", + "content_length": 817, + "status": "SUCCESS", + "elapsed_seconds": 14.8, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 204, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 10, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 52.0 + } + }, + { + "id": "4a74de83_0", + "source": "09_Tolerance Test Part.pdf", + "content_hash": "7f3106a9", + "content_length": 1049, + "status": "SUCCESS", + "elapsed_seconds": 15.6, + "briefing": { + "document_type": "notes", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 262, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 43.3 + } + }, + { + "id": "aa807935_0", + "source": "10 Good Things about Aaron for DSI.docx", + "content_hash": "ff5081e9", + "content_length": 1126, + "status": "SUCCESS", + "elapsed_seconds": 17.7, + "briefing": { + "document_type": "unknown", + "primary_language": "en", + 
"density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists" + ], + "noise_signals": [ + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 282, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 28, + "noise_reduction_pct": 10.0, + "total_reduction_pct": 47.4 + } + }, + { + "id": "90248749_1", + "source": "10_Moving Parts.docx", + "content_hash": "3c2e28b9", + "content_length": 218, + "status": "SUCCESS", + "elapsed_seconds": 10.2, + "briefing": { + "document_type": "code", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "no structure signals" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 54, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 78.6 + } + }, + { + "id": "958e5aac_0", + "source": "1119345.pdf", + "content_hash": "af7da5db", + "content_length": 1745, + "status": "SUCCESS", + "elapsed_seconds": 30.4, + "briefing": { + "document_type": "unknown", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists", + "numbered_lists", + "tables" + ], + "noise_signals": [ + "repeated_headers", + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + 
"token_reduction_estimate": { + "original_tokens_approx": 436, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 65, + "noise_reduction_pct": 15.0, + "total_reduction_pct": 41.7 + } + }, + { + "id": "cbb41390_1", + "source": "2016 - DDF 205 - CAD I Syllabus.pdf", + "content_hash": "466b3184", + "content_length": 2161, + "status": "SUCCESS", + "elapsed_seconds": 20.2, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [ + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 540, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 54, + "noise_reduction_pct": 10.0, + "total_reduction_pct": 34.3 + } + }, + { + "id": "86543785_1", + "source": "2017 - DDF 220 - Intro to Computational Media Syllabus.pdf", + "content_hash": "3ad85610", + "content_length": 2224, + "status": "SUCCESS", + "elapsed_seconds": 20.0, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 556, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 26.5 + } + }, + { + "id": "fc7941cf_1", + "source": "2018 - DDF 205 - CAD I Syllabus.pdf", + "content_hash": "2632e62b", + "content_length": 2618, + "status": "SUCCESS", + "elapsed_seconds": 20.8, 
+ "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [ + "repeated_headers", + "page_numbers", + "formatting_artifacts", + "boilerplate" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 654, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 131, + "noise_reduction_pct": 20.0, + "total_reduction_pct": 38.7 + } + }, + { + "id": "56b61c68_3", + "source": "2019-2020 Research and Creative Projects Awards Guidelines.FINAL.pdf", + "content_hash": "1a4e890b", + "content_length": 2228, + "status": "SUCCESS", + "elapsed_seconds": 20.6, + "briefing": { + "document_type": "form", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 557, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 28, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 30.1 + } + }, + { + "id": "9ed5c43e_2", + "source": "2019 - DDF 305 - Materials Syllabus.pdf", + "content_hash": "c0521ba2", + "content_length": 1842, + "status": "SUCCESS", + "elapsed_seconds": 22.0, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": 
true, + "structure_signals": [ + "headings", + "bullet_lists", + "numbered_lists", + "tables" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 460, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 30.3 + } + }, + { + "id": "2f02bfa5_2", + "source": "2020 - DDF 220 - Intro to Computational Media Syllabus.pdf", + "content_hash": "fe3ca5be", + "content_length": 2580, + "status": "SUCCESS", + "elapsed_seconds": 20.5, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists", + "tables" + ], + "noise_signals": [ + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 645, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 64, + "noise_reduction_pct": 10.0, + "total_reduction_pct": 31.3 + } + }, + { + "id": "c0cd3599_3", + "source": "2021 - DDF 320 - Design Intents Syllabus.pdf", + "content_hash": "588d34a3", + "content_length": 1560, + "status": "SUCCESS", + "elapsed_seconds": 23.8, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists", + "numbered_lists" + ], + "noise_signals": [ + "repeated_headers", + "page_numbers" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 390, + "orientation_tokens_saved": 200, + 
"noise_tokens_saved": 39, + "noise_reduction_pct": 10.0, + "total_reduction_pct": 40.5 + } + }, + { + "id": "9ea5656f_2", + "source": "2023 Faculty Report Aaron Nelson.docx", + "content_hash": "fd68d021", + "content_length": 2698, + "status": "SUCCESS", + "elapsed_seconds": 19.6, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 674, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 22.9 + } + }, + { + "id": "33aae3e5_2", + "source": "2023 Faculty Report Template.docx", + "content_hash": "c2d50031", + "content_length": 2100, + "status": "SUCCESS", + "elapsed_seconds": 18.3, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 525, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.6 + } + }, + { + "id": "bf155f9f_0", + "source": "2026-04-26-22-44-voice.md", + "content_hash": "41cc3d28", + "content_length": 165, + "status": "SUCCESS", + "elapsed_seconds": 10.9, + "briefing": { + "document_type": "unknown", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": 
true, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [ + "**type:**", + "**modality:**", + "**status:**" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 41, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 6, + "noise_reduction_pct": 15.0, + "total_reduction_pct": 85.5 + } + }, + { + "id": "5c9f5ad5_0", + "source": "2026-04-26-22-52-voice.md", + "content_hash": "0ed1efba", + "content_length": 171, + "status": "SUCCESS", + "elapsed_seconds": 10.8, + "briefing": { + "document_type": "chat_log", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists" + ], + "noise_signals": [ + "repeated_headers", + "page_numbers", + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 43, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 6, + "noise_reduction_pct": 15.0, + "total_reduction_pct": 85.0 + } + }, + { + "id": "8bc956ad_0", + "source": "2026-04-26-23-04-voice.md", + "content_hash": "c455ef44", + "content_length": 931, + "status": "SUCCESS", + "elapsed_seconds": 16.0, + "briefing": { + "document_type": "chat_log", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 233, + 
"orientation_tokens_saved": 200, + "noise_tokens_saved": 12, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 48.9 + } + }, + { + "id": "af176130_0", + "source": "2026-04-26-lucid-1.md", + "content_hash": "d9c51a1c", + "content_length": 2444, + "status": "SUCCESS", + "elapsed_seconds": 20.2, + "briefing": { + "document_type": "chat_log", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bulleted_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 611, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 24.7 + } + }, + { + "id": "52114711_0", + "source": "2026-04-26-lucid.md", + "content_hash": "4c5fb648", + "content_length": 2437, + "status": "SUCCESS", + "elapsed_seconds": 18.4, + "briefing": { + "document_type": "chat_log", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 609, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 24.7 + } + }, + { + "id": "1bf832a0_0", + "source": "2026-04-26-nrem-1.md", + "content_hash": "1ad1e9c1", + "content_length": 1586, + "status": "SUCCESS", + "elapsed_seconds": 21.4, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + 
"has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [ + "repeated_headers" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 396, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 20, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 36.9 + } + }, + { + "id": "a16d6571_0", + "source": "2026-04-26-nrem.md", + "content_hash": "1714ccc0", + "content_length": 1638, + "status": "SUCCESS", + "elapsed_seconds": 19.4, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 410, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 32.8 + } + }, + { + "id": "b696802f_0", + "source": "2026-04-27-04-34-image.md", + "content_hash": "3cce200d", + "content_length": 2027, + "status": "SUCCESS", + "elapsed_seconds": 20.3, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 507, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 28.3 + } + }, + { + 
"id": "6bc36d6f_0", + "source": "2026-04-27-04-36-image.md", + "content_hash": "29717d0c", + "content_length": 1755, + "status": "SUCCESS", + "elapsed_seconds": 20.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 439, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 31.3 + } + }, + { + "id": "8b7ed0da_0", + "source": "2026-04-27-04-41-image.md", + "content_hash": "47a1f451", + "content_length": 2148, + "status": "SUCCESS", + "elapsed_seconds": 20.2, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "visual description", + "content" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 537, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.1 + } + }, + { + "id": "700d4582_0", + "source": "2026-04-27-06-21-image.md", + "content_hash": "b143e6fc", + "content_length": 1643, + "status": "SUCCESS", + "elapsed_seconds": 19.7, + "briefing": { + "document_type": "notes", + "primary_language": "en", + "density": "low", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + 
"likely_has_named_entities": true, + "structure_signals": [], + "noise_signals": [ + "page_numbers" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 411, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 21, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 36.1 + } + }, + { + "id": "31317444_0", + "source": "2026-04-27-19-04-image.md", + "content_hash": "8bd62d02", + "content_length": 1767, + "status": "SUCCESS", + "elapsed_seconds": 19.3, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 442, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 31.2 + } + }, + { + "id": "bc4bffcd_0", + "source": "2026-04-27-20-18-image.md", + "content_hash": "c33f8f22", + "content_length": 1856, + "status": "SUCCESS", + "elapsed_seconds": 20.3, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 464, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 23, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 33.6 + } + } + ], + "summary": { + "total": 50, + "success": 50, + "failed": 0, + 
"success_rate": 100.0, + "extraction_priority_breakdown": { + "full": 39, + "partial": 11, + "skip": 0 + }, + "avg_token_reduction_pct": 42.3, + "total_elapsed_seconds": 909.7, + "avg_seconds_per_doc": 18.2, + "projected_50_doc_minutes": 15.2, + "approach_viable": true + } +} \ No newline at end of file diff --git a/experiments/consistency_test_results.json b/experiments/consistency_test_results.json new file mode 100644 index 0000000..88d8e06 --- /dev/null +++ b/experiments/consistency_test_results.json @@ -0,0 +1,1539 @@ +{ + "meta": { + "model": "mistral", + "passes": 3, + "sample_size": 50, + "started": "2026-04-28T01:39:19.750205", + "completed": "2026-04-28T01:56:58.176009" + }, + "documents": [ + { + "id": "claude_bbe0172d-3087-4238-a51c-7dca6c0b6f28_1326", + "source": "Claude: Setting up a custom OpenClaw instance", + "content_hash": "0ff1319e", + "content_length": 700, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.6, + 2.6, + 2.5 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [ + "30 days" + ], + "document_type": "conversation" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"30 days\"],\n \"document_type\": \"conversation\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"30 days\"],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"30 days\"],\n \"document_type\": \"conversation\"\n}" + ] + }, + { + "id": "1a545c4e_432", + "source": "ulysses.txt", + "content_hash": "5d6d9cf1", + "content_length": 2807, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 18.1, + 3.6, + 5.2 + ], + "extraction_sample": { + "people": [ + "bloom", + "corny kelleher", + "first watch", + "second watch" + ], + "organizations": [ + "behan's", + "jammet's" + ], + "locations": [ + "gardiner street", + 
"nighttown", + "scaffolding" + ], + "dates": [], + "document_type": "literature" + }, + "raw_samples": [ + "{\n \"people\": [\"Bloom\", \"First Watch\", \"Second Watch\", \"Corny Kelleher\"],\n \"organizations\": [\"Jammet's\", \"Behan's\"],\n \"locations\": [\"Nighttown\", \"Gardiner street\", \"scaffolding\"],\n \"document_type\":", + "{\n \"people\": [\"BLOOM\", \"CORNY KELLEHER\", \"Father\"],\n \"locations\": [\"nighttown\", \"Gardiner street\"],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Bloom\", \"Corny Kelleher\", \"First Watch\", \"Second Watch\", \"Father\"],\n \"organizations\": [],\n \"locations\": [\"Gardiner street\", \"nighttown\", \"Jammet's\"],\n \"dates\": [],\n \"document_type\"" + ] + }, + { + "id": "claude_bbe0172d-3087-4238-a51c-7dca6c0b6f28_430", + "source": "Claude: Setting up a custom OpenClaw instance", + "content_hash": "37dc9e1c", + "content_length": 2937, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 19.0, + 2.2, + 2.2 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "aaronnelson.studio", + "nextcloud" + ], + "locations": [ + "nextcloud.aaronnelson.studio" + ], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Nextcloud\", \"aaronnelson.studio\"],\n \"locations\": [\"nextcloud.aaronnelson.studio\"],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"locations\": [\"nextcloud.aaronnelson.studio\"],\n \"document_type\": \"text\"\n}", + "{\n \"locations\": [\"nextcloud.aaronnelson.studio\"],\n \"document_type\": \"plain text\"\n}" + ] + }, + { + "id": "0e18a972_12", + "source": "RH50-TM-L1-EN-20140902.pdf", + "content_hash": "dd1e2907", + "content_length": 2709, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.8, + 2.7, + 3.2 + ], + "extraction_sample": { + "people": [ + "robert mcneel" + ], + "organizations": [ + "robert mcneel & associates" + ], + "locations": [], + 
"dates": [], + "document_type": "" + }, + "raw_samples": [ + "{\n \"people\": [\"Robert McNeel\"],\n \"organizations\": [\"Robert McNeel & Associates\"]\n}", + "{\n \"people\": [\"Robert McNeel\"],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"document_type\": \"instructions\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Instruction manual\"\n}" + ] + }, + { + "id": "1a545c4e_197", + "source": "ulysses.txt", + "content_hash": "d911e3fd", + "content_length": 2741, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 16.4, + 5.9, + 5.8 + ], + "extraction_sample": { + "people": [ + "girlgold", + "hugh machugh", + "kennedy", + "lenehan", + "miss douce", + "mr dedalus", + "stephen" + ], + "organizations": [ + "mooney's en ville", + "mooney's sur mer" + ], + "locations": [], + "dates": [], + "document_type": "fiction" + }, + "raw_samples": [ + "{\n \"people\": [\"Kennedy\", \"girlgold\", \"Miss Douce\", \"Stephen\", \"Lenehan\", \"Mr Dedalus\", \"Hugh MacHugh\"],\n \"organizations\": [\"Mooney's EN VILLE\", \"Mooney's SUR MER\"],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Kennedy\", \"Miss Douce\", \"Lenehan\", \"Stephen\", \"Mr Dedalus\", \"Hugh MacHugh\"],\n \"organizations\": [\"Mooney's EN VILLE\", \"Mooney's SUR MER\"],\n \"locations\": [],\n \"dates\": [],\n \"document", + "{\n \"people\": [\"Kennedy\", \"Mr Dedalus\", \"Lenehan\", \"Stephen\", \"Hugh MacHugh\"],\n \"organizations\": [],\n \"locations\": [\"Mooney's EN VILLE\", \"Mooney's SUR MER\"],\n \"dates\": [],\n \"document_type\": \"ficti" + ] + }, + { + "id": "88602096_2", + "source": "Kingston Grads Ind Study SP2023 Syllabus.pdf", + "content_hash": "43b2ec27", + "content_length": 2525, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.7, + 7.2, + 7.0 + ], + "extraction_sample": { + "people": [ + "members", + "students" + 
], + "organizations": [ + "dean of your school", + "department chair", + "office of human resources, diversity & inclusion (hrdi)", + "suny new paltz", + "title ix office" + ], + "locations": [ + "campus buildings", + "new paltz", + "student union, room 100 south" + ], + "dates": [], + "document_type": "policy statement" + }, + "raw_samples": [ + "{\n \"people\": [\"members\", \"students\"],\n \"organizations\": [\"SUNY New Paltz\", \"Title IX Office\", \"department chair\", \"dean of your school\", \"Office of Human Resources, Diversity & Inclusion (HRDI)\"],\n ", + "{\n \"people\": [\"members\", \"students\"],\n \"organizations\": [\"SUNY New Paltz\", \"Title IX Office\", \"department chair and/or the dean of your school\", \"Office of Human Resources, Diversity & Inclusion (HR", + "{\n \"people\": [\"members\", \"students\"],\n \"organizations\": [\"SUNY New Paltz\", \"Title IX Office\", \"department chair\", \"dean of your school\", \"Office of Human Resources, Diversity & Inclusion (HRDI)\"],\n " + ] + }, + { + "id": "7785818c_1", + "source": "The Extended Mind _ The Power of Thinking Outside the Brain -- Annie Murphy Paul.pdf", + "content_hash": "48faec1d", + "content_length": 2891, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 12.5, + 3.4, + 3.3 + ], + "extraction_sample": { + "people": [ + "fr\u00e9d\u00e9ric gros", + "nietzsche" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Nietzsche\", \"Fr\u00e9d\u00e9ric Gros\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Nietzsche\", \"Fr\u00e9d\u00e9ric Gros\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Nietzsche\", \"Fr\u00e9d\u00e9ric Gros\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": 
\"article\"\n}" + ] + }, + { + "id": "1a545c4e_3", + "source": "ulysses.txt", + "content_hash": "73ad0e8d", + "content_length": 2733, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.8, + 3.8, + 3.7 + ], + "extraction_sample": { + "people": [ + "buck mulligan", + "stephen", + "ursula" + ], + "organizations": [], + "locations": [ + "irish art", + "island" + ], + "dates": [], + "document_type": "fiction" + }, + "raw_samples": [ + "{\n \"people\": [\"Stephen\", \"Buck Mulligan\", \"Ursula\"],\n \"organizations\": [],\n \"locations\": [\"Irish art\", \"Island\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Stephen\", \"Buck Mulligan\", \"Ursula\"],\n \"organizations\": [],\n \"locations\": [\"Irish art\", \"the island\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Stephen\", \"Buck Mulligan\", \"Ursula\"],\n \"organizations\": [],\n \"locations\": [\"Irish island\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}" + ] + }, + { + "id": "1a545c4e_46", + "source": "ulysses.txt", + "content_hash": "f42b21d0", + "content_length": 2833, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.3, + 3.5, + 3.4 + ], + "extraction_sample": { + "people": [ + "eleven", + "she" + ], + "organizations": [], + "locations": [ + "dolphin's barn" + ], + "dates": [], + "document_type": "book title (assumed from 'ruby: the pride of the ring')" + }, + "raw_samples": [ + "{\n \"people\": [\"Eleven\", \"She\"],\n \"organizations\": [],\n \"locations\": [\"Dolphin's Barn\"],\n \"dates\": [],\n \"document_type\": \"Book Title (assumed from 'RUBY: THE PRIDE OF THE RING')\"\n}", + "{\n \"people\": [\"Eleven\", \"She\"],\n \"organizations\": [],\n \"locations\": [\"Dolphin's Barn\"],\n \"dates\": [],\n \"document_type\": \"Book\"\n}", + "{\n \"people\": [\"Eleven\", \"She\"],\n \"organizations\": [],\n \"locations\": [\"Dolphin's Barn\"],\n \"dates\": [],\n 
\"document_type\": \"book\"\n}" + ] + }, + { + "id": "9d366a6a_2", + "source": "Circuit Intro.pptx", + "content_hash": "5467e94e", + "content_length": 2946, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 11.7, + 2.4, + 2.5 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "49e3545d_33", + "source": "RH50-TM-L1-EN-20140902.pdf", + "content_hash": "6e608b0e", + "content_length": 2928, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 12.0, + 3.2, + 3.3 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "robert mcneel & associates" + ], + "locations": [], + "dates": [], + "document_type": "model" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"model\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Cplanes.3dm\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Robert McNeel & Associates\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Cplanes.3dm\"\n}" + ] + }, + { + "id": "fc378df0_203", + "source": "ulysses.txt", + "content_hash": "eb17907a", + "content_length": 2945, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.6, + 4.7, + 4.9 + ], + "extraction_sample": { + "people": [ + "ben dollard", + "george lidwell", + "lydia douce" + ], + "organizations": [ + 
"the independent" + ], + "locations": [], + "dates": [], + "document_type": "fiction" + }, + "raw_samples": [ + "{\n \"people\": [\"Ben Dollard\", \"Lydia Douce\", \"George Lidwell\"],\n \"organizations\": [\"The INDEPENDENT\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Fiction\"\n}", + "{\n \"people\": [\"Bloom\", \"Molly\", \"Ben Dollard\", \"Miss Douce\", \"Lydia Douce\", \"George Lidwell\"],\n \"organizations\": [\"INDEPENDENT\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Bloom\", \"Molly\", \"Ben Dollard\", \"Miss Douce\", \"Lydia Douce\", \"George Lidwell\"],\n \"organizations\": [\"INDEPENDENT\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"newspaper arti" + ] + }, + { + "id": "1a545c4e_12", + "source": "ulysses.txt", + "content_hash": "2d5907e9", + "content_length": 2800, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.3, + 3.5, + 4.3 + ], + "extraction_sample": { + "people": [ + "buck mulligan", + "haines", + "stephen" + ], + "organizations": [], + "locations": [ + "bay", + "elsinore", + "sea" + ], + "dates": [], + "document_type": "literature" + }, + "raw_samples": [ + "{\n \"people\": [\"Stephen\", \"Haines\", \"Buck Mulligan\"],\n \"locations\": [\"Elsinore\", \"sea\", \"bay\"],\n \"document_type\": \"literature\"\n}", + "{\n \"people\": [\"Stephen\", \"Haines\", \"Buck Mulligan\"],\n \"organizations\": [],\n \"locations\": [\"Elsinore\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Stephen\", \"Haines\", \"Buck Mulligan\"],\n \"organizations\": [],\n \"locations\": [\"Elsinore\", \"Sea\", \"Bay\", \"Muglins\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}" + ] + }, + { + "id": "e6101efc_1", + "source": "3D Printing \u2013 Designing for Additive2.pptx", + "content_hash": "7bb3f96b", + "content_length": 639, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 11.1, + 
2.9, + 2.3 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "hvamc" + ], + "locations": [], + "dates": [ + "2017" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"document_type\": \"article\",\n \"locations\": [],\n \"people\": [],\n \"organizations\": [\"HVAMC\"],\n \"dates\": [\"2017\"]\n}", + "{\n \"document_type\": \"article\",\n \"organizations\": [\"HVAMC\"],\n \"locations\": [],\n \"dates\": [\"2017\"],\n \"people\": []\n}", + "{\n \"organizations\": [\"HVAMC\"],\n \"dates\": [\"2017\"],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "fc378df0_79", + "source": "ulysses.txt", + "content_hash": "f9333d86", + "content_length": 2691, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.3, + 4.5, + 3.5 + ], + "extraction_sample": { + "people": [ + "dedalus", + "father coffey" + ], + "organizations": [], + "locations": [ + "saint werburgh's" + ], + "dates": [], + "document_type": "novel" + }, + "raw_samples": [ + "{\n \"people\": [\"Father Coffey\", \"Dedalus\"],\n \"organizations\": [],\n \"locations\": [\"Saint Werburgh's\"],\n \"dates\": [],\n \"document_type\": \"novel\"\n}", + "{\n \"people\": [\"Father Coffey\", \"Dedalus\", \"Molly\", \"Mervyn Browne\"],\n \"organizations\": [],\n \"locations\": [\"Saint Werburgh's\"],\n \"dates\": [],\n \"document_type\": \"novel\"\n}", + "{\n \"people\": [\"Father Coffey\", \"Dedalus\"],\n \"organizations\": [],\n \"locations\": [\"Saint Werburgh's\"],\n \"dates\": [],\n \"document_type\": \"novel\"\n}" + ] + }, + { + "id": "25df23c5_0", + "source": "MIC Work Instructions.docx", + "content_hash": "542035c0", + "content_length": 2192, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.3, + 2.9, + 3.3 + ], + "extraction_sample": { + "people": [ + "users" + ], + "organizations": [ + "mic", + "newpaltz" + ], + "locations": [ + "print center" + ], + "dates": [], + "document_type": "email" + }, + "raw_samples": [ + "{\n 
\"people\": [\"users\"],\n \"organizations\": [\"Newpaltz\", \"MIC\"],\n \"locations\": [\"print center\"],\n \"document_type\": \"email\"\n}", + "{\n \"people\": [\"users\"],\n \"organizations\": [\"Newpaltz.edu\", \"MIC\"],\n \"locations\": [\"print center\"],\n \"document_type\": \"email\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Newpaltz\", \"MIC\"],\n \"locations\": [\"print center\"],\n \"dates\": [],\n \"document_type\": \"email\"\n}" + ] + }, + { + "id": "7369fb88_0", + "source": "Selected FDM Materials - EN Chemical Compatibility (2).pdf", + "content_hash": "9577291b", + "content_length": 2976, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 13.5, + 3.0, + 2.9 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "stratasys" + ], + "locations": [], + "dates": [], + "document_type": "data sheet" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Stratasys\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Data Sheet\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Stratasys\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Data Sheet\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Stratasys\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Data Sheet\"\n}" + ] + }, + { + "id": "a230f2c3_593", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "b9bb7299", + "content_length": 1373, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 23.8, + 7.0, + 6.1 + ], + "extraction_sample": { + "people": [ + "carlo ginzburg", + "john hutchin", + "leo wieseltier" + ], + "organizations": [], + "locations": [ + "greenwood" + ], + "dates": [ + "1774", + "1808", + "1990", + "aug. 
5, 1991" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Carlo Ginzburg\", \"John Hutchin\", \"Leo Wieseltier\"],\n \"organizations\": [],\n \"locations\": [\"Greenwood\"],\n \"dates\": [\"1990\", \"Aug. 5, 1991\", \"1774\", \"1808\"],\n \"document_type\": \"articl", + "{\n \"people\": [\"Cambell\", \"American Scholar\", \"Carlo Ginzburg\", \"Leo Weiseltier\"],\n \"organizations\": [],\n \"locations\": [\"Greenwood\"],\n \"dates\": [\"1990\", \"Aug. 5, 1991\", \"1774\", \"1808\"],\n \"document", + "{\n \"people\": [\"Carlo Ginzburg\", \"John Hutchin\", \"Leon Weiseltier\"],\n \"organizations\": [],\n \"locations\": [\"Greenwood\", \"Dorset\", \"London\"],\n \"dates\": [\"1990\", \"1774\", \"1808\"],\n \"document_type\": \"a" + ] + }, + { + "id": "91ccefdd_35", + "source": "Cognition in the Wild (A Bradford Book) -- Hutchins, Edwin.pdf", + "content_hash": "02c3beb2", + "content_length": 2722, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.5, + 3.0, + 3.0 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [ + "region around the ship", + "shore" + ], + "dates": [], + "document_type": "book" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [\"region around the ship\", \"shore\"],\n \"dates\": [],\n \"document_type\": \"book\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [\"region around the ship\", \"ship\"],\n \"dates\": [],\n \"document_type\": \"book\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [\"region around the ship\", \"shore\"],\n \"dates\": [],\n \"document_type\": \"book\"\n}" + ] + }, + { + "id": "a230f2c3_310", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "310615d5", + "content_length": 2808, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.6, + 4.8, + 4.8 + ], + "extraction_sample": { + "people": [ + 
"keymis", + "ralegh" + ], + "organizations": [], + "locations": [ + "england", + "greenwich", + "newfoundland", + "plymouth", + "thames estuary", + "westminster" + ], + "dates": [ + "29" + ], + "document_type": "letter" + }, + "raw_samples": [ + "{\n \"people\": [\"Ralegh\", \"Keymis\"],\n \"organizations\": [],\n \"locations\": [\"Greenwich\", \"Westminster\", \"Plymouth\", \"Thames Estuary\", \"England\", \"Newfoundland\"],\n \"dates\": [\"29\"],\n \"document_type\": \"", + "{\n \"people\": [\"Ralegh\", \"Keymis\", \"Queen\"],\n \"locations\": [\"Keymis\", \"cabin\", \"Newfoundland\", \"Plymouth\", \"Thames estuary\", \"France\", \"Greenwich\", \"Westminster\"],\n \"document_type\": \"letter\"\n}", + "{\n \"people\": [\"Sir Walter Ralegh\", \"Keymis\"],\n \"locations\": [\"miserable Keymis\", \"cabin\", \"Newfoundland\", \"Plymouth\", \"Thames estuary\", \"Greenwich\", \"Westminster\"],\n \"document_type\": \"historical do" + ] + }, + { + "id": "claude_bbe0172d-3087-4238-a51c-7dca6c0b6f28_280", + "source": "Claude: Setting up a custom OpenClaw instance", + "content_hash": "995c8e57", + "content_length": 1454, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.6, + 3.0, + 3.5 + ], + "extraction_sample": { + "people": [ + "aaron" + ], + "organizations": [ + "aaronnelson.studio" + ], + "locations": [], + "dates": [ + "thu, 23 apr 2026" + ], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [\"aaron\"],\n \"organizations\": [\"aaronnelson.studio\"],\n \"dates\": [\"Thu, 23 Apr 2026\"],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [\"aaronnelson\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [\"aaron\"],\n \"organizations\": [\"aaronnelson.studio\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "91ccefdd_125", + "source": "Cognition in the Wild (A Bradford Book) -- Hutchins, Edwin.pdf", + 
"content_hash": "8a85596b", + "content_length": 2893, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 10.9, + 2.8, + 2.6 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"manual\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"instruction\"\n}" + ] + }, + { + "id": "074bb5f9_85", + "source": "Mystery train _ images of America in rock 'n' roll music -- Marcus, Greil.pdf", + "content_hash": "ac830289", + "content_length": 2702, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.1, + 3.8, + 4.0 + ], + "extraction_sample": { + "people": [ + "staggerlee", + "superflake", + "taxi man" + ], + "organizations": [], + "locations": [ + "harlem" + ], + "dates": [], + "document_type": "movie_review" + }, + "raw_samples": [ + "{\n \"people\": [\"Superflake\", \"Taxi Man\", \"Staggerlee\"],\n \"organizations\": [],\n \"locations\": [\"Harlem\"],\n \"dates\": [],\n \"document_type\": \"movie_review\"\n}", + "{\n \"people\": [\"Taxi Man\", \"Staggerlee\", \"Superflake\", \"Franciosa\"],\n \"organizations\": [],\n \"locations\": [\"Harlem\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Superflake\", \"Taxi Man\", \"Staggerlee\", \"Fats Domino\"],\n \"organizations\": [],\n \"locations\": [\"Harlem\"],\n \"dates\": [],\n \"document_type\": \"movie script\"\n}" + ] + }, + { + "id": "9cf798f8_74", + "source": "Shop Class as Soulcraft An inquiry into the value of the -- Crawford, Matthew.pdf", + "content_hash": "22d20263", + "content_length": 2704, + "status": "INCONSISTENT", + "consistent": false, + 
"pass_times_seconds": [ + 11.6, + 2.4, + 2.5 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"locations\": [],\n \"people\": [],\n \"organizations\": [],\n \"dates\": [],\n \"document_type\": \"story\"\n}" + ] + }, + { + "id": "a230f2c3_84", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "39832bd0", + "content_length": 2725, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.6, + 1.7, + 1.9 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [ + "germania" + ], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"locations\": [\"Germania\"],\n \"document_type\": \"article\"\n}", + "{\n \"locations\": [\"Germania\"],\n \"document_type\": \"Historical Text\"\n}", + "{\n \"locations\": [\"Germania\", \"Latinized Gaul\"],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "91ccefdd_282", + "source": "Cognition in the Wild (A Bradford Book) -- Hutchins, Edwin.pdf", + "content_hash": "a27a5292", + "content_length": 2868, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 10.3, + 2.6, + 2.6 + ], + "extraction_sample": { + "people": [ + "quartermaster chief" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"quartermaster chief\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"quartermaster chief\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n 
\"people\": [\"quartermaster chief\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "fc378df0_53", + "source": "ulysses.txt", + "content_hash": "62d1a05e", + "content_length": 2797, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.7, + 5.3, + 4.1 + ], + "extraction_sample": { + "people": [ + "griffith", + "henry flower", + "maud gonne" + ], + "organizations": [ + "p. o.", + "royal dublin fusiliers" + ], + "locations": [ + "city", + "westland row" + ], + "dates": [], + "document_type": "letter" + }, + "raw_samples": [ + "{\n \"people\": [\"Henry Flower\", \"Maud Gonne\", \"Griffith\"],\n \"organizations\": [\"P. O.\", \"Royal Dublin fusiliers\"],\n \"locations\": [\"Westland Row\", \"City\"],\n \"document_type\": \"letter\"\n}", + "{\n \"people\": [\"Henry Flower\", \"Maud Gonne\", \"old Tweedy\"],\n \"organizations\": [\"P.O.\", \"Royal Dublin fusiliers\", \"King's own\"],\n \"locations\": [\"Westland Row\", \"City\", \"O'Connell street\"],\n \"documen", + "{\n \"people\": [\"Henry Flower\", \"Maud Gonne\", \"Griffith\"],\n \"organizations\": [],\n \"locations\": [\"City\", \"Westland Row\"],\n \"dates\": [],\n \"document_type\": \"letter\"\n}" + ] + }, + { + "id": "074bb5f9_27", + "source": "Mystery train _ images of America in rock 'n' roll music -- Marcus, Greil.pdf", + "content_hash": "075f007a", + "content_length": 2752, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.2, + 4.1, + 4.2 + ], + "extraction_sample": { + "people": [ + "michael jackson", + "mick jagger", + "muddy waters", + "robert johnson" + ], + "organizations": [], + "locations": [], + "dates": [ + "thirties", + "two hundred years ago" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Robert Johnson\", \"Muddy Waters\", \"Mick Jagger\", \"Michael Jackson\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"two 
hundred years ago\", \"thirties\"],\n \"document_type\": \"arti", + "{\n \"people\": [\"Robert Johnson\", \"Muddy Waters\", \"Mick Jagger\", \"Michael Jackson\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"two hundred years ago\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"man\", \"Robert Johnson\", \"Son House\", \"Skip James\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"more than two hundred years ago\", \"the thirties\"],\n \"document_type\": \"articl" + ] + }, + { + "id": "7ec60c0a_2", + "source": "Slide Slam 2022 Bios.docx", + "content_hash": "20fbe52c", + "content_length": 1435, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.5, + 3.0, + 3.1 + ], + "extraction_sample": { + "people": [ + "alex", + "briana rascoe" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Alex\", \"Briana Rascoe\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Alex\", \"Briana Rascoe\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Alex\", \"Briana Rascoe\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"news article\"\n}" + ] + }, + { + "id": "fec527fe_1", + "source": "Lecture 2 Update.pptx", + "content_hash": "cb8c74a3", + "content_length": 2704, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 17.8, + 3.2, + 2.7 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [ + "(300, 300)", + "(640, 384)" + ], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"locations\": [\"(300, 300)\", \"(640, 384)\"],\n \"document_type\": \"text\"\n}", + "{\n \"locations\": [\"(300, 300)\", \"(640, 384)\"],\n \"document_type\": \"text\"\n}", + "{\n 
\"locations\": [\"(300, 300)\", \"(640, 384)\"],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "1a545c4e_389", + "source": "ulysses.txt", + "content_hash": "2aeec086", + "content_length": 2874, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 19.9, + 4.2, + 4.0 + ], + "extraction_sample": { + "people": [ + "best", + "he", + "john eglinton", + "mananaun maclir" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "fiction" + }, + "raw_samples": [ + "{\n \"people\": [\"He\", \"Best\", \"John Eglinton\", \"Mananaun MacLir\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Beer\", \"Bishop\", \"He is our friend\", \"Mananan MacLir\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"He\", \"BEST\", \"John Eglinton\", \"Mananau MacLir\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"literature\"\n}" + ] + }, + { + "id": "76c315b5_1", + "source": "CAD I Syllabus-SP19_Revised.docx", + "content_hash": "71d1c925", + "content_length": 2983, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 14.7, + 6.6, + 9.5 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "new paltz", + "sojourner truth library" + ], + "locations": [ + "http://lib.newpaltz.edu/assistance/plag.html", + "https://www.newpaltz.edu/emergency/policy.html" + ], + "dates": [ + "april 2nd" + ], + "document_type": "academic policy document" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"New Paltz\", \"Sojourner Truth Library\"],\n \"locations\": [\"https://www.newpaltz.edu/emergency/policy.html\", \"http://lib.newpaltz.edu/assistance/plag.html\"],\n \"dat", + "{\n \"people\": [],\n \"organizations\": [\"New Paltz\", \"Sojourner Truth Library\"],\n \"locations\": 
[\"https://www.newpaltz.edu/emergency/policy.html\", \"http://lib.newpaltz.edu/assistance/plag.html\"],\n \"dat", + "Services Office at New Paltz as soon as possible. The office can be reached at 845-257-3106 or via email at [disability@newpaltz.edu](mailto:disability@newpaltz.edu). Accommodations must be approved i" + ] + }, + { + "id": "074bb5f9_442", + "source": "Mystery train _ images of America in rock 'n' roll music -- Marcus, Greil.pdf", + "content_hash": "197e4a86", + "content_length": 1303, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 17.9, + 7.9, + 6.3 + ], + "extraction_sample": { + "people": [ + "bruce springsteen", + "elvis presley", + "greil marcus", + "lyle lovett", + "patty loveless", + "randy newman" + ], + "organizations": [], + "locations": [], + "dates": [ + "16", + "20", + "232", + "3", + "307" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Elvis Presley\", \"Patty Loveless\", \"Randy Newman\", \"Lyle Lovett\", \"Greil Marcus\", \"Bruce Springsteen\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"16\", \"232\", \"307\", \"3\", \"2", + "{\n \"people\": [\"Elvis Presley\", \"Patty Loveless\", \"Randy Newman\", \"Lyle Lovett\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"20 years before the 1975\", \"16\", \"232\", \"307\", \"3 11\", \"34\"],\n ", + "{\n \"people\": [\"Elvis Presley\", \"Patty Loveless\", \"Randy Newman\", \"Lyle Lovett\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"16\", \"337\", \"232\", \"307\", \"3\", \"20 years before\", \"1975\"],\n \"d" + ] + }, + { + "id": "a230f2c3_599", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "4cf471ff", + "content_length": 2271, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 24.8, + 8.3, + 8.4 + ], + "extraction_sample": { + "people": [ + "alexander hunter", + "batty langley", + "charnock", + "e. p. 
thompson", + "roger fisher", + "sir henry wood" + ], + "organizations": [ + "house of commons committee of inquiry", + "royal society of arts" + ], + "locations": [ + "london" + ], + "dates": [ + "1728", + "1771", + "1772", + "1789", + "1800-2", + "1912", + "1975" + ], + "document_type": "book" + }, + "raw_samples": [ + "{\n \"people\": [\"Charnock\", \"Batty Langley\", \"E. P. Thompson\", \"Roger Fisher\", \"Sir Henry Wood\", \"Alexander Hunter\"],\n \"organizations\": [\"House of Commons committee of inquiry\", \"Royal Society of Arts", + "{\n \"people\": [\"Charnock\", \"Batty Langley\", \"E. P. Thompson\", \"Roger Fisher\", \"Sir Henry Wood\", \"Alexander Hunter\"],\n \"organizations\": [\"House of Commons committee of inquiry\", \"Royal Society of Arts", + "{\n \"people\": [\"Charnock\", \"Batty Langley\", \"E. P. Thompson\", \"Roger Fisher\", \"Sir Henry Wood\", \"Alexander Hunter\"],\n \"organizations\": [\"The Royal Society of Arts\", \"House of Commons committee of inq" + ] + }, + { + "id": "074bb5f9_90", + "source": "Mystery train _ images of America in rock 'n' roll music -- Marcus, Greil.pdf", + "content_hash": "7e625551", + "content_length": 2748, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 14.0, + 2.8, + 2.8 + ], + "extraction_sample": { + "people": [ + "richard avedon", + "rosie stone", + "sly" + ], + "organizations": [], + "locations": [], + "dates": [ + "1973" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Sly\", \"Richard Avedon\", \"Rosie Stone\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [\"1973\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Sly\", \"Richard Avedon\", \"Rosie Stone\"],\n \"dates\": [\"1973\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Sly\", \"Richard Avedon\", \"Rosie Stone\"],\n \"dates\": [\"1973\"],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "fc378df0_355", + "source": "ulysses.txt", + 
"content_hash": "5a4d6725", + "content_length": 2883, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 20.4, + 4.4, + 4.9 + ], + "extraction_sample": { + "people": [ + "bloom", + "bridie kelly", + "mrs marion", + "sweny" + ], + "organizations": [], + "locations": [ + "east", + "south" + ], + "dates": [], + "document_type": "literature" + }, + "raw_samples": [ + "{\n \"people\": [\"Mrs Marion\", \"Bloom\", \"Sweny\", \"Bridie Kelly\"],\n \"organizations\": [],\n \"locations\": [\"South\", \"East\"],\n \"dates\": [],\n \"document_type\": \"literature\"\n}", + "{\n \"people\": [\"Mrs Marion\", \"Bloom\", \"Sweny\", \"Marion\"],\n \"organizations\": [],\n \"locations\": [\"South\", \"East\", \"Hatch street\"],\n \"dates\": [],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"Mrs Marion\", \"Bloom\", \"Marion\", \"Sweny\", \"Bawd\", \"Bridie Kelly\"],\n \"organizations\": [],\n \"locations\": [\"South\", \"East\"],\n \"dates\": [],\n \"document_type\": \"Fiction\"\n}" + ] + }, + { + "id": "211f2199_71", + "source": "The E-myth revisited _ why most small businesses don't work -- Michael E_ Gerber.pdf", + "content_hash": "11665e8e", + "content_length": 2809, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.3, + 2.6, + 2.9 + ], + "extraction_sample": { + "people": [ + "sarah" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "letter" + }, + "raw_samples": [ + "{\n \"people\": [\"Sarah\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"letter\"\n}", + "{\n \"people\": [\"Sarah\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"business\"\n}", + "{\n \"people\": [\"Sarah\"],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "3ab67806_0", + "source": "CMakeLists.txt", + "content_hash": "15a52de3", + "content_length": 1020, + 
"status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 7.9, + 0.9, + 0.9 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "cmake script" + }, + "raw_samples": [ + "{\n \"document_type\": \"cmake script\"\n}", + "{\n \"document_type\": \"cmake script\"\n}", + "{\n \"document_type\": \"cmake\"\n}" + ] + }, + { + "id": "eab75ec0_0", + "source": "CMakeLists.txt", + "content_hash": "d3d4f035", + "content_length": 1881, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 13.9, + 1.0, + 1.0 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "cmake file" + }, + "raw_samples": [ + "{\n \"document_type\": \"CMake file\"\n}", + "{\n \"document_type\": \"CMake file\"\n}", + "{\n \"document_type\": \"CMake file\"\n}" + ] + }, + { + "id": "293288df_31", + "source": "RhinoPythonPrimerRev3.pdf", + "content_hash": "6400418d", + "content_length": 2742, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.2, + 0.9, + 2.0 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [ + "(0,0,0)" + ], + "dates": [], + "document_type": "code" + }, + "raw_samples": [ + "{\n \"document_type\": \"code\",\n \"locations\": [\"(0,0,0)\"]\n}", + "{\n \"document_type\": \"code\"\n}", + "{\n \"document_type\": \"code\",\n \"locations\": [\"(0,0,0)\"]\n}" + ] + }, + { + "id": "f6a8d07a_18", + "source": "Arduino Microcontroller Basics.pdf", + "content_hash": "b4c3f332", + "content_length": 2669, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.3, + 2.6, + 2.9 + ], + "extraction_sample": { + "people": [], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "arduino code" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": 
\"Arduino Code\"\n}", + "{\n \"locations\": [],\n \"organizations\": [\"Arduino\"],\n \"people\": [],\n \"dates\": [],\n \"document_type\": \"program\"\n}", + "{\n \"document_type\": \"programming code\",\n \"locations\": [],\n \"people\": [],\n \"organizations\": [],\n \"dates\": []\n}" + ] + }, + { + "id": "05b9d711_116", + "source": "Company of One -- Paul Jarvis.pdf", + "content_hash": "6e7a4314", + "content_length": 2945, + "status": "CONSISTENT", + "consistent": true, + "pass_times_seconds": [ + 12.3, + 3.1, + 2.6 + ], + "extraction_sample": { + "people": [ + "derek" + ], + "organizations": [ + "cdbaby" + ], + "locations": [], + "dates": [ + "2008" + ], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"Derek\"],\n \"organizations\": [\"CDBaby\"],\n \"locations\": [],\n \"dates\": [\"2008\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Derek\"],\n \"organizations\": [\"CDBaby\"],\n \"locations\": [],\n \"dates\": [\"2008\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"Derek\"],\n \"organizations\": [\"CDBaby\"],\n \"dates\": [\"2008\"],\n \"document_type\": \"article\"\n}" + ] + }, + { + "id": "a8366d89_185", + "source": "Hackers and Painters_ Big Ideas from the Computer Age -- Graham, Paul.pdf", + "content_hash": "40a71745", + "content_length": 2878, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 13.1, + 3.6, + 4.3 + ], + "extraction_sample": { + "people": [ + "harvard", + "princeton" + ], + "organizations": [ + "national center for education statistics" + ], + "locations": [], + "dates": [ + "1800", + "1970" + ], + "document_type": "book chapter" + }, + "raw_samples": [ + "{\n \"people\": [\"Princeton\", \"Harvard\"],\n \"organizations\": [\"National Center for Education Statistics\"],\n \"locations\": [],\n \"dates\": [\"1970\", \"1800\"],\n \"document_type\": \"book chapter\"\n}", + "{\n \"people\": [\"Prince-ton\", \"Harvard\"],\n \"organizations\": [],\n 
\"locations\": [],\n \"dates\": [\"1970\", \"1800\"],\n \"document_type\": \"book\"\n}", + "{\n \"people\": [\"Prince-ton\", \"Harvard\"],\n \"organizations\": [\"National Center for Education Statistics\"],\n \"locations\": [],\n \"dates\": [\"1800\", \"1970\"],\n \"document_type\": \"book\"\n}" + ] + }, + { + "id": "87abf773_21", + "source": "GrasshopperPrimer_V3-3_EN.pdf", + "content_hash": "beae43ed", + "content_length": 2746, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 12.1, + 3.0, + 2.2 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "grasshopper" + ], + "locations": [], + "dates": [], + "document_type": "text" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"instruction\"\n}", + "{\n \"locations\": [\"Graph Mapper\", \"Dropdown List mode\"],\n \"document_type\": \"instruction manual\"\n}" + ] + }, + { + "id": "05812e17_18", + "source": "EXPERIENCE MACHINE _ how our minds predict and shape reality -- Andy Clark.pdf", + "content_hash": "ddabef58", + "content_length": 2920, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 9.8, + 2.8, + 2.0 + ], + "extraction_sample": { + "people": [ + "andy clark" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "article" + }, + "raw_samples": [ + "{\n \"people\": [\"ANDY CLARK\"],\n \"document_type\": \"article\"\n}", + "{\n \"people\": [\"ANDY CLARK\"],\n \"dates\": [],\n \"organizations\": [],\n \"locations\": [],\n \"document_type\": \"text\"\n}", + "{\n \"people\": [\"ANDY CLARK\"],\n \"document_type\": \"text\"\n}" + ] + }, + { + "id": "f548556b_2", + "source": "AARON_NELSON-DIGITAL_HAPTIC_2020.pdf", + "content_hash": "aae21623", + "content_length": 2076, + "status": 
"INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 10.7, + 2.9, + 2.8 + ], + "extraction_sample": { + "people": [], + "organizations": [ + "grasshopper", + "history of design" + ], + "locations": [], + "dates": [], + "document_type": "course_description" + }, + "raw_samples": [ + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\", \"History of Design\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"course_description\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Course Outline\"\n}", + "{\n \"people\": [],\n \"organizations\": [\"Grasshopper\"],\n \"locations\": [],\n \"dates\": [],\n \"document_type\": \"Course Description\"\n}" + ] + }, + { + "id": "211f2199_13", + "source": "The E-myth revisited _ why most small businesses don't work -- Michael E_ Gerber.pdf", + "content_hash": "0356a33c", + "content_length": 2754, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.4, + 1.7, + 1.6 + ], + "extraction_sample": { + "people": [ + "the fat guy", + "you" + ], + "organizations": [], + "locations": [], + "dates": [], + "document_type": "story" + }, + "raw_samples": [ + "{\n \"people\": [\"You\", \"The Fat Guy\"],\n \"document_type\": \"story\"\n}", + "{\n \"people\": [\"You\", \"The Fat Guy\"],\n \"document_type\": \"fiction\"\n}", + "{\n \"people\": [\"You\", \"The Fat Guy\"],\n \"document_type\": \"story\"\n}" + ] + }, + { + "id": "cd3d1914_76", + "source": "The world beyond your head _ on becoming an individual in an -- Crawford, Matthew B.pdf", + "content_hash": "e9b45cb5", + "content_length": 2944, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 11.2, + 3.3, + 2.9 + ], + "extraction_sample": { + "people": [ + "nicholas schiill" + ], + "organizations": [], + "locations": [ + "las vegas" + ], + "dates": [], + "document_type": "book review" + }, + "raw_samples": [ + "{\n \"people\": 
[\"Nicholas Schiill\"],\n \"organizations\": [],\n \"locations\": [\"Las Vegas\"],\n \"dates\": [],\n \"document_type\": \"book review\"\n}", + "{\n \"people\": [\"Nick Yablonsky\", \"Schiill\"],\n \"organizations\": [],\n \"locations\": [\"Las Vegas\"],\n \"dates\": [],\n \"document_type\": \"book review\"\n}", + "{\n \"people\": [\"Schiill\"],\n \"organizations\": [],\n \"locations\": [\"Las Vegas\"],\n \"dates\": [],\n \"document_type\": \"book review\"\n}" + ] + }, + { + "id": "fc378df0_366", + "source": "ulysses.txt", + "content_hash": "293108da", + "content_length": 2798, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.6, + 4.2, + 4.1 + ], + "extraction_sample": { + "people": [ + "bloom", + "j. j. o'molloy" + ], + "organizations": [], + "locations": [ + "the land of the pharaoh" + ], + "dates": [], + "document_type": "dialogue" + }, + "raw_samples": [ + "{\n \"people\": [\"J. J. O'Molloy\", \"Bloom\"],\n \"organizations\": [],\n \"locations\": [\"the land of the Pharaoh\"],\n \"dates\": [],\n \"document_type\": \"dialogue\"\n}", + "{\n \"people\": [\"J. J. O'Molloy\", \"My client\", \"Bloom\"],\n \"organizations\": [],\n \"locations\": [\"the land of the Pharaoh\"],\n \"dates\": [],\n \"document_type\": \"legal-transcript\"\n}", + "{\n \"people\": [\"J. J. 
O'Molloy\", \"Bloom\"],\n \"organizations\": [],\n \"locations\": [\"the land of the Pharaoh\"],\n \"dates\": [],\n \"document_type\": \"legal_transcript\"\n}" + ] + }, + { + "id": "a230f2c3_488", + "source": "Landscape And Memory -- Simon Schama.pdf", + "content_hash": "4799c645", + "content_length": 2612, + "status": "INCONSISTENT", + "consistent": false, + "pass_times_seconds": [ + 15.1, + 4.8, + 5.2 + ], + "extraction_sample": { + "people": [ + "john ruskin" + ], + "organizations": [], + "locations": [ + "arcadia", + "euphrates", + "london", + "reat mountains" + ], + "dates": [ + "1894", + "not specified" + ], + "document_type": "book" + }, + "raw_samples": [ + "{\n \"people\": [\"John Ruskin\"],\n \"organizations\": [],\n \"locations\": [\"Reat Mountains\", \"Euphrates\", \"Arcadia\", \"London\"],\n \"dates\": [\"Not specified\", \"1894\"],\n \"document_type\": \"Book\"\n}", + "{\n \"people\": [\"John Ruskin\", \"Georgies\"],\n \"organizations\": [],\n \"locations\": [\"Reat Mountains\", \"deep Euphrates\", \"Arcaadia\", \"London\"],\n \"dates\": [\"Not specified\", \"1894\"],\n \"document_type\": \"b", + "{\n \"people\": [\"John Ruskin\", \"\"],\n \"organizations\": [\"Modern Painters\"],\n \"locations\": [\"reat mountains\", \"deep Euphrates\", \"Arcaedia\", \"London\", \"Parliament Hill\"],\n \"dates\": [\"not specified\", \"1" + ] + } + ], + "summary": { + "total": 50, + "consistent": 9, + "inconsistent": 41, + "failed": 0, + "timeout": 0, + "consistency_rate": 18.0, + "cascade_viable": false + } +} \ No newline at end of file diff --git a/scripts/briefing_test.py b/scripts/briefing_test.py new file mode 100644 index 0000000..1d00b09 --- /dev/null +++ b/scripts/briefing_test.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +""" +BirdAI Briefing Generator Test +=============================== +Tests the local LLM as a document briefing generator. 
+The local model produces a structured roadmap for the API — +cleaning, structure detection, signal flagging — without semantic judgment. +Results written to ~/aaronai/briefing_test_results.json +""" + +import json +import os +import urllib.request +import urllib.error +import psycopg2 +import psycopg2.extras +import hashlib +import time +from datetime import datetime, timedelta +from dotenv import load_dotenv + +load_dotenv(os.path.expanduser("~/aaronai/.env")) + +PG_DSN = os.getenv("PG_DSN") +RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_results.json") +MODEL = "mistral" +SAMPLE_SIZE = 50 +OLLAMA_URL = "http://localhost:11434/api/generate" + +VALID_DOC_TYPES = { + "academic_pdf", "technical_doc", "chat_log", "code", + "presentation", "book_excerpt", "form", "syllabus", + "email", "notes", "unknown" +} +VALID_DENSITIES = {"high", "medium", "low"} +VALID_PRIORITIES = {"full", "partial", "skip"} + +BRIEFING_PROMPT = """Analyze this document and return a JSON briefing. No explanation, no prose, JSON only. + +Return exactly this structure: +{ + "document_type": "one of: academic_pdf, technical_doc, chat_log, code, presentation, book_excerpt, form, syllabus, email, notes, unknown", + "primary_language": "language code e.g. 
en, fr, de", + "density": "one of: high, medium, low", + "has_proper_nouns": true or false, + "has_dates": true or false, + "has_numeric_data": true or false, + "has_institutional_language": true or false, + "has_technical_terms": true or false, + "likely_has_named_entities": true or false, + "structure_signals": [], + "noise_signals": [], + "extraction_priority": "one of: full, partial, skip" +} + +Rules: +- document_type: identify from formatting patterns and vocabulary, not meaning +- density: high=information dense technical or academic text, medium=mixed, low=narrative/literary/sparse +- has_proper_nouns: true if you see capitalized words that are not sentence starts +- has_dates: true if you see date patterns (numbers with months, years, slashes) +- has_numeric_data: true if you see measurements, percentages, statistics +- has_institutional_language: true if you see words like university, department, policy, committee, grant +- has_technical_terms: true if you see domain-specific jargon or acronyms +- likely_has_named_entities: true if has_proper_nouns is true +- structure_signals: list any structural markers you see e.g. ["headings", "bullet_lists", "numbered_lists", "tables", "code_blocks", "citations"] +- noise_signals: list any noise patterns you see e.g. 
def get_sample_documents():
    """Return up to ``SAMPLE_SIZE`` random chunks, one per distinct source.

    Rows come from the ``embeddings`` table and are restricted to documents
    longer than 100 and shorter than 3000 characters.

    The inner query keeps one random chunk per source; the outer query then
    shuffles those rows before applying the limit.  Applying ``LIMIT``
    directly to the ``DISTINCT ON`` query would always keep the
    alphabetically first sources, because ``DISTINCT ON (source)`` forces the
    result to be ordered by ``source``.

    Returns:
        A list of dict-like rows with keys ``id``, ``document``, ``source``
        and ``created_at``.

    Raises:
        RuntimeError: if ``PG_DSN`` is not configured.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""
                SELECT id, document, source, created_at
                FROM (
                    SELECT DISTINCT ON (source) id, document, source, created_at
                    FROM embeddings
                    WHERE length(document) > 100
                    AND length(document) < 3000
                    ORDER BY source, random()
                ) AS one_per_source
                ORDER BY random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()


def run_briefing(text, *, max_chars=1500, timeout=180):
    """Ask the local model for a JSON briefing of *text*.

    Args:
        text: Document text; only the first *max_chars* characters are sent.
        max_chars: Number of leading characters appended to the prompt.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        ``(briefing_dict, raw_response)`` on success, or
        ``(None, error_message)`` on failure.  Error messages start with one
        of ``NO_JSON``, ``NOT_DICT``, ``URL_ERROR``, ``TIMEOUT``,
        ``JSON_ERROR`` or ``ERROR``.
    """
    import socket  # local import: only needed to recognise timeout errors

    prompt = BRIEFING_PROMPT + text[:max_chars]
    payload = json.dumps({
        "model": MODEL,
        "prompt": prompt,
        "stream": False
    }).encode()
    raw = ""
    try:
        req = urllib.request.Request(
            OLLAMA_URL,
            data=payload,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result = json.loads(resp.read().decode())
        raw = result.get("response", "").strip()
        start = raw.find("{")
        if start == -1 or raw.rfind("}") == -1:
            return None, f"NO_JSON: {raw[:200]}"
        # Decode the first JSON value starting at the first "{" and ignore
        # anything after it, so trailing commentary (even commentary that
        # contains braces) does not break parsing.
        parsed, _ = json.JSONDecoder().raw_decode(raw, start)
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {raw[:100]}"
        return parsed, raw
    except urllib.error.URLError as e:
        # urlopen wraps some timeouts in URLError; report those as TIMEOUT
        # rather than as a generic URL error.
        if isinstance(e.reason, (TimeoutError, socket.timeout)):
            return None, "TIMEOUT"
        return None, f"URL_ERROR: {e}"
    except (TimeoutError, socket.timeout):
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"


def sanitize_briefing(briefing):
    """Coerce a model-produced briefing dict into the expected schema.

    Unknown or missing enum values fall back to ``"unknown"`` (document
    type), ``"medium"`` (density) and ``"partial"`` (extraction priority).
    Flag fields become real booleans and signal fields become lists of
    strings.

    Args:
        briefing: Dict parsed from the model response.

    Returns:
        A new dict containing only the expected keys.
    """
    safe = {}

    dt = str(briefing.get("document_type", "unknown")).lower().strip()
    safe["document_type"] = dt if dt in VALID_DOC_TYPES else "unknown"

    # Language codes are short; cap the length in case the model rambles.
    safe["primary_language"] = str(briefing.get("primary_language", "en")).lower().strip()[:10]

    density = str(briefing.get("density", "medium")).lower().strip()
    safe["density"] = density if density in VALID_DENSITIES else "medium"

    for field in ["has_proper_nouns", "has_dates", "has_numeric_data",
                  "has_institutional_language", "has_technical_terms",
                  "likely_has_named_entities"]:
        val = briefing.get(field, False)
        if isinstance(val, bool):
            safe[field] = val
        elif isinstance(val, str):
            # Strip first so " true" is read the same as "true", matching
            # how the other string fields are normalised.
            safe[field] = val.strip().lower() in ("true", "yes", "1")
        else:
            safe[field] = bool(val)

    for field in ["structure_signals", "noise_signals"]:
        val = briefing.get(field, [])
        if isinstance(val, list):
            safe[field] = [str(v) for v in val if v]
        elif isinstance(val, str):
            safe[field] = [val] if val else []
        else:
            safe[field] = []

    priority = str(briefing.get("extraction_priority", "partial")).lower().strip()
    safe["extraction_priority"] = priority if priority in VALID_PRIORITIES else "partial"
    return safe


def estimate_token_reduction(original_text, briefing, orientation_tokens=200):
    """Estimate the share of tokens a briefing saves for one document.

    Tokens are approximated as ``len(original_text) / 4`` with a minimum
    of 1.  Each entry in ``noise_signals`` counts as 5% removable text,
    capped at 40%.  The reported total is capped at 99% unless the document
    is skipped.

    Args:
        original_text: The document text.
        briefing: Sanitized briefing dict.
        orientation_tokens: Fixed token allowance counted both as saved
            orientation work and as part of the baseline cost.

    Returns:
        A dict of estimate fields.  Every result has
        ``original_tokens_approx``, ``orientation_tokens_saved``,
        ``noise_tokens_saved``, ``noise_reduction_pct`` and
        ``total_reduction_pct``; skipped documents also carry a ``note``.
    """
    original_tokens = max(len(original_text) / 4, 1)

    if briefing.get("extraction_priority") == "skip":
        return {
            "original_tokens_approx": round(original_tokens),
            "orientation_tokens_saved": round(original_tokens + orientation_tokens),
            # Present so both branches return the same keys; the whole saving
            # is already counted in orientation_tokens_saved.
            "noise_tokens_saved": 0,
            "noise_reduction_pct": 100.0,
            "total_reduction_pct": 100.0,
            "note": "skip — no API call"
        }

    noise_count = len(briefing.get("noise_signals", []))
    noise_reduction_pct = min(noise_count * 0.05, 0.40)
    noise_tokens_saved = original_tokens * noise_reduction_pct
    total_saved = orientation_tokens + noise_tokens_saved
    total_cost = original_tokens + orientation_tokens
    reduction_pct = min((total_saved / total_cost) * 100, 99.0)
    return {
        "original_tokens_approx": round(original_tokens),
        "orientation_tokens_saved": orientation_tokens,
        "noise_tokens_saved": round(noise_tokens_saved),
        "noise_reduction_pct": round(noise_reduction_pct * 100, 1),
        "total_reduction_pct": round(reduction_pct, 1)
    }


def format_eta(elapsed_times, completed, total):
    """Return an ``"ETA: H:MM:SS"`` label from the mean time per finished item.

    Returns ``"ETA: --:--"`` while nothing has completed yet.
    """
    if completed == 0:
        return "ETA: --:--"
    avg = sum(elapsed_times) / completed
    remaining = (total - completed) * avg
    eta = timedelta(seconds=int(remaining))
    return f"ETA: {str(eta)}"


def content_hash(text):
    """Return the first 8 hex digits of the MD5 digest of *text*.

    Used only as a short content fingerprint in the results file.
    """
    return hashlib.md5(text.encode()).hexdigest()[:8]
def _save_results(results):
    """Write the results dict to ``RESULTS_FILE`` as indented JSON."""
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)


def main():
    """Run the briefing test over a document sample and report the outcome.

    For every sampled document the local model is asked for a briefing, the
    briefing is sanitized, and a token-reduction estimate is computed.
    Results are rewritten to ``RESULTS_FILE`` after each document, so a
    partial run still leaves usable output, and once more with the summary
    at the end.
    """
    test_start = time.time()
    print("\nBirdAI Briefing Generator Test")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")

    results = {
        "meta": {
            "model": MODEL,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None,
            "total_elapsed_seconds": None,
            "avg_seconds_per_doc": None
        },
        "documents": [],
        "summary": {}
    }

    success_count = 0
    failed_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # The column may be NULL; .get's default only covers a missing key.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)
        eta_str = format_eta(elapsed_times, i, len(docs))
        print(f"[{i+1:02d}/{len(docs)}] {source[:38]:<38} {eta_str:<14}", end=" ", flush=True)

        t_start = time.time()
        briefing, raw = run_briefing(content)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)

        if briefing is None:
            # On failure the second return value is the error message.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"→ {priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "SUCCESS",
                "elapsed_seconds": elapsed, "briefing": briefing,
                "token_reduction_estimate": reduction
            })

        # Checkpoint after every document.
        _save_results(results)

    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc

    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # Guard the ratio: an empty sample must yield a summary, not a crash.
    success_ratio = success_count / total if total else 0.0

    summary = {
        "total": total,
        "success": success_count,
        "failed": failed_count,
        "success_rate": round(success_ratio * 100, 1),
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed,
        "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_ratio >= 0.8
    }
    results["summary"] = summary
    _save_results(results)

    print("\n" + "=" * 75)
    print("RESULTS")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)


if __name__ == "__main__":
    main()
def get_sample_documents():
    """Return up to ``SAMPLE_SIZE`` random chunks from the ``embeddings`` table.

    Only documents longer than 100 and shorter than 3000 characters are
    considered.

    Returns:
        A list of dict-like rows with keys ``id``, ``document``, ``source``
        and ``created_at``.

    Raises:
        RuntimeError: if ``PG_DSN`` is not configured.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""
                SELECT id, document, source, created_at
                FROM embeddings
                WHERE length(document) > 100
                AND length(document) < 3000
                ORDER BY random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()


def run_extraction(text, *, max_chars=1500, timeout=180):
    """Ask the local model to extract named entities from *text*.

    Args:
        text: Document text; only the first *max_chars* characters are sent.
        max_chars: Number of leading characters appended to the prompt.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        ``(extraction_dict, raw_response)`` on success, or
        ``(None, error_message)`` on failure.  Error messages start with one
        of ``NO_JSON``, ``NOT_DICT``, ``URL_ERROR``, ``TIMEOUT``,
        ``JSON_ERROR`` or ``ERROR``.
    """
    import socket  # local import: only needed to recognise timeout errors

    prompt = EXTRACTION_PROMPT + text[:max_chars]
    payload = json.dumps({
        "model": MODEL,
        "prompt": prompt,
        "stream": False
    }).encode()
    try:
        req = urllib.request.Request(
            OLLAMA_URL,
            data=payload,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result = json.loads(resp.read().decode())
        raw = result.get("response", "").strip()
        start = raw.find("{")
        if start == -1 or raw.rfind("}") == -1:
            return None, f"NO_JSON: {raw[:100]}"
        # Decode the first JSON value starting at the first "{" and ignore
        # anything after it, so trailing commentary (even commentary that
        # contains braces) does not break parsing.
        parsed, end = json.JSONDecoder().raw_decode(raw, start)
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {raw[start:end][:100]}"
        return parsed, raw
    except urllib.error.URLError as e:
        # urlopen wraps some timeouts in URLError; report those as TIMEOUT
        # rather than as a generic URL error.
        if isinstance(e.reason, (TimeoutError, socket.timeout)):
            return None, "TIMEOUT"
        return None, f"URL_ERROR: {e}"
    except (TimeoutError, socket.timeout):
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"


def flatten_value(v):
    """Reduce any JSON value to a lowercase string usable for comparison.

    Strings are lowercased and stripped; dicts are serialized with sorted
    keys; lists are flattened element-wise, sorted and serialized; anything
    else goes through ``str``.
    """
    if isinstance(v, str):
        return v.lower().strip()
    if isinstance(v, dict):
        return json.dumps(v, sort_keys=True).lower()
    if isinstance(v, list):
        return json.dumps(sorted(flatten_value(item) for item in v))
    return str(v).lower().strip()


def normalize_extraction(extracted):
    """Return an order- and case-insensitive form of one extraction.

    Only the five expected fields are kept.  A missing field and an explicit
    JSON ``null`` are both treated as empty (``[]`` for list fields, ``""``
    for ``document_type``), so a model that alternates between omitting a
    field and sending ``null`` is not flagged as inconsistent.

    Returns:
        A dict with the five expected keys, or ``None`` when *extracted* is
        ``None``.
    """
    if extracted is None:
        return None
    normalized = {}
    for key in ("people", "organizations", "locations", "dates", "document_type"):
        val = extracted.get(key)
        if val is None:
            val = "" if key == "document_type" else []
        if isinstance(val, list):
            normalized[key] = sorted(flatten_value(item) for item in val)
        else:
            normalized[key] = flatten_value(val)
    return normalized


def extractions_consistent(extractions):
    """Return True when every extraction normalizes to the same value.

    Any failed pass (``None``) makes the result False.
    """
    if any(e is None for e in extractions):
        return False
    normalized = [normalize_extraction(e) for e in extractions]
    return all(n == normalized[0] for n in normalized[1:])


def content_hash(text):
    """Return the first 8 hex digits of the MD5 digest of *text*.

    Used only as a short content fingerprint in the results file.
    """
    return hashlib.md5(text.encode()).hexdigest()[:8]
def _save_results(results):
    """Write the results dict to ``RESULTS_FILE`` as indented JSON."""
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)


def main():
    """Run the extraction ``PASSES`` times per document and compare the passes.

    Each document gets one status: ``TIMEOUT`` if any pass timed out,
    ``FAILED`` if any pass failed for another reason, ``CONSISTENT`` if all
    passes normalize to the same extraction, otherwise ``INCONSISTENT``.
    Results are rewritten to ``RESULTS_FILE`` after each document and once
    more with the summary at the end.
    """
    print("\nBirdAI Consistency Test")
    print(f"Model: {MODEL} | Passes: {PASSES} | Sample: {SAMPLE_SIZE} docs")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 60)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} documents from pgvector\n")

    results = {
        "meta": {
            "model": MODEL,
            "passes": PASSES,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None
        },
        "documents": [],
        "summary": {}
    }

    consistent_count = 0
    failed_count = 0
    timeout_count = 0

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # The column may be NULL; .get's default only covers a missing key.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)

        print(f"[{i+1:02d}/{len(docs)}] {source[:50]:<50} hash:{chash}", end=" ", flush=True)

        passes = []
        pass_times = []
        raw_outputs = []

        for _ in range(PASSES):
            t_start = time.time()
            extracted, raw = run_extraction(content)
            t_end = time.time()
            passes.append(extracted)
            pass_times.append(round(t_end - t_start, 1))
            raw_outputs.append(raw[:200] if raw else "")

        consistent = extractions_consistent(passes)
        # Only failed passes carry an error message; successful passes carry
        # model output, which must not be searched for the word "TIMEOUT".
        any_timeout = any(
            p is None and str(r).startswith("TIMEOUT")
            for p, r in zip(passes, raw_outputs)
        )
        any_failed = any(p is None for p in passes)

        if any_timeout:
            timeout_count += 1
            status = "TIMEOUT"
        elif any_failed:
            failed_count += 1
            status = "FAILED"
        elif consistent:
            consistent_count += 1
            status = "CONSISTENT"
        else:
            status = "INCONSISTENT"

        print(f"→ {status} ({'/'.join(str(t) for t in pass_times)}s)")

        # Best effort: the sample is informational, so a normalization error
        # is recorded as a missing sample instead of aborting the run.
        try:
            # "is not None" keeps a valid but empty extraction ({}).
            sample_extraction = normalize_extraction(passes[0]) if passes[0] is not None else None
        except Exception:
            sample_extraction = None

        results["documents"].append({
            "id": doc_id,
            "source": source,
            "content_hash": chash,
            "content_length": len(content),
            "status": status,
            "consistent": consistent,
            "pass_times_seconds": pass_times,
            "extraction_sample": sample_extraction,
            "raw_samples": raw_outputs
        })

        # Checkpoint after every document.
        _save_results(results)

    total = len(docs)
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at

    # Guard the ratio: an empty sample must yield a summary, not a crash.
    consistency_ratio = consistent_count / total if total else 0.0

    summary = {
        "total": total,
        "consistent": consistent_count,
        "inconsistent": total - consistent_count - failed_count - timeout_count,
        "failed": failed_count,
        "timeout": timeout_count,
        "consistency_rate": round(consistency_ratio * 100, 1),
        "cascade_viable": consistency_ratio >= 0.5
    }
    results["summary"] = summary
    _save_results(results)

    print("\n" + "=" * 60)
    print("RESULTS")
    print(f" Consistent: {consistent_count}/{total} ({summary['consistency_rate']}%)")
    print(f" Inconsistent: {summary['inconsistent']}")
    print(f" Failed/Timeout: {failed_count + timeout_count}")
    print(f" Cascade viable: {'YES' if summary['cascade_viable'] else 'NO — reconsider architecture'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 60)


if __name__ == "__main__":
    main()