From 655dea6ae594a038b2f4484dc45f49eb3113cd0e Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Thu, 30 Apr 2026 18:06:52 +0000 Subject: [PATCH] add remaining experiment result files --- briefing_test_v2_results.json | 1718 +++++++++++++++++++++++++++++++++ large_bucket_sources.json | 12 + 2 files changed, 1730 insertions(+) create mode 100644 briefing_test_v2_results.json create mode 100644 large_bucket_sources.json diff --git a/briefing_test_v2_results.json b/briefing_test_v2_results.json new file mode 100644 index 0000000..2df6e24 --- /dev/null +++ b/briefing_test_v2_results.json @@ -0,0 +1,1718 @@ +{ + "meta": { + "model": "mistral", + "version": "v2", + "sample_size": 50, + "started": "2026-04-28T02:50:20.797376", + "completed": "2026-04-28T03:08:37.169551", + "total_elapsed_seconds": 1096.7, + "avg_seconds_per_doc": 21.9 + }, + "documents": [ + { + "id": "5ee0b3bb_0", + "source": "00_Syllabus.docx", + "content_hash": "848c971c", + "content_length": 2273, + "cleaned_content_length": 2273, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 36.3, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 568, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 26.0 + } + }, + { + "id": "4451e0d5_1", + "source": "01_ALL_Overview of AM and 3DP_v3.pptx", + "content_hash": "1e3ff98f", + "content_length": 2167, + "cleaned_content_length": 2167, + "status": "SUCCESS", + "pre_classified_type": "presentation", + "was_pre_classified": true, + "elapsed_seconds": 33.7, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 542, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.0 + } + }, + { + "id": "0619cec0_0", + "source": "01_NURBS Curves.docx", + "content_hash": "2ac1bb56", + "content_length": 1401, + "cleaned_content_length": 1401, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 33.0, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 350, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 36.3 + } + }, + { + "id": "d0a3917e_0", + "source": "02_2D Geometry.docx", + "content_hash": "5d53f099", + "content_length": 188, + "cleaned_content_length": 188, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 10.6, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 47, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 81.0 + } + }, + { + "id": "89fed291_0", + "source": "02_Point of Curves - AARON.docx", + "content_hash": "864be8ed", + "content_length": 2116, + "cleaned_content_length": 2116, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 20.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 529, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.4 + } + }, + { + "id": "2a15be8d_0", + "source": "02_Point of Curves.docx", + "content_hash": "4b683753", + "content_length": 1338, + "cleaned_content_length": 1338, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 15.8, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 334, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 37.4 + } + }, + { + "id": "a2a7a8d3_2", + "source": "02_PPT_ALL_AM_Technologies_for_3DP_v3.pptx", + "content_hash": "1c260a03", + "content_length": 1675, + "cleaned_content_length": 1675, + "status": "SUCCESS", + "pre_classified_type": "presentation", + "was_pre_classified": true, + "elapsed_seconds": 34.3, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 419, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 32.3 + } + }, + { + "id": "2b3b1c34_0", + "source": "03_2D Transformation and Deformation.docx", + "content_hash": "306dc581", + "content_length": 418, + "cleaned_content_length": 418, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 24.7, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 104, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 65.7 + } + }, + { + "id": "b425985b_0", + "source": "03_2D Transformation, Deformation, and Editing-AARON.docx", + "content_hash": "9f9c422a", + "content_length": 541, + "cleaned_content_length": 541, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 12.0, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 135, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 59.7 + } + }, + { + "id": "1c78c79f_0", + "source": "03_Editing Geometry.docx", + "content_hash": "9491f6cc", + "content_length": 171, + "cleaned_content_length": 171, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 10.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 43, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 82.4 + } + }, + { + "id": "6453f3a8_6", + "source": "04_ALL_Materials and Their Properties_v3.pptx", + "content_hash": "d1ff494c", + "content_length": 2999, + "cleaned_content_length": 2999, + "status": "SUCCESS", + "pre_classified_type": "presentation", + "was_pre_classified": true, + "elapsed_seconds": 33.7, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 750, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 21.1 + } + }, + { + "id": "1fd396d4_0", + "source": "04_Annotations.docx", + "content_hash": "9b3e57cd", + "content_length": 737, + "cleaned_content_length": 737, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 27.0, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 184, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 52.0 + } + }, + { + "id": "38d1cf0d_0", + "source": "05_Entering the third dimension.docx", + "content_hash": "8f56202a", + "content_length": 2175, + "cleaned_content_length": 2175, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 19.2, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 544, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 26.9 + } + }, + { + "id": "cfd1ee43_0", + "source": "05_Making things solid.docx", + "content_hash": "12634c4c", + "content_length": 692, + "cleaned_content_length": 692, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 13.8, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [ + "boilerplate" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 173, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 9, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 55.9 + } + }, + { + "id": "31c729e4_8", + "source": "05_PPT_ALL_Machine Technology and Specifications_v3.pptx", + "content_hash": "bf8daf4b", + "content_length": 2886, + "cleaned_content_length": 2886, + "status": "SUCCESS", + "pre_classified_type": "presentation", + "was_pre_classified": true, + "elapsed_seconds": 32.0, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 722, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 21.7 + } + }, + { + "id": "a8d4d8a4_0", + "source": "06_3D_Editing.docx", + "content_hash": "887133dc", + "content_length": 157, + "cleaned_content_length": 157, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 23.4, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 39, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 83.6 + } + }, + { + "id": "c0d0659e_0", + "source": "06_Gumball.docx", + "content_hash": "c46dbc48", + "content_length": 1980, + "cleaned_content_length": 1980, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 21.0, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 495, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 28.8 + } + }, + { + "id": "8f9d093e_5", + "source": "06_PPT_ALL_Design Considerations_From CAD to CAM_v3.pptx", + "content_hash": "2215d29f", + "content_length": 2945, + "cleaned_content_length": 2945, + "status": "SUCCESS", + "pre_classified_type": "presentation", + "was_pre_classified": true, + "elapsed_seconds": 33.4, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists", + "tables" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 736, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 21.4 + } + }, + { + "id": "4e7db487_0", + "source": "07_Cube Assignment_2018f.docx", + "content_hash": "af2f5bab", + "content_length": 1316, + "cleaned_content_length": 1316, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 32.0, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 329, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 37.8 + } + }, + { + "id": "1a5b6da3_0", + "source": "07_Make2D.docx", + "content_hash": "d71c1df4", + "content_length": 834, + "cleaned_content_length": 834, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 14.8, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 208, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 49.0 + } + }, + { + "id": "97ba28bd_5", + "source": "07_PPT_ALL_Fabrication Considerations_v3.pptx", + "content_hash": "7ffb6f57", + "content_length": 710, + "cleaned_content_length": 710, + "status": "SUCCESS", + "pre_classified_type": "presentation", + "was_pre_classified": true, + "elapsed_seconds": 26.6, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "tables", + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 178, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 53.0 + } + }, + { + "id": "0892b9fa_8", + "source": "08_PPT_ALL_PostProcessing for FDM and PolyJet_v3.pptx", + "content_hash": "139aa114", + "content_length": 365, + "cleaned_content_length": 365, + "status": "SUCCESS", + "pre_classified_type": "presentation", + "was_pre_classified": true, + "elapsed_seconds": 12.0, + "briefing": { + "document_type": "presentation", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 91, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 68.7 + } + }, + { + "id": "9446c72b_0", + "source": "08_Printing_Technicals.docx", + "content_hash": "365e53a6", + "content_length": 1310, + "cleaned_content_length": 1310, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 31.2, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 328, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 37.9 + } + }, + { + "id": "44b7a630_0", + "source": "09_Tolerance Test Part.docx", + "content_hash": "f6a14d20", + "content_length": 817, + "cleaned_content_length": 817, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 14.0, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 204, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 49.5 + } + }, + { + "id": "4a74de83_0", + "source": "09_Tolerance Test Part.pdf", + "content_hash": "7f3106a9", + "content_length": 1049, + "cleaned_content_length": 1049, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 15.9, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 262, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 43.3 + } + }, + { + "id": "aa807935_0", + "source": "10 Good Things about Aaron for DSI.docx", + "content_hash": "ff5081e9", + "content_length": 1126, + "cleaned_content_length": 1126, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 17.4, + "briefing": { + "document_type": "unknown", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 282, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 41.5 + } + }, + { + "id": "90248749_1", + "source": "10_Moving Parts.docx", + "content_hash": "3c2e28b9", + "content_length": 218, + "cleaned_content_length": 218, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 11.1, + "briefing": { + "document_type": "technical_doc", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 54, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 78.6 + } + }, + { + "id": "958e5aac_0", + "source": "1119345.pdf", + "content_hash": "af7da5db", + "content_length": 1745, + "cleaned_content_length": 1745, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 30.7, + "briefing": { + "document_type": "invoice", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists", + "tables" + ], + "noise_signals": [ + "page_numbers", + "formatting_artifacts", + "encoding_artifacts" + ], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 436, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 65, + "noise_reduction_pct": 15.0, + "total_reduction_pct": 41.7 + } + }, + { + "id": "adbfb7b8_1", + "source": "2016 - DDF 205 - CAD I Syllabus.pdf", + "content_hash": "466b3184", + "content_length": 2161, + "cleaned_content_length": 2161, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 20.0, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 540, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.0 + } + }, + { + "id": "86543785_1", + "source": "2017 - DDF 220 - Intro to Computational Media Syllabus.pdf", + "content_hash": "3ad85610", + "content_length": 2224, + "cleaned_content_length": 2224, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 19.9, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 556, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 26.5 + } + }, + { + "id": "4e199480_1", + "source": "2018 - DDF 205 - CAD I Syllabus.pdf", + "content_hash": "2632e62b", + "content_length": 2618, + "cleaned_content_length": 2618, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 19.3, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 654, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 23.4 + } + }, + { + "id": "56b61c68_3", + "source": "2019-2020 Research and Creative Projects Awards Guidelines.FINAL.pdf", + "content_hash": "1a4e890b", + "content_length": 2228, + "cleaned_content_length": 2228, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 20.5, + "briefing": { + "document_type": "unknown", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 557, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 26.4 + } + }, + { + "id": "9ed5c43e_2", + "source": "2019 - DDF 305 - Materials Syllabus.pdf", + "content_hash": "c0521ba2", + "content_length": 1842, + "cleaned_content_length": 1842, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 21.7, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "numbered_lists", + "tables" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 460, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 30.3 + } + }, + { + "id": "2e264727_2", + "source": "2020 - DDF 220 - Intro to Computational Media Syllabus.pdf", + "content_hash": "fe3ca5be", + "content_length": 2580, + "cleaned_content_length": 2580, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 19.4, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "tables", + "headings" + ], + "noise_signals": [ + "line_numbers" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 645, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 32, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 27.5 + } + }, + { + "id": "c0cd3599_3", + "source": "2021 - DDF 320 - Design Intents Syllabus.pdf", + "content_hash": "588d34a3", + "content_length": 1560, + "cleaned_content_length": 1560, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 22.8, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": true, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 390, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 33.9 + } + }, + { + "id": "9ea5656f_2", + "source": "2023 Faculty Report Aaron Nelson.docx", + "content_hash": "fd68d021", + "content_length": 2698, + "cleaned_content_length": 2698, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 19.2, + "briefing": { + "document_type": "academic_pdf", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 674, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 22.9 + } + }, + { + "id": "33aae3e5_2", + "source": "2023 Faculty Report Template.docx", + "content_hash": "c2d50031", + "content_length": 2100, + "cleaned_content_length": 2100, + "status": "SUCCESS", + "pre_classified_type": null, + "was_pre_classified": false, + "elapsed_seconds": 18.9, + "briefing": { + "document_type": "syllabus", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 525, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 27.6 + } + }, + { + "id": "bf155f9f_0", + "source": "2026-04-26-22-44-voice.md", + "content_hash": "41cc3d28", + "content_length": 165, + "cleaned_content_length": 72, + "status": "SUCCESS", + "pre_classified_type": "voice_capture", + "was_pre_classified": true, + "elapsed_seconds": 21.0, + "briefing": { + "document_type": "voice_capture", + "primary_language": "en", + "density": "low", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 18, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 91.7 + } + }, + { + "id": "5c9f5ad5_0", + "source": "2026-04-26-22-52-voice.md", + "content_hash": "0ed1efba", + "content_length": 171, + "cleaned_content_length": 78, + "status": "SUCCESS", + "pre_classified_type": "voice_capture", + "was_pre_classified": true, + "elapsed_seconds": 8.9, + "briefing": { + "document_type": "voice_capture", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "headings" + ], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 20, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 91.1 + } + }, + { + "id": "8bc956ad_0", + "source": "2026-04-26-23-04-voice.md", + "content_hash": "c455ef44", + "content_length": 931, + "cleaned_content_length": 838, + "status": "SUCCESS", + "pre_classified_type": "voice_capture", + "was_pre_classified": true, + "elapsed_seconds": 14.9, + "briefing": { + "document_type": "voice_capture", + "primary_language": "en", + "density": "low", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [], + "noise_signals": [ + "formatting_artifacts" + ], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 210, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 10, + "noise_reduction_pct": 5.0, + "total_reduction_pct": 51.4 + } + }, + { + "id": "af176130_0", + "source": "2026-04-26-lucid-1.md", + "content_hash": "d9c51a1c", + "content_length": 2444, + "cleaned_content_length": 2302, + "status": "SUCCESS", + "pre_classified_type": "dream_lucid", + "was_pre_classified": true, + "elapsed_seconds": 31.7, + "briefing": { + "document_type": "dream_lucid", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 576, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 25.8 + } + }, + { + "id": "52114711_0", + "source": "2026-04-26-lucid.md", + "content_hash": "4c5fb648", + "content_length": 2437, + "cleaned_content_length": 2295, + "status": "SUCCESS", + "pre_classified_type": "dream_lucid", + "was_pre_classified": true, + "elapsed_seconds": 19.0, + "briefing": { + "document_type": "dream_lucid", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "headings", + "bullet_lists" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 574, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 25.8 + } + }, + { + "id": "1bf832a0_0", + "source": "2026-04-26-nrem-1.md", + "content_hash": "1ad1e9c1", + "content_length": 1586, + "cleaned_content_length": 1548, + "status": "SUCCESS", + "pre_classified_type": "dream_nrem", + "was_pre_classified": true, + "elapsed_seconds": 33.7, + "briefing": { + "document_type": "dream_nrem", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 387, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 34.1 + } + }, + { + "id": "a16d6571_0", + "source": "2026-04-26-nrem.md", + "content_hash": "1714ccc0", + "content_length": 1638, + "cleaned_content_length": 1600, + "status": "SUCCESS", + "pre_classified_type": "dream_nrem", + "was_pre_classified": true, + "elapsed_seconds": 19.6, + "briefing": { + "document_type": "dream_nrem", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": true, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [ + "bullet_lists", + "numbered_lists" + ], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 400, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 33.3 + } + }, + { + "id": "b696802f_0", + "source": "2026-04-27-04-34-image.md", + "content_hash": "3cce200d", + "content_length": 2027, + "cleaned_content_length": 1853, + "status": "SUCCESS", + "pre_classified_type": "image_capture", + "was_pre_classified": true, + "elapsed_seconds": 31.5, + "briefing": { + "document_type": "image_capture", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": true, + "has_institutional_language": false, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [ + "images" + ], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 463, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 30.2 + } + }, + { + "id": "6bc36d6f_0", + "source": "2026-04-27-04-36-image.md", + "content_hash": "29717d0c", + "content_length": 1755, + "status": "FAILED", + "pre_classified_type": "image_capture", + "error": "JSON_ERROR: Expecting property name enclosed in double quotes: line 11 column 36 (char 308) | raw: {\n \"document_type\": \"image_capture\",\n \"primary_language\": \"en\",\n \"density\": \"high\",\n \"has_proper_nouns\": true,\n \"has_dates\": false,\n \"has_numeric_data\": false,\n \"has_institutional_language\": fa", + "elapsed_seconds": 19.5 + }, + { + "id": "8b7ed0da_0", + "source": "2026-04-27-04-41-image.md", + "content_hash": "47a1f451", + "content_length": 2148, + "status": "FAILED", + "pre_classified_type": "image_capture", + "error": "JSON_ERROR: Expecting property name enclosed in double quotes: line 11 column 38 (char 310) | raw: {\n \"document_type\": \"image_capture\",\n \"primary_language\": \"en\",\n \"density\": \"medium\",\n \"has_proper_nouns\": true,\n \"has_dates\": false,\n \"has_numeric_data\": true,\n \"has_institutional_language\": t", + "elapsed_seconds": 19.3 + }, + { + "id": "700d4582_0", + "source": "2026-04-27-06-21-image.md", + "content_hash": "b143e6fc", + "content_length": 1643, + "cleaned_content_length": 1469, + "status": "SUCCESS", + "pre_classified_type": "image_capture", + "was_pre_classified": true, + "elapsed_seconds": 18.3, + "briefing": { + "document_type": "image_capture", + "primary_language": "en", + "density": "low", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": false, + "has_technical_terms": false, + "likely_has_named_entities": true, + "structure_signals": [], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 367, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 35.3 + } + }, + { + "id": "31317444_0", + "source": "2026-04-27-19-04-image.md", + "content_hash": "8bd62d02", + "content_length": 1767, + "cleaned_content_length": 1593, + "status": "SUCCESS", + "pre_classified_type": "image_capture", + "was_pre_classified": true, + "elapsed_seconds": 18.4, + "briefing": { + "document_type": "image_capture", + "primary_language": "en", + "density": "medium", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [], + "noise_signals": [], + "extraction_priority": "partial" + }, + "token_reduction_estimate": { + "original_tokens_approx": 398, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 33.4 + } + }, + { + "id": "bc4bffcd_0", + "source": "2026-04-27-20-18-image.md", + "content_hash": "c33f8f22", + "content_length": 1856, + "cleaned_content_length": 1682, + "status": "SUCCESS", + "pre_classified_type": "image_capture", + "was_pre_classified": true, + "elapsed_seconds": 18.8, + "briefing": { + "document_type": "image_capture", + "primary_language": "en", + "density": "high", + "has_proper_nouns": true, + "has_dates": false, + "has_numeric_data": false, + "has_institutional_language": true, + "has_technical_terms": true, + "likely_has_named_entities": true, + "structure_signals": [], + "noise_signals": [], + "extraction_priority": "full" + }, + "token_reduction_estimate": { + "original_tokens_approx": 420, + "orientation_tokens_saved": 200, + "noise_tokens_saved": 0, + "noise_reduction_pct": 0.0, + "total_reduction_pct": 32.2 + } + } + ], + "summary": { + "total": 50, + "success": 48, + "failed": 2, + "success_rate": 96.0, + "pre_classified_by_rule": 20, + "classified_by_model": 30, + "extraction_priority_breakdown": { + "full": 38, + "partial": 10, + "skip": 0 + }, + "avg_token_reduction_pct": 42.0, + "total_elapsed_seconds": 1096.7, + "avg_seconds_per_doc": 21.9, + "projected_50_doc_minutes": 18.2, + "approach_viable": true + } +} \ No newline at end of file diff --git a/large_bucket_sources.json b/large_bucket_sources.json new file mode 100644 index 0000000..6b7a812 --- /dev/null +++ b/large_bucket_sources.json @@ -0,0 +1,12 @@ +[ + "Berube Independent Study Form.pdf", + "Aaron Nelson - Student Work.pdf", + "3dCOMp.pdf", + "Claude: Preparing for dinner with Jim Agutter", + "Annual Report - 2020.pdf", + "Wearable Marquees uw4.pptx", + "ChatGPT: Movie Quote Clarification", + "Mod07_Insight_2023.pptx", + "CAD I Syllabus.docx", + "ChatGPT: RMA armor discount codes" +] \ No newline at end of file