From a27f22ceaf95412710a395d0bd5655cd1ad15c40 Mon Sep 17 00:00:00 2001
From: Aaron Nelson <aaron@aaronnelson.studio>
Date: Mon, 4 May 2026 01:00:32 +0000
Subject: [PATCH] api.py: switch whisper to distil-large-v3, beam_size=1,
 cpu_threads=4

Three changes to reduce voice-note transcription latency on the VPS:
- Model: large-v3 -> distil-large-v3 (~6x faster, near-identical English
  accuracy; the distilled model is English-only, which is fine because
  language is already hardcoded to "en").
- beam_size: 5 (default) -> 1 (~3-4x faster on clean audio).
- cpu_threads: 8 -> 4 (the box has 8 cores running api, dreamer, watcher,
  nextcloud concurrently; cpu_threads sets ctranslate2's intra-op thread
  count, and with the cores shared, contention plus context switching makes
  4 effectively faster than 8 here).

Combined speedup is expected to be ~10-15x over the prior config. No accuracy
regression is expected for the voice-note use case (English, clean audio,
domain terms already supplied via initial_prompt).
---
 scripts/api.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/api.py b/scripts/api.py
index 45420ef..67fdba2 100644
--- a/scripts/api.py
+++ b/scripts/api.py
@@ -73,7 +73,7 @@ WHISPER_PROMPT = (
 whisper_model = None
 if HAS_WHISPER:
     try:
-        whisper_model = WhisperModel("large-v3", device="cpu", compute_type="int8", cpu_threads=8)
+        whisper_model = WhisperModel("distil-large-v3", device="cpu", compute_type="int8", cpu_threads=4)
         print("Whisper model loaded")
     except Exception as e:
         print(f"Whisper not available: {e}")
@@ -623,6 +623,7 @@ async def transcribe_audio(request: Request, audio: UploadFile = File(...), auth
             tmp_path,
             language="en",
             vad_filter=True,
+            beam_size=1,
             initial_prompt=WHISPER_PROMPT
         )
         transcript = " ".join(s.text.strip() for s in segments)
@@ -674,7 +675,7 @@ def transcribe_and_save(tmp_path, timestamp, nextcloud_url, nextcloud_user, next
     nc_auth = (nextcloud_user, nextcloud_password)
     try:
         segments, _ = whisper_model.transcribe(
-            tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
+            tmp_path, language="en", vad_filter=True, beam_size=1, initial_prompt=WHISPER_PROMPT
         )
         transcript = " ".join(s.text.strip() for s in segments).strip()
         os.unlink(tmp_path)
@@ -760,7 +761,7 @@ async def capture_endpoint(
                     tmp.write(audio_bytes)
                     tmp_audio_path = tmp.name
                 segments, _ = whisper_model.transcribe(
-                    tmp_audio_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
+                    tmp_audio_path, language="en", vad_filter=True, beam_size=1, initial_prompt=WHISPER_PROMPT
                 )
                 voice_annotation = " ".join(s.text.strip() for s in segments).strip() or None
                 os.unlink(tmp_audio_path)