diff --git a/scripts/api.py b/scripts/api.py index 45420ef..67fdba2 100644 --- a/scripts/api.py +++ b/scripts/api.py @@ -73,7 +73,7 @@ WHISPER_PROMPT = ( whisper_model = None if HAS_WHISPER: try: - whisper_model = WhisperModel("large-v3", device="cpu", compute_type="int8", cpu_threads=8) + whisper_model = WhisperModel("distil-large-v3", device="cpu", compute_type="int8", cpu_threads=4) print("Whisper model loaded") except Exception as e: print(f"Whisper not available: {e}") @@ -623,6 +623,7 @@ async def transcribe_audio(request: Request, audio: UploadFile = File(...), auth tmp_path, language="en", vad_filter=True, + beam_size=1, initial_prompt=WHISPER_PROMPT ) transcript = " ".join(s.text.strip() for s in segments) @@ -674,7 +675,7 @@ def transcribe_and_save(tmp_path, timestamp, nextcloud_url, nextcloud_user, next nc_auth = (nextcloud_user, nextcloud_password) try: segments, _ = whisper_model.transcribe( - tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT + tmp_path, language="en", vad_filter=True, beam_size=1, initial_prompt=WHISPER_PROMPT ) transcript = " ".join(s.text.strip() for s in segments).strip() os.unlink(tmp_path) @@ -760,7 +761,7 @@ async def capture_endpoint( tmp.write(audio_bytes) tmp_audio_path = tmp.name segments, _ = whisper_model.transcribe( - tmp_audio_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT + tmp_audio_path, language="en", vad_filter=True, beam_size=1, initial_prompt=WHISPER_PROMPT ) voice_annotation = " ".join(s.text.strip() for s in segments).strip() or None os.unlink(tmp_audio_path)