api.py: switch whisper to distil-large-v3, beam_size=1, cpu_threads=4
Three changes to reduce voice-note transcription latency on the VPS:

- Model: large-v3 -> distil-large-v3 (~6x faster, near-identical English accuracy; language is already hardcoded to "en").
- beam_size: 5 (default) -> 1, i.e. greedy decoding (~3-4x faster on clean audio).
- cpu_threads: 8 -> 4 (the box has 8 cores running api, dreamer, watcher, and nextcloud concurrently; with 8 ctranslate2 intra-op threads competing against those services, oversubscription and context switching make 4 effectively faster than 8 here).

Combined effect is expected to be ~10-15x over the prior config. No accuracy regression is expected for the voice-note use case (English, clean audio, domain terms already supplied via initial_prompt).
This commit is contained in:
+4
-3
@@ -73,7 +73,7 @@ WHISPER_PROMPT = (
|
|||||||
whisper_model = None
|
whisper_model = None
|
||||||
if HAS_WHISPER:
|
if HAS_WHISPER:
|
||||||
try:
|
try:
|
||||||
whisper_model = WhisperModel("large-v3", device="cpu", compute_type="int8", cpu_threads=8)
|
whisper_model = WhisperModel("distil-large-v3", device="cpu", compute_type="int8", cpu_threads=4)
|
||||||
print("Whisper model loaded")
|
print("Whisper model loaded")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Whisper not available: {e}")
|
print(f"Whisper not available: {e}")
|
||||||
@@ -623,6 +623,7 @@ async def transcribe_audio(request: Request, audio: UploadFile = File(...), auth
|
|||||||
tmp_path,
|
tmp_path,
|
||||||
language="en",
|
language="en",
|
||||||
vad_filter=True,
|
vad_filter=True,
|
||||||
|
beam_size=1,
|
||||||
initial_prompt=WHISPER_PROMPT
|
initial_prompt=WHISPER_PROMPT
|
||||||
)
|
)
|
||||||
transcript = " ".join(s.text.strip() for s in segments)
|
transcript = " ".join(s.text.strip() for s in segments)
|
||||||
@@ -674,7 +675,7 @@ def transcribe_and_save(tmp_path, timestamp, nextcloud_url, nextcloud_user, next
|
|||||||
nc_auth = (nextcloud_user, nextcloud_password)
|
nc_auth = (nextcloud_user, nextcloud_password)
|
||||||
try:
|
try:
|
||||||
segments, _ = whisper_model.transcribe(
|
segments, _ = whisper_model.transcribe(
|
||||||
tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
|
tmp_path, language="en", vad_filter=True, beam_size=1, initial_prompt=WHISPER_PROMPT
|
||||||
)
|
)
|
||||||
transcript = " ".join(s.text.strip() for s in segments).strip()
|
transcript = " ".join(s.text.strip() for s in segments).strip()
|
||||||
os.unlink(tmp_path)
|
os.unlink(tmp_path)
|
||||||
@@ -760,7 +761,7 @@ async def capture_endpoint(
|
|||||||
tmp.write(audio_bytes)
|
tmp.write(audio_bytes)
|
||||||
tmp_audio_path = tmp.name
|
tmp_audio_path = tmp.name
|
||||||
segments, _ = whisper_model.transcribe(
|
segments, _ = whisper_model.transcribe(
|
||||||
tmp_audio_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
|
tmp_audio_path, language="en", vad_filter=True, beam_size=1, initial_prompt=WHISPER_PROMPT
|
||||||
)
|
)
|
||||||
voice_annotation = " ".join(s.text.strip() for s in segments).strip() or None
|
voice_annotation = " ".join(s.text.strip() for s in segments).strip() or None
|
||||||
os.unlink(tmp_audio_path)
|
os.unlink(tmp_audio_path)
|
||||||
|
|||||||
Reference in New Issue
Block a user