async voice transcription — return immediately, whisper runs in background

This commit is contained in:
2026-04-29 17:48:22 +00:00
parent eb7cf3be10
commit a05fcec882
+37 -19
View File
@@ -8,7 +8,7 @@ from datetime import datetime
from dotenv import load_dotenv from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
import anthropic import anthropic
from fastapi import FastAPI, Request, Response, Depends, HTTPException from fastapi import FastAPI, Request, Response, Depends, HTTPException, BackgroundTasks
import psycopg2 import psycopg2
import psycopg2.extras import psycopg2.extras
from fastapi import UploadFile, File, Form from fastapi import UploadFile, File, Form
@@ -696,8 +696,35 @@ async def run_dreamer(request: Request, auth: str = Depends(require_auth)):
except Exception as e: except Exception as e:
return JSONResponse({"started": False, "error": str(e)}) return JSONResponse({"started": False, "error": str(e)})
def transcribe_and_save(tmp_path, timestamp, nextcloud_url, nextcloud_user, nextcloud_password):
    """Background task — transcribes audio and saves to Nextcloud after endpoint returns.

    Runs the module-level ``whisper_model`` on the audio file at ``tmp_path``,
    always removes that temporary file afterwards, and uploads the transcript
    as ``{timestamp}-voice.md`` into ``Journal/Captures`` via WebDAV.

    Args:
        tmp_path: Path of the temporary audio file; deleted by this function.
        timestamp: String used in the capture heading and the file name.
        nextcloud_url: Base URL of the Nextcloud instance (no trailing slash
            is added or stripped here).
        nextcloud_user: Nextcloud user name; also part of the WebDAV path.
        nextcloud_password: Password used together with ``nextcloud_user``
            for HTTP basic auth.

    Returns:
        None. Nothing is raised to the caller: every failure is caught and
        printed, because there is no request left to report it to.
    """
    import requests as req_lib

    nc_auth = (nextcloud_user, nextcloud_password)
    try:
        try:
            segments, _ = whisper_model.transcribe(
                tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
            )
            transcript = " ".join(s.text.strip() for s in segments).strip()
        finally:
            # Single cleanup point for both success and failure. Removal is
            # best-effort so a cleanup problem never discards a transcript
            # that was produced successfully.
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_err:
                print(f"Could not remove temp file {tmp_path}: {cleanup_err}")

        if not transcript:
            print(f"Async transcription empty for {timestamp} — nothing saved")
            return

        filename = f"{timestamp}-voice.md"
        content_md = f"# Capture — {timestamp}\n\n**type:** voice\n**modality:** audio\n**status:** unprocessed\n\n---\n\n{transcript}\n"
        captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"

        # MKCOL is deliberately unchecked: when the collection already exists
        # the server answers with an error status (405 per RFC 4918), which is
        # the normal case. Real problems surface on the PUT below.
        req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)

        url = f"{captures_dir}/{filename}"
        upload = req_lib.put(url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
        # requests does not raise on 4xx/5xx by itself; without this check a
        # rejected upload would be logged as "saved" and the transcript lost.
        upload.raise_for_status()
        print(f"Async transcription saved: {filename}")
    except Exception as e:
        # Top-level boundary of the background task: log and swallow.
        print(f"Async transcription failed for {timestamp}: {e}")
@app.post("/api/capture") @app.post("/api/capture")
async def capture_endpoint( async def capture_endpoint(
background_tasks: BackgroundTasks,
audio: UploadFile = File(None), audio: UploadFile = File(None),
image: UploadFile = File(None), image: UploadFile = File(None),
project: str = Form(None), project: str = Form(None),
@@ -836,7 +863,6 @@ Keep the full description to 150-250 words. Do not speculate beyond what is visi
elif audio is not None: elif audio is not None:
if not whisper_model: if not whisper_model:
raise HTTPException(status_code=503, detail="Whisper not available") raise HTTPException(status_code=503, detail="Whisper not available")
tmp_path = None
try: try:
suffix = ".webm" suffix = ".webm"
if audio.content_type and "mp4" in audio.content_type: if audio.content_type and "mp4" in audio.content_type:
@@ -847,25 +873,17 @@ Keep the full description to 150-250 words. Do not speculate beyond what is visi
content_bytes = await audio.read() content_bytes = await audio.read()
tmp.write(content_bytes) tmp.write(content_bytes)
tmp_path = tmp.name tmp_path = tmp.name
segments, _ = whisper_model.transcribe( background_tasks.add_task(
tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT transcribe_and_save,
tmp_path=tmp_path,
timestamp=timestamp,
nextcloud_url=nextcloud_url,
nextcloud_user=nextcloud_user,
nextcloud_password=nextcloud_password,
) )
transcript = " ".join(s.text.strip() for s in segments).strip() return JSONResponse({"ok": True, "filename": f"{timestamp}-voice.md", "async": True})
os.unlink(tmp_path)
tmp_path = None
if not transcript:
return JSONResponse({"ok": False, "error": "No speech detected"})
filename = f"{timestamp}-voice.md"
content_md = f"# Capture — {timestamp}\n\n**type:** voice\n**modality:** audio\n**status:** unprocessed\n\n---\n\n{transcript}\n"
captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)
url = f"{captures_dir}/{filename}"
req_lib.put(url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
return JSONResponse({"ok": True, "filename": filename, "transcript": transcript})
except Exception as e: except Exception as e:
if tmp_path and os.path.exists(tmp_path): return JSONResponse({"ok": False, "error": str(e), "error_type": "capture_failed"})
os.unlink(tmp_path)
return JSONResponse({"ok": False, "error": str(e), "error_type": "transcription_failed"})
else: else:
raise HTTPException(status_code=400, detail="No audio or image provided") raise HTTPException(status_code=400, detail="No audio or image provided")