async voice transcription — return immediately, whisper runs in background

2026-04-29 17:48:22 +00:00
parent eb7cf3be10
commit a05fcec882
1 changed files with 37 additions and 19 deletions
@@ -8,7 +8,7 @@ from datetime import datetime
 from dotenv import load_dotenv
 from sentence_transformers import SentenceTransformer
 import anthropic
-from fastapi import FastAPI, Request, Response, Depends, HTTPException
+from fastapi import FastAPI, Request, Response, Depends, HTTPException, BackgroundTasks
 import psycopg2
 import psycopg2.extras
 from fastapi import UploadFile, File, Form
@@ -696,8 +696,35 @@ async def run_dreamer(request: Request, auth: str = Depends(require_auth)):
    except Exception as e:
        return JSONResponse({"started": False, "error": str(e)})

+def transcribe_and_save(tmp_path, timestamp, nextcloud_url, nextcloud_user, nextcloud_password):
+    """Background task — transcribes audio and saves to Nextcloud after endpoint returns.
+
+    Nothing is returned and no error is reported to a caller: the outcome (saved,
+    empty transcript, or failure) is only printed.
+
+    Args:
+        tmp_path: Path of the temporary audio file to transcribe. It is removed
+            before this function returns, whether or not transcription succeeds.
+        timestamp: Capture timestamp string; used in the note heading and as the
+            prefix of the ``{timestamp}-voice.md`` filename.
+        nextcloud_url: Base URL of the Nextcloud server; the WebDAV path is appended.
+        nextcloud_user: Nextcloud username; also part of the WebDAV path.
+        nextcloud_password: Password sent together with ``nextcloud_user`` as auth.
+    """
+    import requests as req_lib
+    nc_auth = (nextcloud_user, nextcloud_password)
+    try:
+        segments, _ = whisper_model.transcribe(
+            tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
+        )
+        transcript = " ".join(s.text.strip() for s in segments).strip()
+        if not transcript:
+            print(f"Async transcription empty for {timestamp} — nothing saved")
+            return
+        filename = f"{timestamp}-voice.md"
+        content_md = f"# Capture — {timestamp}\n\n**type:** voice\n**modality:** audio\n**status:** unprocessed\n\n---\n\n{transcript}\n"
+        captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
+        # Best-effort folder creation: the status is deliberately ignored, because
+        # an already-existing folder is not an error here and a genuinely unusable
+        # folder makes the PUT below fail instead.
+        req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)
+        url = f"{captures_dir}/{filename}"
+        resp = req_lib.put(url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
+        # requests does not raise on 4xx/5xx by itself; without this check a rejected
+        # upload was still logged as "saved" although nothing was stored.
+        resp.raise_for_status()
+        print(f"Async transcription saved: {filename}")
+    except Exception as e:
+        # Top-level boundary of the background task: there is no caller left to
+        # report to, so log and swallow.
+        print(f"Async transcription failed for {timestamp}: {e}")
+    finally:
+        # Single cleanup point for every exit path (saved, empty, failed).
+        try:
+            os.unlink(tmp_path)
+        except FileNotFoundError:
+            pass  # already gone — nothing to clean up
+        except OSError as e:
+            print(f"Async transcription could not remove {tmp_path}: {e}")
+
+
@app.post("/api/capture")
 async def capture_endpoint(
+    background_tasks: BackgroundTasks,
    audio: UploadFile = File(None),
    image: UploadFile = File(None),
    project: str = Form(None),
@@ -836,7 +863,6 @@ Keep the full description to 150-250 words. Do not speculate beyond what is visi
    elif audio is not None:
        if not whisper_model:
            raise HTTPException(status_code=503, detail="Whisper not available")
-        tmp_path = None
        try:
            suffix = ".webm"
            if audio.content_type and "mp4" in audio.content_type:
@@ -847,25 +873,17 @@ Keep the full description to 150-250 words. Do not speculate beyond what is visi
                content_bytes = await audio.read()
                tmp.write(content_bytes)
                tmp_path = tmp.name
-            segments, _ = whisper_model.transcribe(
-                tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
+            background_tasks.add_task(
+                transcribe_and_save,
+                tmp_path=tmp_path,
+                timestamp=timestamp,
+                nextcloud_url=nextcloud_url,
+                nextcloud_user=nextcloud_user,
+                nextcloud_password=nextcloud_password,
            )
-            transcript = " ".join(s.text.strip() for s in segments).strip()
-            os.unlink(tmp_path)
-            tmp_path = None
-            if not transcript:
-                return JSONResponse({"ok": False, "error": "No speech detected"})
-            filename = f"{timestamp}-voice.md"
-            content_md = f"# Capture — {timestamp}\n\n**type:** voice\n**modality:** audio\n**status:** unprocessed\n\n---\n\n{transcript}\n"
-            captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
-            req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)
-            url = f"{captures_dir}/{filename}"
-            req_lib.put(url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
-            return JSONResponse({"ok": True, "filename": filename, "transcript": transcript})
+            return JSONResponse({"ok": True, "filename": f"{timestamp}-voice.md", "async": True})
        except Exception as e:
-            if tmp_path and os.path.exists(tmp_path):
-                os.unlink(tmp_path)
-            return JSONResponse({"ok": False, "error": str(e), "error_type": "transcription_failed"})
+            return JSONResponse({"ok": False, "error": str(e), "error_type": "capture_failed"})

    else:
        raise HTTPException(status_code=400, detail="No audio or image provided")