From 050fe4669b3ba73f2ca589ec16fbe8c49b98f016 Mon Sep 17 00:00:00 2001
From: Aaron Nelson
Date: Sun, 26 Apr 2026 15:25:22 +0000
Subject: [PATCH] =?UTF-8?q?Add=20Whisper=20small=20model=20=E2=80=94=20/ap?=
 =?UTF-8?q?i/transcribe=20endpoint,=20VAD=20filter,=20domain=20vocabulary?=
 =?UTF-8?q?=20prompt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/api.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/scripts/api.py b/scripts/api.py
index abdba8a..b8caca7 100644
--- a/scripts/api.py
+++ b/scripts/api.py
@@ -10,6 +10,14 @@ import chromadb
 from sentence_transformers import SentenceTransformer
 import anthropic
 from fastapi import FastAPI, Request, Response, Depends, HTTPException
+from fastapi import UploadFile, File
+import tempfile
+import os
+try:
+    from faster_whisper import WhisperModel
+    HAS_WHISPER = True
+except ImportError:
+    HAS_WHISPER = False
 from fastapi.responses import FileResponse, JSONResponse
 import secrets
 import hashlib
@@ -38,6 +46,18 @@ DEFAULT_SETTINGS = {
 }
 
 print("Loading Aaron AI...")
+WHISPER_PROMPT = (
+    "Grasshopper, Rhino, PolyJet, SLA, FDM, DMLS, ChromaDB, "
+    "HVAMC, FWN3D, Mossygear, Nextcloud, Gitea, computational design, "
+    "additive manufacturing, parametric, fabrication"
+)
+whisper_model = None
+if HAS_WHISPER:
+    try:
+        whisper_model = WhisperModel("small", device="cpu", compute_type="int8", cpu_threads=4)
+        print("Whisper model loaded")
+    except Exception as e:
+        print(f"Whisper not available: {e}")
 embedder = SentenceTransformer("all-MiniLM-L6-v2")
 chroma_client = chromadb.PersistentClient(path=DB_PATH)
 collection = chroma_client.get_or_create_collection(
@@ -533,6 +553,38 @@ async def get_status(auth: str = Depends(require_auth)):
         "nextcloud_path": NEXTCLOUD_PATH
     })
 
+@app.post("/api/transcribe")
+async def transcribe_audio(request: Request, audio: UploadFile = File(...), auth: str = Depends(require_auth)):
+    """Transcribe an uploaded audio clip; returns {"text", "language"} JSON."""
+    if not whisper_model:
+        raise HTTPException(status_code=503, detail="Whisper not available")
+    tmp_path = None
+    try:
+        # Suffix hints the container format to the decoder.
+        suffix = ".webm"
+        if audio.content_type and "mp4" in audio.content_type:
+            suffix = ".mp4"
+        elif audio.content_type and "ogg" in audio.content_type:
+            suffix = ".ogg"
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+            tmp.write(await audio.read())
+            tmp_path = tmp.name
+        segments, info = whisper_model.transcribe(
+            tmp_path,
+            language="en",
+            vad_filter=True,
+            initial_prompt=WHISPER_PROMPT
+        )
+        transcript = " ".join(s.text.strip() for s in segments)
+        return JSONResponse({"text": transcript, "language": info.language})
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        # Always remove the temp file, even when transcription fails; avoids
+        # a NameError when the failure precedes the tmp_path assignment.
+        if tmp_path and os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+
 @app.post("/api/reindex")
 async def trigger_reindex(auth: str = Depends(require_auth)):
     try: