From 050fe4669b3ba73f2ca589ec16fbe8c49b98f016 Mon Sep 17 00:00:00 2001
From: Aaron Nelson
Date: Sun, 26 Apr 2026 15:25:22 +0000
Subject: [PATCH] =?UTF-8?q?Add=20Whisper=20small=20model=20=E2=80=94=20/ap?=
 =?UTF-8?q?i/transcribe=20endpoint,=20VAD=20filter,=20domain=20vocabulary?=
 =?UTF-8?q?=20prompt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/api.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/scripts/api.py b/scripts/api.py
index abdba8a..b8caca7 100644
--- a/scripts/api.py
+++ b/scripts/api.py
@@ -10,6 +10,14 @@ import chromadb
 from sentence_transformers import SentenceTransformer
 import anthropic
 from fastapi import FastAPI, Request, Response, Depends, HTTPException
+from fastapi import UploadFile, File
+import tempfile
+import os
+try:
+    from faster_whisper import WhisperModel
+    HAS_WHISPER = True
+except ImportError:
+    HAS_WHISPER = False
 from fastapi.responses import FileResponse, JSONResponse
 import secrets
 import hashlib
@@ -38,6 +46,18 @@ DEFAULT_SETTINGS = {
 }
 
 print("Loading Aaron AI...")
+WHISPER_PROMPT = (
+    "Grasshopper, Rhino, PolyJet, SLA, FDM, DMLS, ChromaDB, "
+    "HVAMC, FWN3D, Mossygear, Nextcloud, Gitea, computational design, "
+    "additive manufacturing, parametric, fabrication"
+)
+whisper_model = None
+if HAS_WHISPER:
+    try:
+        whisper_model = WhisperModel("small", device="cpu", compute_type="int8", cpu_threads=4)
+        print("Whisper model loaded")
+    except Exception as e:
+        print(f"Whisper not available: {e}")
 embedder = SentenceTransformer("all-MiniLM-L6-v2")
 chroma_client = chromadb.PersistentClient(path=DB_PATH)
 collection = chroma_client.get_or_create_collection(
@@ -533,6 +553,38 @@ async def get_status(auth: str = Depends(require_auth)):
         "nextcloud_path": NEXTCLOUD_PATH
     })
 
+@app.post("/api/transcribe")
+async def transcribe_audio(request: Request, audio: UploadFile = File(...), auth: str = Depends(require_auth)):
+    """Transcribe an uploaded audio clip; returns {"text", "language"} JSON."""
+    if not whisper_model:
+        raise HTTPException(status_code=503, detail="Whisper not available")
+    tmp_path = None
+    try:
+        # Suffix hints the container format to the decoder.
+        suffix = ".webm"
+        if audio.content_type and "mp4" in audio.content_type:
+            suffix = ".mp4"
+        elif audio.content_type and "ogg" in audio.content_type:
+            suffix = ".ogg"
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+            tmp.write(await audio.read())
+            tmp_path = tmp.name
+        segments, info = whisper_model.transcribe(
+            tmp_path,
+            language="en",
+            vad_filter=True,
+            initial_prompt=WHISPER_PROMPT
+        )
+        transcript = " ".join(s.text.strip() for s in segments)
+        return JSONResponse({"text": transcript, "language": info.language})
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        # Always remove the temp file, even when transcription fails; avoids
+        # a NameError when the failure precedes the tmp_path assignment.
+        if tmp_path and os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+
 @app.post("/api/reindex")
 async def trigger_reindex(auth: str = Depends(require_auth)):
     try: