Add Whisper small model — /api/transcribe endpoint, VAD filter, domain vocabulary prompt
This commit is contained in:
@@ -10,6 +10,14 @@ import chromadb
|
|||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
import anthropic
|
import anthropic
|
||||||
from fastapi import FastAPI, Request, Response, Depends, HTTPException
|
from fastapi import FastAPI, Request, Response, Depends, HTTPException
|
||||||
|
from fastapi import UploadFile, File
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
# Optional dependency: faster-whisper powers the /api/transcribe endpoint.
# The server still boots without it; the endpoint then answers 503.
try:
    from faster_whisper import WhisperModel
except ImportError:
    HAS_WHISPER = False
else:
    HAS_WHISPER = True
from fastapi.responses import FileResponse, JSONResponse
|
from fastapi.responses import FileResponse, JSONResponse
|
||||||
import secrets
|
import secrets
|
||||||
import hashlib
|
import hashlib
|
||||||
@@ -38,6 +46,18 @@ DEFAULT_SETTINGS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
print("Loading Aaron AI...")

# Domain vocabulary handed to Whisper as its initial prompt so that
# project/tool names are spelled correctly instead of guessed phonetically.
WHISPER_PROMPT = (
    "Grasshopper, Rhino, PolyJet, SLA, FDM, DMLS, ChromaDB, "
    "HVAMC, FWN3D, Mossygear, Nextcloud, Gitea, computational design, "
    "additive manufacturing, parametric, fabrication"
)

# Lazy-tolerant model load: stays None when faster-whisper is missing or
# the model cannot be fetched, and /api/transcribe degrades to a 503.
whisper_model = None
if HAS_WHISPER:
    try:
        # "small" on CPU with int8 quantization: reasonable accuracy/latency
        # trade-off for a self-hosted box without a GPU.
        whisper_model = WhisperModel("small", device="cpu", compute_type="int8", cpu_threads=4)
        print("Whisper model loaded")
    except Exception as e:
        # Model download/load can fail (no disk space, no network) — keep booting.
        print(f"Whisper not available: {e}")

# Embedding + vector-store setup for the retrieval side of the app.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
chroma_client = chromadb.PersistentClient(path=DB_PATH)
collection = chroma_client.get_or_create_collection(
|
collection = chroma_client.get_or_create_collection(
|
||||||
@@ -533,6 +553,34 @@ async def get_status(auth: str = Depends(require_auth)):
|
|||||||
"nextcloud_path": NEXTCLOUD_PATH
|
"nextcloud_path": NEXTCLOUD_PATH
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@app.post("/api/transcribe")
async def transcribe_audio(request: Request, audio: UploadFile = File(...), auth: str = Depends(require_auth)):
    """Transcribe an uploaded audio clip with the local Whisper model.

    Args:
        request: incoming request (kept for signature compatibility).
        audio: uploaded audio file (webm/mp4/ogg containers supported).
        auth: authenticated principal injected by ``require_auth``.

    Returns:
        JSONResponse with ``text`` (the joined transcript) and ``language``
        (the language Whisper detected/used).

    Raises:
        HTTPException: 503 when Whisper is not loaded, 500 on any
        transcription failure.
    """
    if not whisper_model:
        raise HTTPException(status_code=503, detail="Whisper not available")

    # Choose a temp-file suffix matching the uploaded container so the
    # decoder inside faster-whisper can sniff the format correctly.
    suffix = ".webm"
    if audio.content_type and "mp4" in audio.content_type:
        suffix = ".mp4"
    elif audio.content_type and "ogg" in audio.content_type:
        suffix = ".ogg"

    # BUG FIX: tmp_path was previously assigned only inside the try body, so a
    # failure before that point (e.g. audio.read()) made the except handler
    # raise NameError instead of reporting the real error.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(await audio.read())
            tmp_path = tmp.name
        segments, info = whisper_model.transcribe(
            tmp_path,
            language="en",
            vad_filter=True,                 # skip silent stretches before decoding
            initial_prompt=WHISPER_PROMPT,   # bias decoding toward domain vocabulary
        )
        transcript = " ".join(s.text.strip() for s in segments)
        return JSONResponse({"text": transcript, "language": info.language})
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Single cleanup path: remove the temp file on success and failure alike.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
||||||
|
|
||||||
@app.post("/api/reindex")
|
@app.post("/api/reindex")
|
||||||
async def trigger_reindex(auth: str = Depends(require_auth)):
|
async def trigger_reindex(auth: str = Depends(require_auth)):
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user