Image capture — extend /api/capture for image+voice, Claude vision description, Media/ WebDAV, watcher excludes Media/

This commit is contained in:
2026-04-27 04:28:31 +00:00
parent ef2fddc47f
commit d3239aba17
2 changed files with 174 additions and 55 deletions
+172 -55
View File
@@ -11,7 +11,7 @@ import anthropic
from fastapi import FastAPI, Request, Response, Depends, HTTPException
import psycopg2
import psycopg2.extras
from fastapi import UploadFile, File
from fastapi import UploadFile, File, Form
import tempfile
import os
try:
@@ -683,59 +683,178 @@ async def run_dreamer(request: Request, auth: str = Depends(require_auth)):
return JSONResponse({"started": False, "error": str(e)})
def _audio_suffix(content_type):
    """Pick a temp-file suffix for an uploaded audio blob from its MIME type.

    Whisper/ffmpeg sniffs by extension, so we map the browser-reported
    content type to a sensible suffix; anything unrecognized falls back
    to .webm (the MediaRecorder default).
    """
    if content_type and "mp4" in content_type:
        return ".mp4"
    if content_type and "ogg" in content_type:
        return ".ogg"
    return ".webm"


def _transcribe_bytes(audio_bytes, suffix):
    """Write audio bytes to a temp file, transcribe with Whisper, return text.

    The temp file is always removed (finally), even if transcription raises —
    the original code leaked the file on some exception paths.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name
        segments, _ = whisper_model.transcribe(
            tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
        )
        return " ".join(s.text.strip() for s in segments).strip()
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)


@app.post("/api/capture")
async def capture_endpoint(
    audio: UploadFile = File(None),
    image: UploadFile = File(None),
    project: str = Form(None),
):
    """Auth-free capture endpoint — handles voice, image, or image+voice.

    Dispatch by payload:
      * image (with optional audio annotation): raw file is stored under
        Journal/Media/YYYY-MM/ via WebDAV, then a Claude vision description
        is written to Journal/Captures/ as a markdown episode.
      * audio only: Whisper transcript written to Journal/Captures/.
      * neither: 400.

    Returns a JSON body describing what was stored.  Raises 503 when the
    voice-only path needs Whisper but it isn't loaded, 500 on any other
    processing failure.
    """
    import requests as req_lib
    import base64

    nextcloud_url = os.getenv("NEXTCLOUD_URL", "")
    nextcloud_user = os.getenv("NEXTCLOUD_USER", "aaron")
    nextcloud_password = os.getenv("NEXTCLOUD_PASSWORD", "")
    nc_auth = (nextcloud_user, nextcloud_password)
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
    month_dir = datetime.now().strftime("%Y-%m")

    # ── Image + optional voice ───────────────────────────────────────────────
    if image is not None:
        try:
            image_bytes = await image.read()
            image_content_type = image.content_type or "image/jpeg"
            # NOTE(review): HEIC is saved with a .jpg extension and its original
            # content-type is forwarded to the vision API — this assumes the
            # client transcodes HEIC before upload; confirm, otherwise both the
            # stored extension and the vision media_type are wrong.
            ext_map = {"image/jpeg": "jpg", "image/png": "png", "image/webp": "webp", "image/heic": "jpg"}
            img_ext = ext_map.get(image_content_type, "jpg")
            img_filename = f"{timestamp}-image.{img_ext}"

            # Save the raw image to Media/YYYY-MM/ via WebDAV.  MKCOL is
            # best-effort (returns 405 when the collection already exists, so
            # its status is deliberately ignored), but the PUT must succeed or
            # the capture is silently lost — check it.
            media_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Media/{month_dir}"
            req_lib.request("MKCOL", f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Media", auth=nc_auth, timeout=10)
            req_lib.request("MKCOL", media_dir, auth=nc_auth, timeout=10)
            media_url = f"{media_dir}/{img_filename}"
            put_resp = req_lib.put(media_url, data=image_bytes, auth=nc_auth,
                                   headers={"Content-Type": image_content_type}, timeout=60)
            put_resp.raise_for_status()

            # Transcribe the optional voice annotation (best-effort: skipped
            # entirely when Whisper isn't loaded, rather than failing the
            # whole image capture).
            voice_annotation = None
            if audio is not None and whisper_model:
                audio_bytes = await audio.read()
                voice_annotation = _transcribe_bytes(
                    audio_bytes, _audio_suffix(audio.content_type)
                ) or None

            # Generate the Claude vision description.
            image_b64 = base64.standard_b64encode(image_bytes).decode("utf-8")
            annotation_line = f"Aaron said about this image: \"{voice_annotation}\"" if voice_annotation else ""
            vision_prompt = f"""You are generating a memory description for an AI corpus belonging to Aaron Nelson — computational designer, fabrication researcher, and visual artist working in the Hudson Valley.

Describe this image for long-term memory indexing.

PERCEPTUAL: Composition, materials, light, color, texture, scale, spatial relationships. Be specific enough that this image could be distinguished from visually similar images.

CONTENT: What is this? What domain does it belong to? What is it an instance of?

{annotation_line}

End your response with a single line in this exact format:
ENTITIES: [comma-separated list of key entities — people, objects, materials, places, projects, tools]

Keep the full description to 150-250 words. Do not speculate beyond what is visible or stated. Write as continuous prose followed by the ENTITIES line."""
            vision_response = anthropic_client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=800,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": image_content_type,
                                "data": image_b64,
                            }
                        },
                        {"type": "text", "text": vision_prompt}
                    ]
                }]
            )
            description = vision_response.content[0].text.strip()

            # Build the Graphiti-ready episode markdown.
            capture_type = "image+voice" if voice_annotation else "image"
            modality = "visual+audio" if voice_annotation else "visual"
            media_path = f"Journal/Media/{month_dir}/{img_filename}"
            content_md = f"""# Capture — Image — {timestamp}

**type:** {capture_type}
**modality:** {modality}
**status:** unprocessed
**media:** {media_path}
{f"**project:** {project}" if project else ""}

---

**Visual description:**

{description}

**Voice annotation:**

{voice_annotation if voice_annotation else "none recorded"}

---
"""
            # Save the description episode to Journal/Captures/ via WebDAV.
            captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
            req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)
            cap_filename = f"{timestamp}-image.md"
            cap_url = f"{captures_dir}/{cap_filename}"
            cap_resp = req_lib.put(cap_url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
            cap_resp.raise_for_status()
            return JSONResponse({
                "ok": True,
                "filename": cap_filename,
                "media": media_path,
                "has_voice": voice_annotation is not None,
            })
        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    # ── Voice only ───────────────────────────────────────────────────────────
    elif audio is not None:
        if not whisper_model:
            raise HTTPException(status_code=503, detail="Whisper not available")
        try:
            audio_bytes = await audio.read()
            transcript = _transcribe_bytes(audio_bytes, _audio_suffix(audio.content_type))
            if not transcript:
                return JSONResponse({"ok": False, "error": "No speech detected"})
            # Save transcript to Nextcloud Journal/Captures/ via WebDAV.
            filename = f"{timestamp}-voice.md"
            content_md = f"# Capture — {timestamp}\n\n**type:** voice\n**modality:** audio\n**status:** unprocessed\n\n---\n\n{transcript}\n"
            captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
            req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)
            # FIX: the PUT URL previously interpolated a placeholder instead of
            # the generated filename, so every voice capture overwrote one file.
            url = f"{captures_dir}/{filename}"
            response = req_lib.put(url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
            response.raise_for_status()
            return JSONResponse({"ok": True, "filename": filename, "transcript": transcript})
        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    else:
        raise HTTPException(status_code=400, detail="No audio or image provided")
@app.get("/api/captures")
async def list_captures():
@@ -795,14 +914,12 @@ async def clear_all_conversations(auth: str = Depends(require_auth)):
scheduler = BackgroundScheduler()
def run_dream_job():
"""Runs nightly dreamer — reuses loaded embedder, no subprocess overhead."""
"""Runs nightly dreamer — full interdependent pipeline, no mode flag."""
try:
import subprocess
settings = load_settings()
mode = settings.get("dream_mode", "nrem")
dream_script = str(Path.home() / "aaronai" / "scripts" / "dream.py")
result = subprocess.run(
[PYTHON, dream_script, "--mode", mode],
[PYTHON, dream_script],
cwd=str(Path.home() / "aaronai"),
capture_output=True, text=True, timeout=600
)