Image capture — extend /api/capture for image+voice, Claude vision description, Media/ WebDAV, watcher excludes Media/

This commit is contained in:
2026-04-27 04:28:31 +00:00
parent ef2fddc47f
commit d3239aba17
2 changed files with 174 additions and 55 deletions
+172 -55
View File
@@ -11,7 +11,7 @@ import anthropic
from fastapi import FastAPI, Request, Response, Depends, HTTPException
import psycopg2
import psycopg2.extras
from fastapi import UploadFile, File
from fastapi import UploadFile, File, Form
import tempfile
import os
try:
@@ -683,59 +683,178 @@ async def run_dreamer(request: Request, auth: str = Depends(require_auth)):
return JSONResponse({"started": False, "error": str(e)})
def _audio_suffix(content_type):
    """Pick a temp-file suffix for an uploaded audio blob from its MIME type.

    Whisper/ffmpeg sniffs by extension, so we map the browser-reported
    content type to a sensible suffix; anything unrecognized falls back
    to .webm (the MediaRecorder default).
    """
    if content_type and "mp4" in content_type:
        return ".mp4"
    if content_type and "ogg" in content_type:
        return ".ogg"
    return ".webm"


def _transcribe_bytes(audio_bytes, suffix):
    """Write audio bytes to a temp file, transcribe with Whisper, return text.

    The temp file is always removed (finally), even if transcription raises —
    the original code leaked the file on some exception paths.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name
        segments, _ = whisper_model.transcribe(
            tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
        )
        return " ".join(s.text.strip() for s in segments).strip()
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)


@app.post("/api/capture")
async def capture_endpoint(
    audio: UploadFile = File(None),
    image: UploadFile = File(None),
    project: str = Form(None),
):
    """Auth-free capture endpoint — handles voice, image, or image+voice.

    Dispatch by payload:
      * image (with optional audio annotation): raw file is stored under
        Journal/Media/YYYY-MM/ via WebDAV, then a Claude vision description
        is written to Journal/Captures/ as a markdown episode.
      * audio only: Whisper transcript written to Journal/Captures/.
      * neither: 400.

    Returns a JSON body describing what was stored.  Raises 503 when the
    voice-only path needs Whisper but it isn't loaded, 500 on any other
    processing failure.
    """
    import requests as req_lib
    import base64

    nextcloud_url = os.getenv("NEXTCLOUD_URL", "")
    nextcloud_user = os.getenv("NEXTCLOUD_USER", "aaron")
    nextcloud_password = os.getenv("NEXTCLOUD_PASSWORD", "")
    nc_auth = (nextcloud_user, nextcloud_password)
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
    month_dir = datetime.now().strftime("%Y-%m")

    # ── Image + optional voice ───────────────────────────────────────────────
    if image is not None:
        try:
            image_bytes = await image.read()
            image_content_type = image.content_type or "image/jpeg"
            # NOTE(review): HEIC is saved with a .jpg extension and its original
            # content-type is forwarded to the vision API — this assumes the
            # client transcodes HEIC before upload; confirm, otherwise both the
            # stored extension and the vision media_type are wrong.
            ext_map = {"image/jpeg": "jpg", "image/png": "png", "image/webp": "webp", "image/heic": "jpg"}
            img_ext = ext_map.get(image_content_type, "jpg")
            img_filename = f"{timestamp}-image.{img_ext}"

            # Save the raw image to Media/YYYY-MM/ via WebDAV.  MKCOL is
            # best-effort (returns 405 when the collection already exists, so
            # its status is deliberately ignored), but the PUT must succeed or
            # the capture is silently lost — check it.
            media_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Media/{month_dir}"
            req_lib.request("MKCOL", f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Media", auth=nc_auth, timeout=10)
            req_lib.request("MKCOL", media_dir, auth=nc_auth, timeout=10)
            media_url = f"{media_dir}/{img_filename}"
            put_resp = req_lib.put(media_url, data=image_bytes, auth=nc_auth,
                                   headers={"Content-Type": image_content_type}, timeout=60)
            put_resp.raise_for_status()

            # Transcribe the optional voice annotation (best-effort: skipped
            # entirely when Whisper isn't loaded, rather than failing the
            # whole image capture).
            voice_annotation = None
            if audio is not None and whisper_model:
                audio_bytes = await audio.read()
                voice_annotation = _transcribe_bytes(
                    audio_bytes, _audio_suffix(audio.content_type)
                ) or None

            # Generate the Claude vision description.
            image_b64 = base64.standard_b64encode(image_bytes).decode("utf-8")
            annotation_line = f"Aaron said about this image: \"{voice_annotation}\"" if voice_annotation else ""
            vision_prompt = f"""You are generating a memory description for an AI corpus belonging to Aaron Nelson — computational designer, fabrication researcher, and visual artist working in the Hudson Valley.

Describe this image for long-term memory indexing.

PERCEPTUAL: Composition, materials, light, color, texture, scale, spatial relationships. Be specific enough that this image could be distinguished from visually similar images.

CONTENT: What is this? What domain does it belong to? What is it an instance of?

{annotation_line}

End your response with a single line in this exact format:
ENTITIES: [comma-separated list of key entities — people, objects, materials, places, projects, tools]

Keep the full description to 150-250 words. Do not speculate beyond what is visible or stated. Write as continuous prose followed by the ENTITIES line."""
            vision_response = anthropic_client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=800,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": image_content_type,
                                "data": image_b64,
                            }
                        },
                        {"type": "text", "text": vision_prompt}
                    ]
                }]
            )
            description = vision_response.content[0].text.strip()

            # Build the Graphiti-ready episode markdown.
            capture_type = "image+voice" if voice_annotation else "image"
            modality = "visual+audio" if voice_annotation else "visual"
            media_path = f"Journal/Media/{month_dir}/{img_filename}"
            content_md = f"""# Capture — Image — {timestamp}

**type:** {capture_type}
**modality:** {modality}
**status:** unprocessed
**media:** {media_path}
{f"**project:** {project}" if project else ""}

---

**Visual description:**

{description}

**Voice annotation:**

{voice_annotation if voice_annotation else "none recorded"}

---
"""
            # Save the description episode to Journal/Captures/ via WebDAV.
            captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
            req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)
            cap_filename = f"{timestamp}-image.md"
            cap_url = f"{captures_dir}/{cap_filename}"
            cap_resp = req_lib.put(cap_url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
            cap_resp.raise_for_status()
            return JSONResponse({
                "ok": True,
                "filename": cap_filename,
                "media": media_path,
                "has_voice": voice_annotation is not None,
            })
        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    # ── Voice only ───────────────────────────────────────────────────────────
    elif audio is not None:
        if not whisper_model:
            raise HTTPException(status_code=503, detail="Whisper not available")
        try:
            audio_bytes = await audio.read()
            transcript = _transcribe_bytes(audio_bytes, _audio_suffix(audio.content_type))
            if not transcript:
                return JSONResponse({"ok": False, "error": "No speech detected"})
            # Save transcript to Nextcloud Journal/Captures/ via WebDAV.
            filename = f"{timestamp}-voice.md"
            content_md = f"# Capture — {timestamp}\n\n**type:** voice\n**modality:** audio\n**status:** unprocessed\n\n---\n\n{transcript}\n"
            captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
            req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)
            # FIX: the PUT URL previously interpolated a placeholder instead of
            # the generated filename, so every voice capture overwrote one file.
            url = f"{captures_dir}/{filename}"
            response = req_lib.put(url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
            response.raise_for_status()
            return JSONResponse({"ok": True, "filename": filename, "transcript": transcript})
        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    else:
        raise HTTPException(status_code=400, detail="No audio or image provided")
@app.get("/api/captures")
async def list_captures():
@@ -795,14 +914,12 @@ async def clear_all_conversations(auth: str = Depends(require_auth)):
scheduler = BackgroundScheduler()
def run_dream_job():
"""Runs nightly dreamer — reuses loaded embedder, no subprocess overhead."""
"""Runs nightly dreamer — full interdependent pipeline, no mode flag."""
try:
import subprocess
settings = load_settings()
mode = settings.get("dream_mode", "nrem")
dream_script = str(Path.home() / "aaronai" / "scripts" / "dream.py")
result = subprocess.run(
[PYTHON, dream_script, "--mode", mode],
[PYTHON, dream_script],
cwd=str(Path.home() / "aaronai"),
capture_output=True, text=True, timeout=600
)