Image capture — extend /api/capture for image+voice, Claude vision description, Media/ WebDAV, watcher excludes Media/
This commit is contained in:
+172
-55
@@ -11,7 +11,7 @@ import anthropic
|
||||
from fastapi import FastAPI, Request, Response, Depends, HTTPException
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from fastapi import UploadFile, File
|
||||
from fastapi import UploadFile, File, Form
|
||||
import tempfile
|
||||
import os
|
||||
try:
|
||||
@@ -683,59 +683,178 @@ async def run_dreamer(request: Request, auth: str = Depends(require_auth)):
|
||||
return JSONResponse({"started": False, "error": str(e)})
|
||||
|
||||
@app.post("/api/capture")
|
||||
async def capture_audio(audio: UploadFile = File(...)):
|
||||
"""Auth-free capture endpoint — saves transcribed audio to Nextcloud Journal/Captures/"""
|
||||
if not whisper_model:
|
||||
raise HTTPException(status_code=503, detail="Whisper not available")
|
||||
tmp_path = None
|
||||
try:
|
||||
suffix = ".webm"
|
||||
if audio.content_type and "mp4" in audio.content_type:
|
||||
suffix = ".mp4"
|
||||
elif audio.content_type and "ogg" in audio.content_type:
|
||||
suffix = ".ogg"
|
||||
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
||||
content = await audio.read()
|
||||
tmp.write(content)
|
||||
tmp_path = tmp.name
|
||||
async def capture_endpoint(
|
||||
audio: UploadFile = File(None),
|
||||
image: UploadFile = File(None),
|
||||
project: str = Form(None),
|
||||
):
|
||||
"""Auth-free capture endpoint — handles voice, image, or image+voice."""
|
||||
import requests as req_lib
|
||||
import base64
|
||||
|
||||
segments, info = whisper_model.transcribe(
|
||||
tmp_path,
|
||||
language="en",
|
||||
vad_filter=True,
|
||||
initial_prompt=WHISPER_PROMPT
|
||||
)
|
||||
transcript = " ".join(s.text.strip() for s in segments).strip()
|
||||
os.unlink(tmp_path)
|
||||
nextcloud_url = os.getenv("NEXTCLOUD_URL", "")
|
||||
nextcloud_user = os.getenv("NEXTCLOUD_USER", "aaron")
|
||||
nextcloud_password = os.getenv("NEXTCLOUD_PASSWORD", "")
|
||||
nc_auth = (nextcloud_user, nextcloud_password)
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
|
||||
month_dir = datetime.now().strftime("%Y-%m")
|
||||
|
||||
# ── Image + optional voice ───────────────────────────────────────────────
|
||||
if image is not None:
|
||||
tmp_audio_path = None
|
||||
try:
|
||||
# Read image bytes
|
||||
image_bytes = await image.read()
|
||||
image_content_type = image.content_type or "image/jpeg"
|
||||
# Determine extension
|
||||
ext_map = {"image/jpeg": "jpg", "image/png": "png", "image/webp": "webp", "image/heic": "jpg"}
|
||||
img_ext = ext_map.get(image_content_type, "jpg")
|
||||
img_filename = f"{timestamp}-image.{img_ext}"
|
||||
|
||||
# Save raw image to Media/YYYY-MM/ via WebDAV
|
||||
media_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Media/{month_dir}"
|
||||
req_lib.request("MKCOL", f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Media", auth=nc_auth, timeout=10)
|
||||
req_lib.request("MKCOL", media_dir, auth=nc_auth, timeout=10)
|
||||
media_url = f"{media_dir}/{img_filename}"
|
||||
req_lib.put(media_url, data=image_bytes, auth=nc_auth,
|
||||
headers={"Content-Type": image_content_type}, timeout=60)
|
||||
|
||||
# Transcribe voice annotation if present
|
||||
voice_annotation = None
|
||||
if audio is not None and whisper_model:
|
||||
audio_bytes = await audio.read()
|
||||
suffix = ".webm"
|
||||
if audio.content_type and "mp4" in audio.content_type:
|
||||
suffix = ".mp4"
|
||||
elif audio.content_type and "ogg" in audio.content_type:
|
||||
suffix = ".ogg"
|
||||
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
||||
tmp.write(audio_bytes)
|
||||
tmp_audio_path = tmp.name
|
||||
segments, _ = whisper_model.transcribe(
|
||||
tmp_audio_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
|
||||
)
|
||||
voice_annotation = " ".join(s.text.strip() for s in segments).strip() or None
|
||||
os.unlink(tmp_audio_path)
|
||||
tmp_audio_path = None
|
||||
|
||||
# Generate Claude vision description
|
||||
image_b64 = base64.standard_b64encode(image_bytes).decode("utf-8")
|
||||
annotation_line = f"Aaron said about this image: \"{voice_annotation}\"" if voice_annotation else ""
|
||||
vision_prompt = f"""You are generating a memory description for an AI corpus belonging to Aaron Nelson — computational designer, fabrication researcher, and visual artist working in the Hudson Valley.
|
||||
|
||||
Describe this image for long-term memory indexing.
|
||||
|
||||
PERCEPTUAL: Composition, materials, light, color, texture, scale, spatial relationships. Be specific enough that this image could be distinguished from visually similar images.
|
||||
|
||||
CONTENT: What is this? What domain does it belong to? What is it an instance of?
|
||||
|
||||
{annotation_line}
|
||||
|
||||
End your response with a single line in this exact format:
|
||||
ENTITIES: [comma-separated list of key entities — people, objects, materials, places, projects, tools]
|
||||
|
||||
Keep the full description to 150-250 words. Do not speculate beyond what is visible or stated. Write as continuous prose followed by the ENTITIES line."""
|
||||
|
||||
vision_response = anthropic_client.messages.create(
|
||||
model="claude-sonnet-4-6",
|
||||
max_tokens=800,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": image_content_type,
|
||||
"data": image_b64,
|
||||
}
|
||||
},
|
||||
{"type": "text", "text": vision_prompt}
|
||||
]
|
||||
}]
|
||||
)
|
||||
description = vision_response.content[0].text.strip()
|
||||
|
||||
# Build rich Graphiti-ready episode markdown
|
||||
capture_type = "image+voice" if voice_annotation else "image"
|
||||
modality = "visual+audio" if voice_annotation else "visual"
|
||||
media_path = f"Journal/Media/{month_dir}/{img_filename}"
|
||||
|
||||
content_md = f"""# Capture — Image — {timestamp}
|
||||
|
||||
**type:** {capture_type}
|
||||
**modality:** {modality}
|
||||
**status:** unprocessed
|
||||
**media:** {media_path}
|
||||
{f"**project:** {project}" if project else ""}
|
||||
|
||||
---
|
||||
|
||||
**Visual description:**
|
||||
{description}
|
||||
|
||||
**Voice annotation:**
|
||||
{voice_annotation if voice_annotation else "none recorded"}
|
||||
|
||||
---
|
||||
"""
|
||||
# Save description to Journal/Captures/ via WebDAV
|
||||
captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
|
||||
req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)
|
||||
cap_filename = f"{timestamp}-image.md"
|
||||
cap_url = f"{captures_dir}/{cap_filename}"
|
||||
req_lib.put(cap_url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
|
||||
|
||||
return JSONResponse({
|
||||
"ok": True,
|
||||
"filename": cap_filename,
|
||||
"media": media_path,
|
||||
"has_voice": voice_annotation is not None,
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
if tmp_audio_path and os.path.exists(tmp_audio_path):
|
||||
os.unlink(tmp_audio_path)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
# ── Voice only ───────────────────────────────────────────────────────────
|
||||
elif audio is not None:
|
||||
if not whisper_model:
|
||||
raise HTTPException(status_code=503, detail="Whisper not available")
|
||||
tmp_path = None
|
||||
|
||||
if not transcript:
|
||||
return JSONResponse({"ok": False, "error": "No speech detected"})
|
||||
|
||||
# Save to Nextcloud Journal/Captures/ via WebDAV
|
||||
import requests as req_lib
|
||||
from datetime import datetime
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
|
||||
filename = f"{timestamp}-voice.md"
|
||||
content_md = f"# Capture — {timestamp}\n\n**type:** voice\n**modality:** audio\n**status:** unprocessed\n\n---\n\n{transcript}\n"
|
||||
|
||||
nextcloud_url = os.getenv("NEXTCLOUD_URL", "")
|
||||
nextcloud_user = os.getenv("NEXTCLOUD_USER", "aaron")
|
||||
nextcloud_password = os.getenv("NEXTCLOUD_PASSWORD", "")
|
||||
captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
|
||||
auth = (nextcloud_user, nextcloud_password)
|
||||
|
||||
req_lib.request("MKCOL", captures_dir, auth=auth, timeout=10)
|
||||
url = f"{captures_dir}/{filename}"
|
||||
response = req_lib.put(url, data=content_md.encode("utf-8"), auth=auth, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
return JSONResponse({"ok": True, "filename": filename, "transcript": transcript})
|
||||
|
||||
except Exception as e:
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
try:
|
||||
suffix = ".webm"
|
||||
if audio.content_type and "mp4" in audio.content_type:
|
||||
suffix = ".mp4"
|
||||
elif audio.content_type and "ogg" in audio.content_type:
|
||||
suffix = ".ogg"
|
||||
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
||||
content_bytes = await audio.read()
|
||||
tmp.write(content_bytes)
|
||||
tmp_path = tmp.name
|
||||
segments, _ = whisper_model.transcribe(
|
||||
tmp_path, language="en", vad_filter=True, initial_prompt=WHISPER_PROMPT
|
||||
)
|
||||
transcript = " ".join(s.text.strip() for s in segments).strip()
|
||||
os.unlink(tmp_path)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
tmp_path = None
|
||||
if not transcript:
|
||||
return JSONResponse({"ok": False, "error": "No speech detected"})
|
||||
filename = f"{timestamp}-voice.md"
|
||||
content_md = f"# Capture — {timestamp}\n\n**type:** voice\n**modality:** audio\n**status:** unprocessed\n\n---\n\n{transcript}\n"
|
||||
captures_dir = f"{nextcloud_url}/remote.php/dav/files/{nextcloud_user}/Journal/Captures"
|
||||
req_lib.request("MKCOL", captures_dir, auth=nc_auth, timeout=10)
|
||||
url = f"{captures_dir}/{filename}"
|
||||
req_lib.put(url, data=content_md.encode("utf-8"), auth=nc_auth, timeout=30)
|
||||
return JSONResponse({"ok": True, "filename": filename, "transcript": transcript})
|
||||
except Exception as e:
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
os.unlink(tmp_path)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
else:
|
||||
raise HTTPException(status_code=400, detail="No audio or image provided")
|
||||
|
||||
@app.get("/api/captures")
|
||||
async def list_captures():
|
||||
@@ -795,14 +914,12 @@ async def clear_all_conversations(auth: str = Depends(require_auth)):
|
||||
scheduler = BackgroundScheduler()
|
||||
|
||||
def run_dream_job():
|
||||
"""Runs nightly dreamer — reuses loaded embedder, no subprocess overhead."""
|
||||
"""Runs nightly dreamer — full interdependent pipeline, no mode flag."""
|
||||
try:
|
||||
import subprocess
|
||||
settings = load_settings()
|
||||
mode = settings.get("dream_mode", "nrem")
|
||||
dream_script = str(Path.home() / "aaronai" / "scripts" / "dream.py")
|
||||
result = subprocess.run(
|
||||
[PYTHON, dream_script, "--mode", mode],
|
||||
[PYTHON, dream_script],
|
||||
cwd=str(Path.home() / "aaronai"),
|
||||
capture_output=True, text=True, timeout=600
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user