Initial project import

This commit is contained in:
Melbar
2026-05-02 09:07:41 +02:00
commit 8e1bcf142f
38 changed files with 7928 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
# src.llm package — Thematic segmentation / dramaturgy (NO vision matching)
+202
View File
@@ -0,0 +1,202 @@
"""
src/llm/dramaturg.py — LLM-based thematic beat classification (OpenRouter)
Responsibility:
- Receive a list of TrailerBeat objects (with dialogue lines attached)
- Send a single structured prompt to the LLM
- Parse the JSON response to assign BeatType to each beat
IMPORTANT: This module does ZERO visual analysis.
It classifies narrative dramaturgy from dialogue text only.
Visual matching is handled exclusively by the CV engine.
"""
from __future__ import annotations
import json
import logging
from dataclasses import replace
from typing import Sequence
from src.core.config import AppConfig
from src.core.models import BeatType, TrailerBeat
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Prompt builder
# ---------------------------------------------------------------------------
_SYSTEM_PROMPT = """You are a film trailer editor and narrative analyst.
Your task is to classify each beat of a trailer into one of these dramatic roles:
HOOK - Opening attention grabber (first impression, shocking image, logo)
SETUP - World/character introduction
CONFLICT - Inciting incident, rising tension, threat revealed
CLIMAX - Peak action/emotion, highest stakes
RESOLUTION - Cool-down, tagline, final title card
You will receive a JSON array of beats with their index and dialogue text.
Respond ONLY with a valid JSON array, one object per beat, with keys:
"beat_id" (int) and "beat_type" (one of the strings above).
Do NOT include any explanation or markdown fences."""
_USER_TEMPLATE = """Classify the following {n} trailer beats:
{beats_json}"""
def _build_beats_payload(beats: Sequence[TrailerBeat]) -> str:
payload = []
for b in beats:
dialogue_text = " / ".join(line.text for line in b.dialogue) or "(no dialogue)"
payload.append({
"beat_id": b.beat_id,
"duration": round(b.duration_s, 2),
"dialogue": dialogue_text,
})
return json.dumps(payload, ensure_ascii=False, indent=2)
# ---------------------------------------------------------------------------
# OpenRouter / OpenAI-compatible HTTP client
# ---------------------------------------------------------------------------
def _call_llm(prompt_user: str, cfg: AppConfig) -> str:
"""
Send a chat completion request to the configured LLM provider.
Supports: openrouter, openai, ollama (all use the OpenAI-compatible API).
Returns:
The raw text content of the first assistant message.
Raises:
RuntimeError: On HTTP errors or missing API key.
"""
import urllib.request
import urllib.error
llm = cfg.llm
if llm.provider in ("openrouter", "openai") and not llm.api_key:
raise RuntimeError(
f"LLM provider is '{llm.provider}' but no API key found. "
"Set OPENROUTER_API_KEY (or OPENAI_API_KEY) in your .env file."
)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {llm.api_key}",
}
if llm.provider == "openrouter":
headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
headers["X-Title"] = "AI Trailer Generator v2"
body = json.dumps({
"model": llm.model,
"messages": [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": prompt_user},
],
"temperature": llm.temperature,
"max_tokens": llm.max_tokens,
}).encode("utf-8")
url = f"{llm.base_url.rstrip('/')}/chat/completions"
req = urllib.request.Request(url, data=body, headers=headers, method="POST")
try:
with urllib.request.urlopen(req, timeout=llm.timeout_seconds) as resp:
data = json.loads(resp.read().decode("utf-8"))
return data["choices"][0]["message"]["content"]
except urllib.error.HTTPError as exc:
body_text = exc.read().decode(errors="replace")
raise RuntimeError(
f"LLM HTTP {exc.code} from {url}:\n{body_text}"
) from exc
# ---------------------------------------------------------------------------
# Response parser
# ---------------------------------------------------------------------------
_BEAT_TYPE_MAP: dict[str, BeatType] = {bt.name: bt for bt in BeatType}
def _parse_response(raw: str, beats: Sequence[TrailerBeat]) -> dict[int, BeatType]:
"""
Parse the LLM JSON array response into a beat_id → BeatType mapping.
Falls back to BeatType.UNKNOWN for any beat that cannot be parsed.
"""
# Strip accidental markdown fences
clean = raw.strip()
if clean.startswith("```"):
clean = "\n".join(clean.split("\n")[1:])
if clean.endswith("```"):
clean = clean[: clean.rfind("```")]
clean = clean.strip()
result: dict[int, BeatType] = {b.beat_id: BeatType.UNKNOWN for b in beats}
try:
parsed = json.loads(clean)
if not isinstance(parsed, list):
raise ValueError("Expected JSON array at top level.")
for item in parsed:
bid = int(item["beat_id"])
name = str(item.get("beat_type", "UNKNOWN")).upper()
result[bid] = _BEAT_TYPE_MAP.get(name, BeatType.UNKNOWN)
except (json.JSONDecodeError, KeyError, ValueError) as exc:
logger.warning("LLM response parse error (%s) — all beats → UNKNOWN.\nRaw: %s", exc, raw[:300])
return result
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def classify_beats(
beats: Sequence[TrailerBeat],
cfg: AppConfig,
) -> list[TrailerBeat]:
"""
Use the LLM to assign a BeatType to each TrailerBeat.
Args:
beats: TrailerBeat list (dialogue should be populated for best results).
cfg: Application configuration (llm section + api key).
Returns:
New list of TrailerBeat objects with beat_type set.
On LLM error, all beats keep BeatType.UNKNOWN (no exception raised).
"""
if not beats:
return list(beats)
logger.info(
"Classifying %d beats via %s / %s",
len(beats), cfg.llm.provider, cfg.llm.model,
)
payload = _build_beats_payload(beats)
prompt = _USER_TEMPLATE.format(n=len(beats), beats_json=payload)
try:
raw_response = _call_llm(prompt, cfg)
except Exception as exc:
logger.error("LLM classification failed: %s — keeping BeatType.UNKNOWN.", exc)
return list(beats)
type_map = _parse_response(raw_response, beats)
enriched = [replace(b, beat_type=type_map.get(b.beat_id, BeatType.UNKNOWN)) for b in beats]
classified = sum(1 for b in enriched if b.beat_type != BeatType.UNKNOWN)
logger.info("Beat classification done: %d / %d classified.", classified, len(beats))
return enriched
+316
View File
@@ -0,0 +1,316 @@
"""
Cached vision descriptions for ambiguous trailer/source matching.
This module is deliberately conservative: it never writes a final match and it
does not replace CV. It describes a small number of 3-frame beat/scene samples,
caches those descriptions, and returns extra source in-point seeds for the CV
scanner to verify.
"""
from __future__ import annotations
import base64
import json
import logging
import re
import urllib.error
import urllib.request
from dataclasses import asdict
from pathlib import Path
from typing import Sequence
import cv2
from src.core.config import AppConfig
from src.core.models import Scene, TrailerBeat
logger = logging.getLogger(__name__)
_CACHE_VERSION = 1
_STOPWORDS = {
"the", "and", "with", "from", "that", "this", "there", "their", "into",
"scene", "frame", "image", "shot", "video", "visible", "looks", "appears",
"eine", "einer", "einem", "einen", "und", "oder", "mit", "der", "die", "das",
}
_SYSTEM_PROMPT = """You describe film shots for automatic matching.
Return only compact JSON with these keys:
subject, setting, composition, action_phase, distinctive_objects, lighting_color, negatives.
Focus on stable visual facts and spatial layout. Ignore timecode overlays, subtitles, logos, compression, aspect ratio, and color grading differences."""
def _cache_path(cfg: AppConfig) -> Path:
return cfg.paths.cache_dir / "vision_descriptions.json"
def _load_cache(cfg: AppConfig) -> dict:
path = _cache_path(cfg)
if not path.exists():
return {"version": _CACHE_VERSION, "items": {}}
try:
data = json.loads(path.read_text(encoding="utf-8"))
except json.JSONDecodeError:
logger.warning("Vision cache is unreadable; rebuilding: %s", path)
return {"version": _CACHE_VERSION, "items": {}}
if data.get("version") != _CACHE_VERSION or not isinstance(data.get("items"), dict):
return {"version": _CACHE_VERSION, "items": {}}
return data
def _save_cache(cfg: AppConfig, cache: dict) -> None:
path = _cache_path(cfg)
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(cache, indent=2, ensure_ascii=False), encoding="utf-8")
def _sample_times(start_s: float, end_s: float) -> list[float]:
duration_s = max(0.04, end_s - start_s)
return [
start_s + min(duration_s * 0.12, max(0.0, duration_s - 0.04)),
start_s + duration_s * 0.50,
start_s + max(0.0, duration_s - min(duration_s * 0.12, 0.20)),
]
def _frame_data_url(video_path: Path, t_s: float) -> str | None:
cap = cv2.VideoCapture(str(video_path))
try:
if not cap.isOpened():
return None
cap.set(cv2.CAP_PROP_POS_MSEC, max(0.0, t_s) * 1000.0)
ok, frame = cap.read()
if not ok or frame is None:
return None
h, w = frame.shape[:2]
if w > 640:
frame = cv2.resize(frame, (640, int(h * (640 / w))), interpolation=cv2.INTER_AREA)
ok, encoded = cv2.imencode(".jpg", frame, [int(cv2.IMWRITE_JPEG_QUALITY), 72])
if not ok:
return None
payload = base64.b64encode(encoded.tobytes()).decode("ascii")
return f"data:image/jpeg;base64,{payload}"
finally:
cap.release()
def _call_vision_model(label: str, image_urls: list[str], cfg: AppConfig) -> str:
vision = cfg.vision
if vision.provider in ("openai", "openrouter") and not vision.api_key:
raise RuntimeError(
"Vision is enabled but no API key is available. Set VISION_API_KEY, "
"OPENROUTER_API_KEY, OPENAI_API_KEY, or LLM_API_KEY."
)
content: list[dict] = [{
"type": "text",
"text": (
f"Describe this 3-frame sample for matching. Label: {label}. "
"The frames are start, middle, and end of the same beat/scene."
),
}]
content.extend({
"type": "image_url",
"image_url": {"url": url, "detail": "low"},
} for url in image_urls)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {vision.api_key}",
}
if vision.provider == "openrouter":
headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
headers["X-Title"] = "AI Trailer Generator v2"
body = json.dumps({
"model": vision.model,
"messages": [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": content},
],
"temperature": vision.temperature,
"max_tokens": vision.max_tokens,
}).encode("utf-8")
url = f"{vision.base_url.rstrip('/')}/chat/completions"
req = urllib.request.Request(url, data=body, headers=headers, method="POST")
try:
with urllib.request.urlopen(req, timeout=vision.timeout_seconds) as resp:
data = json.loads(resp.read().decode("utf-8"))
return str(data["choices"][0]["message"]["content"]).strip()
except urllib.error.HTTPError as exc:
body_text = exc.read().decode(errors="replace")
raise RuntimeError(f"Vision HTTP {exc.code} from {url}:\n{body_text}") from exc
def _description_key(kind: str, item_id: int, start_s: float, end_s: float, cfg: AppConfig) -> str:
path = cfg.paths.reference_trailer if kind == "beat" else cfg.paths.source_movie
try:
stamp = int(path.stat().st_mtime)
except OSError:
stamp = 0
return (
f"{kind}:{item_id}:"
f"{start_s:.3f}:{end_s:.3f}:"
f"{cfg.vision.provider}:{cfg.vision.model}:{stamp}"
)
def _describe_sample(
*,
kind: str,
item_id: int,
label: str,
video_path: Path,
start_s: float,
end_s: float,
cfg: AppConfig,
cache: dict,
budget: list[int],
) -> str | None:
key = _description_key(kind, item_id, start_s, end_s, cfg)
cached = cache["items"].get(key)
if cached:
return str(cached.get("description", ""))
if budget[0] <= 0:
return None
image_urls = [
url for url in (_frame_data_url(video_path, t) for t in _sample_times(start_s, end_s))
if url is not None
]
if len(image_urls) < 2:
return None
description = _call_vision_model(label, image_urls, cfg)
cache["items"][key] = {
"kind": kind,
"item_id": item_id,
"start_s": start_s,
"end_s": end_s,
"label": label,
"description": description,
}
budget[0] -= 1
return description
def _terms(text: str) -> set[str]:
words = re.findall(r"[a-zA-Z][a-zA-Z0-9_'-]{2,}", text.lower())
return {w for w in words if w not in _STOPWORDS}
def _text_similarity(a: str, b: str) -> float:
ta = _terms(a)
tb = _terms(b)
if not ta or not tb:
return 0.0
overlap = len(ta & tb)
return float(overlap / max(8, min(len(ta), len(tb))))
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
if max_points <= 1 or scene.duration_s <= 0:
return [scene.start_s]
usable_end = max(scene.start_s, scene.end_s - 0.2)
if usable_end <= scene.start_s:
return [scene.start_s]
step = (usable_end - scene.start_s) / max(1, max_points - 1)
return [scene.start_s + step * idx for idx in range(max_points)]
def build_vision_seed_in_points(
beats: Sequence[TrailerBeat],
scenes: Sequence[Scene],
cfg: AppConfig,
) -> dict[int, list[tuple[float, float]]]:
"""
Return extra in-point seeds from cached vision descriptions.
The function is intentionally small-budget: for each beat it describes the
beat once and only a few top scene-level candidates. Existing descriptions
are read from cache and cost nothing.
"""
if not cfg.vision.enabled:
return {}
if not beats or not scenes:
return {}
from src.cv.vibe_check import run_vibe_check
cache = _load_cache(cfg)
budget = [cfg.vision.max_new_descriptions_per_run]
scenes_by_id = {scene.scene_id: scene for scene in scenes}
seeds: dict[int, list[tuple[float, float]]] = {}
for beat in beats:
beat_desc = _describe_sample(
kind="beat",
item_id=beat.beat_id,
label=f"trailer beat {beat.beat_id}",
video_path=beat.trailer_path,
start_s=beat.start_s,
end_s=beat.end_s,
cfg=cfg,
cache=cache,
budget=budget,
)
if not beat_desc:
continue
hits = run_vibe_check(
beat,
scenes,
top_k=cfg.vision.scene_candidate_top_k,
hist_method=cfg.cv.vibe_check.hist_compare_method,
phash_max_distance=64,
)
ranked: list[tuple[float, Scene]] = []
for hit in hits:
scene = scenes_by_id.get(hit.scene_id)
if scene is None:
continue
scene_desc = _describe_sample(
kind="scene",
item_id=scene.scene_id,
label=f"source scene {scene.scene_id}",
video_path=scene.source_path,
start_s=scene.start_s,
end_s=scene.end_s,
cfg=cfg,
cache=cache,
budget=budget,
)
if not scene_desc:
continue
score = _text_similarity(beat_desc, scene_desc)
if score >= cfg.vision.similarity_threshold:
ranked.append((score, scene))
ranked.sort(key=lambda item: item[0], reverse=True)
points: list[tuple[float, float]] = []
for score, scene in ranked[:cfg.vision.max_seed_scenes]:
logger.info(
"Beat %d: vision seed scene=%d score=%.3f",
beat.beat_id,
scene.scene_id,
score,
)
weighted_score = max(
cfg.cv.deep_scan.coarse_candidate_threshold,
min(0.98, cfg.vision.seed_score * (0.75 + min(1.0, score) * 0.25)),
)
points.extend(
(point, weighted_score)
for point in _scene_seed_points(scene, cfg.vision.seed_points_per_scene)
)
if points:
merged: dict[float, float] = {}
for point, weighted_score in points:
key = round(max(0.0, point), 3)
merged[key] = max(weighted_score, merged.get(key, 0.0))
seeds[beat.beat_id] = sorted((point, score) for point, score in merged.items())
_save_cache(cfg, cache)
return seeds