Initial project import
This commit is contained in:
@@ -0,0 +1 @@
|
||||
# src package
|
||||
@@ -0,0 +1 @@
|
||||
# src.audio package — Whisper / dialogue analysis
|
||||
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
src/audio/transcriber.py — Whisper transcription via faster-whisper
|
||||
|
||||
Responsibility:
|
||||
- Transcribe audio from a video file into a list of DialogueLine objects
|
||||
- Optionally restrict to a time window [start_s, end_s] (for single beats)
|
||||
- All model config (model name, device, compute_type) comes from AppConfig
|
||||
|
||||
The LLM is NOT used here. This is pure audio-to-text.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Sequence
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import DialogueLine
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Audio extraction helper (video → wav via ffmpeg)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _extract_audio_segment(
    video_path: Path,
    start_s: float | None,
    end_s: float | None,
    out_wav: Path,
) -> None:
    """
    Cut a mono, 16 kHz WAV out of *video_path* by invoking the ffmpeg binary.

    The time window is expressed with flags emitted ahead of ``-i``:
    ``-ss`` for the start, then either ``-t`` (a duration, when a start is
    also given) or ``-to`` (an absolute position, when only an end is given).

    Args:
        video_path: Video file to read.
        start_s:    Window start in seconds; None means no ``-ss`` flag.
        end_s:      Window end in seconds; None means no length limit.
        out_wav:    File the WAV is written to (overwritten via ``-y``).

    Raises:
        RuntimeError: When ffmpeg returns a non-zero exit status; the message
            carries the exit code and the decoded stderr output.
    """
    import subprocess

    # Collect the optional seek / length flags first.
    window: list[str] = []
    if start_s is not None:
        window.extend(["-ss", str(start_s)])
        if end_s is not None:
            # Start and end known: pass the clip length as a duration.
            window.extend(["-t", str(end_s - start_s)])
    elif end_s is not None:
        # Only an end is known: stop at that position.
        window.extend(["-to", str(end_s)])

    argv = [
        "ffmpeg", "-y", "-loglevel", "error",
        *window,
        "-i", str(video_path),
        "-vn",            # drop the video stream
        "-ac", "1",       # single audio channel
        "-ar", "16000",   # 16 kHz sample rate
        "-f", "wav",
        str(out_wav),
    ]

    proc = subprocess.run(argv, capture_output=True)
    if proc.returncode == 0:
        return
    raise RuntimeError(
        f"ffmpeg failed (code {proc.returncode}):\n"
        f"{proc.stderr.decode(errors='replace')}"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core transcription
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def transcribe_video(
    video_path: Path,
    cfg: AppConfig,
    start_s: float | None = None,
    end_s: float | None = None,
    time_offset_s: float = 0.0,
) -> list[DialogueLine]:
    """
    Transcribe dialogue from *video_path* using faster-whisper.

    The requested window is first extracted to a temporary WAV via
    ``_extract_audio_segment``; a ``WhisperModel`` built from ``cfg.whisper``
    then transcribes that file with ``beam_size=5``.

    Args:
        video_path:    Path to source or trailer video.
        cfg:           Application configuration; only ``cfg.whisper``
                       (model, device, compute_type, language) is read.
        start_s:       Clip start in video-file seconds (None = beginning).
        end_s:         Clip end in video-file seconds (None = end of file).
        time_offset_s: Added to every transcript timestamp so that beat-level
                       transcripts align with absolute movie time.

    Returns:
        One DialogueLine per segment, in the order the model produced them,
        with whitespace stripped from the text.

    Raises:
        ImportError:  If the faster-whisper package cannot be imported
                      (chained to the original import failure).
        RuntimeError: Propagated from ``_extract_audio_segment`` when ffmpeg
                      fails.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as exc:
        # Keep the underlying import failure attached for easier diagnosis.
        raise ImportError(
            "faster-whisper not installed. Run: pip install faster-whisper"
        ) from exc

    w = cfg.whisper

    logger.info(
        "Transcribing %s [%.1f–%s] with %s on %s …",
        video_path.name,
        start_s or 0.0,
        # Compare against None so an explicit end of 0.0 is still reported.
        f"{end_s:.1f}s" if end_s is not None else "end",
        w.model,
        w.device,
    )

    with tempfile.TemporaryDirectory() as tmp:
        wav = Path(tmp) / "audio.wav"
        _extract_audio_segment(video_path, start_s, end_s, wav)

        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
        segments, _ = model.transcribe(
            str(wav),
            # An empty language string means "let the model decide".
            language=w.language or None,
            beam_size=5,
        )

        # faster-whisper hands back a lazy generator, so drain it here while
        # the temporary directory (and the WAV inside it) still exists.
        lines: list[DialogueLine] = [
            DialogueLine(
                start_s=seg.start + time_offset_s,
                end_s=seg.end + time_offset_s,
                text=seg.text.strip(),
            )
            for seg in segments
        ]

    logger.info("Transcription done: %d segments.", len(lines))
    return lines
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Convenience: transcribe a whole file and return grouped by scene
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def transcribe_full_movie(
    cfg: AppConfig,
) -> list[DialogueLine]:
    """
    Run ``transcribe_video`` over the configured source movie, start to end.

    The file is taken from ``cfg.paths.source_movie``; no time window and no
    timestamp offset are applied. The module header describes the result as
    input for enriching Scenes via a dialogue_callback passed to
    build_scene_index().
    """
    movie_path = cfg.paths.source_movie
    return transcribe_video(movie_path, cfg)
|
||||
|
||||
|
||||
def assign_dialogue_to_scenes(
    all_dialogue: Sequence[DialogueLine],
    scenes: list["src.core.models.Scene"],  # type: ignore[name-defined]
) -> list["src.core.models.Scene"]:  # type: ignore[name-defined]
    """
    Attach transcript lines to the scenes they fall into.

    A line belongs to a scene when the midpoint of the line,
    ``(start_s + end_s) / 2``, lies in the half-open interval
    ``[scene.start_s, scene.end_s)``. Lines whose midpoint lies in no scene
    are not attached anywhere.

    Args:
        all_dialogue: Flat transcript of the whole movie.
        scenes:       Scenes to enrich; the input objects are not modified.

    Returns:
        A new list, parallel to *scenes*, of copies whose ``dialogue`` field
        holds the matching lines as a tuple (in transcript order).
    """
    from dataclasses import replace
    from src.core.models import Scene

    enriched: list[Scene] = []
    total_assigned = 0

    for scene in scenes:
        picked = []
        for entry in all_dialogue:
            centre = (entry.start_s + entry.end_s) / 2.0
            if centre >= scene.start_s and centre < scene.end_s:
                picked.append(entry)

        updated = replace(scene, dialogue=tuple(picked))
        enriched.append(updated)
        total_assigned += len(updated.dialogue)

    logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched))
    return enriched
|
||||
@@ -0,0 +1 @@
|
||||
# src.core package
|
||||
@@ -0,0 +1,387 @@
|
||||
"""
|
||||
src/core/config.py — Configuration loader for AI Trailer Generator v2
|
||||
|
||||
Loads config.toml and exposes typed, nested dataclasses.
|
||||
All CV thresholds, paths, and model settings are sourced exclusively here.
|
||||
API keys are NEVER stored in config.toml; they are loaded from .env.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import tomllib
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv as _load_dotenv
|
||||
_HAS_DOTENV = True
|
||||
except ImportError: # dotenv optional — falls back to existing env vars
|
||||
_HAS_DOTENV = False
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Leaf sections
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass(frozen=True)
class PathsConfig:
    """
    Immutable view of the ``[paths]`` table of config.toml.

    ``load_config`` turns relative entries into absolute ones based on the
    directory that holds the config file; absolute entries are kept as-is.
    """

    # Input videos
    source_movie: Path
    reference_trailer: Path

    # Directories
    output_dir: Path
    cache_dir: Path
    proxy_dir: Path
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VideoConfig:
    """
    Immutable view of the ``[video]`` table of config.toml.

    All three keys are mandatory; ``load_config`` coerces them with
    ``float()`` / ``int()``.
    """

    extract_fps: float   # frame extraction rate
    proxy_width: int     # proxy frame width
    proxy_height: int    # proxy frame height
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VibeCheckConfig:
    """
    Immutable view of the ``[cv.vibe_check]`` table of config.toml.

    Every key is mandatory; ``load_config`` reads each one by direct
    indexing and coerces it to the annotated type.
    """

    top_k_candidates: int

    # Histogram comparison settings
    hist_compare_method: int
    hist_bins_hue: int
    hist_bins_saturation: int

    # Perceptual-hash setting
    phash_max_distance: int

    # Crop fractions
    crop_top_fraction: float
    crop_bottom_fraction: float
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DeepScanConfig:
    """
    Immutable view of the ``[cv.deep_scan]`` table of config.toml.

    Only ``coarse_step_seconds``, ``match_threshold``, ``match_method`` and
    ``refine_step_seconds`` must be present in the TOML; ``load_config``
    supplies a fallback for every other field. Field order is part of the
    constructor signature and must not be rearranged.
    """

    coarse_step_seconds: float
    match_threshold: float
    provisional_match_threshold: float
    coarse_candidate_threshold: float   # falls back to match_threshold

    # Score weights
    sequence_score_weight: float
    span_score_weight: float
    coarse_score_weight: float
    duration_score_weight: float
    duration_tie_break_score_delta: float
    min_duration_coverage: float

    # Seeding
    continuity_seed_offsets_s: tuple[float, ...]
    scene_seed_top_k: int
    scene_seed_points_per_scene: int
    content_rerank_candidate_count: int
    skip_coarse_scan_with_weighted_seeds: bool

    # Refinement
    max_refine_candidates: int
    match_method: int
    refine_window_seconds: float
    refine_step_seconds: float

    # Content alignment / validation
    content_align_window_seconds: float
    content_align_sample_step_s: float
    content_validation_weight: float
    provisional_content_threshold: float

    start_tie_break_score_delta: float
    start_preroll_frames: int
    sequence_candidate_count: int
    sequence_min_distance_s: float
    span_sample_step_s: float
    trim_tail_frames: int
    scene_boundary_epsilon_s: float

    # "Scoreable" frame thresholds
    scoreable_luma_mean_min: float
    scoreable_luma_p90_min: float
    scoreable_contrast_min: float
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CVConfig:
    """Groups the two ``[cv]`` sub-tables: vibe check and deep scan."""

    vibe_check: VibeCheckConfig   # from [cv.vibe_check]
    deep_scan: DeepScanConfig     # from [cv.deep_scan]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SceneDetectionConfig:
    """
    Immutable view of the ``[scene_detection]`` table of config.toml.

    Both keys are mandatory and are coerced with ``float()`` by
    ``load_config``.
    """

    content_threshold: float
    min_scene_duration_s: float   # seconds
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class WhisperConfig:
    """
    Immutable view of the ``[whisper]`` table of config.toml.

    ``load_config`` copies the four values verbatim; the ``Literal``
    annotations are not enforced at runtime.
    """

    model: str       # model name handed to WhisperModel
    language: str    # an empty string is passed on as None by the transcriber
    device: Literal["cuda", "cpu"]
    compute_type: Literal["float16", "int8", "float32"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class LLMConfig:
    """
    Immutable view of the ``[llm]`` table of config.toml plus the API key.

    The key is never read from the TOML file: ``load_config`` fills
    ``api_key`` from environment variables (optionally loaded from .env).
    """

    provider: Literal["ollama", "openai", "openrouter"]
    base_url: str
    model: str
    timeout_seconds: int
    temperature: float
    max_tokens: int

    # Secret taken from the environment; must never be committed.
    api_key: str = ""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VisionConfig:
    """
    Immutable view of the optional ``[vision]`` table of config.toml.

    The whole table may be absent: ``load_config`` has a fallback for every
    field (``base_url``, ``model`` and ``timeout_seconds`` fall back to the
    LLM settings). ``api_key`` comes from environment variables only.
    Field order is part of the constructor signature.
    """

    enabled: bool

    # Endpoint / model
    provider: Literal["openai", "openrouter"]
    base_url: str
    model: str
    timeout_seconds: int
    temperature: float
    max_tokens: int

    # Candidate selection and seeding
    scene_candidate_top_k: int
    max_new_descriptions_per_run: int
    max_seed_scenes: int
    seed_points_per_scene: int
    seed_score: float
    max_refine_candidates: int

    # Local scan
    local_scan_step_s: float
    local_scan_max_points_per_scene: int
    local_scan_top_candidates: int
    local_scan_tie_break_score_delta: float

    # Multi-shot handling
    multi_shot_cut_corr_threshold: float
    multi_shot_boundary_tolerance_s: float

    fullscan_fallback: bool
    content_threshold: float
    similarity_threshold: float

    # Secret taken from the environment; must never be committed.
    api_key: str = ""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExportConfig:
    """
    Immutable view of the ``[export]`` table of config.toml.

    All keys are mandatory. ``output_format`` is copied verbatim by
    ``load_config``; the ``Literal`` annotation is not enforced at runtime.
    """

    fcpxml_version: str
    edl_frame_rate: float
    output_format: Literal["fcpxml", "edl", "both"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Root config — single object passed through the entire application
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass(frozen=True)
class AppConfig:
    """
    Root of the configuration tree returned by ``load_config``.

    The first three fields come from the ``[project]`` table (its ``name``,
    ``version`` and ``log_level`` keys); each remaining field wraps one
    section of config.toml.
    """

    # [project]
    project_name: str
    version: str
    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"]

    # One typed section per TOML table
    paths: PathsConfig
    video: VideoConfig
    cv: CVConfig
    scene_detection: SceneDetectionConfig
    whisper: WhisperConfig
    llm: LLMConfig
    vision: VisionConfig
    export: ExportConfig
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Loader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_DEFAULT_CONFIG_PATH = Path(__file__).parents[2] / "config.toml"
|
||||
_DEFAULT_ENV_PATH = Path(__file__).parents[2] / ".env"
|
||||
|
||||
|
||||
def load_config(
    config_path: Path = _DEFAULT_CONFIG_PATH,
    env_path: Path = _DEFAULT_ENV_PATH,
) -> AppConfig:
    """
    Read config.toml (plus environment) and build the frozen AppConfig tree.

    Secrets are taken from environment variables only. When python-dotenv is
    importable, *env_path* is loaded first without overriding variables that
    are already set.

    Args:
        config_path: TOML file to parse.
                     Defaults to <project_root>/config.toml.
        env_path:    .env file handed to python-dotenv.
                     Defaults to <project_root>/.env.

    Returns:
        A fully populated AppConfig. Relative entries of ``[paths]`` are
        resolved against the directory containing *config_path*.

    Raises:
        FileNotFoundError: If *config_path* does not exist.
        KeyError: If a mandatory table or key is missing.
        ValueError / TypeError: If a value cannot be converted by the
            ``int()`` / ``float()`` coercions applied below.
    """
    # .env goes first so the os.environ lookups further down can see it.
    if _HAS_DOTENV:
        _load_dotenv(dotenv_path=env_path, override=False)

    if not config_path.exists():
        raise FileNotFoundError(
            f"Config file not found: {config_path}\n"
            "Copy config.toml.example to config.toml and adjust your paths."
        )

    with config_path.open("rb") as fh:
        raw: dict = tomllib.load(fh)

    # Every table is mandatory except [vision].
    project = raw["project"]
    paths_raw = raw["paths"]
    video_raw = raw["video"]
    cv_raw = raw["cv"]
    sd_raw = raw["scene_detection"]
    whisper_raw = raw["whisper"]
    llm_raw = raw["llm"]
    vision_raw = raw.get("vision", {})
    export_raw = raw["export"]

    # ---- [paths] ----------------------------------------------------------
    def _resolve(p: str) -> Path:
        # Absolute paths pass through; relative ones are anchored at the
        # directory of the config file so the project stays relocatable.
        candidate = Path(p)
        if candidate.is_absolute():
            return candidate
        return (config_path.parent / candidate).resolve()

    paths = PathsConfig(
        **{
            key: _resolve(paths_raw[key])
            for key in (
                "source_movie",
                "reference_trailer",
                "output_dir",
                "cache_dir",
                "proxy_dir",
            )
        }
    )

    # ---- [video] ----------------------------------------------------------
    video = VideoConfig(
        extract_fps=float(video_raw["extract_fps"]),
        proxy_width=int(video_raw["proxy_width"]),
        proxy_height=int(video_raw["proxy_height"]),
    )

    # ---- [cv.vibe_check] --------------------------------------------------
    vibe_raw = cv_raw["vibe_check"]
    vibe_check = VibeCheckConfig(
        top_k_candidates=int(vibe_raw["top_k_candidates"]),
        hist_compare_method=int(vibe_raw["hist_compare_method"]),
        hist_bins_hue=int(vibe_raw["hist_bins_hue"]),
        hist_bins_saturation=int(vibe_raw["hist_bins_saturation"]),
        phash_max_distance=int(vibe_raw["phash_max_distance"]),
        crop_top_fraction=float(vibe_raw["crop_top_fraction"]),
        crop_bottom_fraction=float(vibe_raw["crop_bottom_fraction"]),
    )

    # ---- [cv.deep_scan] ---------------------------------------------------
    deep_raw = cv_raw["deep_scan"]

    def _deep_f(key: str, fallback: float) -> float:
        # Optional float key of [cv.deep_scan].
        return float(deep_raw.get(key, fallback))

    def _deep_i(key: str, fallback: int) -> int:
        # Optional int key of [cv.deep_scan].
        return int(deep_raw.get(key, fallback))

    deep_scan = DeepScanConfig(
        coarse_step_seconds=float(deep_raw["coarse_step_seconds"]),
        match_threshold=float(deep_raw["match_threshold"]),
        provisional_match_threshold=_deep_f("provisional_match_threshold", 0.45),
        # Without an explicit value the coarse filter reuses match_threshold.
        coarse_candidate_threshold=_deep_f("coarse_candidate_threshold", deep_raw["match_threshold"]),
        sequence_score_weight=_deep_f("sequence_score_weight", 0.55),
        span_score_weight=_deep_f("span_score_weight", 0.15),
        coarse_score_weight=_deep_f("coarse_score_weight", 0.10),
        duration_score_weight=_deep_f("duration_score_weight", 0.20),
        duration_tie_break_score_delta=_deep_f("duration_tie_break_score_delta", 0.03),
        min_duration_coverage=_deep_f("min_duration_coverage", 0.65),
        continuity_seed_offsets_s=tuple(
            float(v)
            for v in deep_raw.get(
                "continuity_seed_offsets_s",
                [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0],
            )
        ),
        scene_seed_top_k=_deep_i("scene_seed_top_k", 30),
        scene_seed_points_per_scene=_deep_i("scene_seed_points_per_scene", 6),
        content_rerank_candidate_count=_deep_i("content_rerank_candidate_count", 100),
        skip_coarse_scan_with_weighted_seeds=bool(
            deep_raw.get("skip_coarse_scan_with_weighted_seeds", False)
        ),
        max_refine_candidates=_deep_i("max_refine_candidates", 6),
        match_method=int(deep_raw["match_method"]),
        refine_window_seconds=_deep_f("refine_window_seconds", 0.6),
        refine_step_seconds=float(deep_raw["refine_step_seconds"]),
        content_align_window_seconds=_deep_f("content_align_window_seconds", 0.48),
        content_align_sample_step_s=_deep_f("content_align_sample_step_s", 0.28),
        content_validation_weight=_deep_f("content_validation_weight", 0.35),
        provisional_content_threshold=_deep_f("provisional_content_threshold", 0.42),
        start_tie_break_score_delta=_deep_f("start_tie_break_score_delta", 0.015),
        start_preroll_frames=_deep_i("start_preroll_frames", 0),
        sequence_candidate_count=_deep_i("sequence_candidate_count", 240),
        sequence_min_distance_s=_deep_f("sequence_min_distance_s", 1.0),
        span_sample_step_s=_deep_f("span_sample_step_s", 0.08),
        trim_tail_frames=_deep_i("trim_tail_frames", 2),
        scene_boundary_epsilon_s=_deep_f("scene_boundary_epsilon_s", 0.12),
        scoreable_luma_mean_min=_deep_f("scoreable_luma_mean_min", 24.0),
        scoreable_luma_p90_min=_deep_f("scoreable_luma_p90_min", 58.0),
        scoreable_contrast_min=_deep_f("scoreable_contrast_min", 24.0),
    )

    # ---- [scene_detection] ------------------------------------------------
    scene_detection = SceneDetectionConfig(
        content_threshold=float(sd_raw["content_threshold"]),
        min_scene_duration_s=float(sd_raw["min_scene_duration_s"]),
    )

    # ---- [whisper] --------------------------------------------------------
    whisper = WhisperConfig(
        model=whisper_raw["model"],
        language=whisper_raw["language"],
        device=whisper_raw["device"],
        compute_type=whisper_raw["compute_type"],
    )

    # ---- [llm] ------------------------------------------------------------
    # Key lookup order:
    #   OPENROUTER_API_KEY when provider is "openrouter",
    #   OPENAI_API_KEY     when provider is "openai",
    #   LLM_API_KEY        as the fallback for every provider.
    _provider = llm_raw["provider"]
    if _provider == "openrouter":
        provider_key = os.environ.get("OPENROUTER_API_KEY", "")
    elif _provider == "openai":
        provider_key = os.environ.get("OPENAI_API_KEY", "")
    else:
        provider_key = ""
    _api_key = provider_key or os.environ.get("LLM_API_KEY", "")

    llm = LLMConfig(
        provider=_provider,
        base_url=llm_raw["base_url"],
        model=llm_raw["model"],
        timeout_seconds=int(llm_raw["timeout_seconds"]),
        temperature=float(llm_raw["temperature"]),
        max_tokens=int(llm_raw["max_tokens"]),
        api_key=_api_key,
    )

    # ---- [vision] (optional) ----------------------------------------------
    # The vision provider defaults to the LLM provider when that is one of
    # the two supported values, otherwise to "openrouter".
    inherited_provider = _provider if _provider in ("openai", "openrouter") else "openrouter"
    vision_provider = vision_raw.get("provider", inherited_provider)

    if vision_provider == "openrouter":
        vision_provider_key = os.environ.get("OPENROUTER_API_KEY", "")
    else:
        vision_provider_key = os.environ.get("OPENAI_API_KEY", "")
    vision_api_key = (
        vision_provider_key
        or os.environ.get("VISION_API_KEY", "")
        or os.environ.get("LLM_API_KEY", "")
    )

    def _vis_f(key: str, fallback: float) -> float:
        # Optional float key of [vision].
        return float(vision_raw.get(key, fallback))

    def _vis_i(key: str, fallback: int) -> int:
        # Optional int key of [vision].
        return int(vision_raw.get(key, fallback))

    vision = VisionConfig(
        enabled=bool(vision_raw.get("enabled", False)),
        provider=vision_provider,
        base_url=str(vision_raw.get("base_url", llm.base_url)),
        model=str(vision_raw.get("model", llm.model)),
        timeout_seconds=_vis_i("timeout_seconds", llm.timeout_seconds),
        temperature=_vis_f("temperature", 0.0),
        max_tokens=_vis_i("max_tokens", 350),
        scene_candidate_top_k=_vis_i("scene_candidate_top_k", 8),
        max_new_descriptions_per_run=_vis_i("max_new_descriptions_per_run", 12),
        max_seed_scenes=_vis_i("max_seed_scenes", 3),
        seed_points_per_scene=_vis_i("seed_points_per_scene", 12),
        seed_score=_vis_f("seed_score", 0.88),
        max_refine_candidates=_vis_i("max_refine_candidates", 6),
        local_scan_step_s=_vis_f("local_scan_step_s", 0.12),
        local_scan_max_points_per_scene=_vis_i("local_scan_max_points_per_scene", 180),
        local_scan_top_candidates=_vis_i("local_scan_top_candidates", 18),
        local_scan_tie_break_score_delta=_vis_f("local_scan_tie_break_score_delta", 0.08),
        multi_shot_cut_corr_threshold=_vis_f("multi_shot_cut_corr_threshold", 0.20),
        multi_shot_boundary_tolerance_s=_vis_f("multi_shot_boundary_tolerance_s", 0.20),
        fullscan_fallback=bool(vision_raw.get("fullscan_fallback", False)),
        content_threshold=_vis_f("content_threshold", 0.22),
        similarity_threshold=_vis_f("similarity_threshold", 0.18),
        api_key=vision_api_key,
    )

    # ---- [export] ---------------------------------------------------------
    export = ExportConfig(
        fcpxml_version=str(export_raw["fcpxml_version"]),
        edl_frame_rate=float(export_raw["edl_frame_rate"]),
        output_format=export_raw["output_format"],
    )

    cv = CVConfig(vibe_check=vibe_check, deep_scan=deep_scan)

    return AppConfig(
        project_name=project["name"],
        version=project["version"],
        log_level=project["log_level"],
        paths=paths,
        video=video,
        cv=cv,
        scene_detection=scene_detection,
        whisper=whisper,
        llm=llm,
        vision=vision,
        export=export,
    )
|
||||
@@ -0,0 +1,287 @@
|
||||
"""
|
||||
src/core/models.py — Canonical data models for AI Trailer Generator v2
|
||||
|
||||
Rules:
|
||||
- Every model is a frozen dataclass (immutable after creation).
|
||||
- All fields are strictly typed; no bare dicts or untyped lists.
|
||||
- Seconds are always float; frame numbers are always int.
|
||||
- Confidence scores live in [0.0, 1.0].
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum, auto
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Enumerations
|
||||
# ===========================================================================
|
||||
|
||||
class MatchMethod(Enum):
    """Template-matching method; values follow the cv2.TM_* numbering."""

    # Squared difference
    TM_SQDIFF = 0
    TM_SQDIFF_NORMED = 1

    # Cross-correlation
    TM_CCORR = 2
    TM_CCORR_NORMED = 3

    # Correlation coefficient
    TM_CCOEFF = 4
    TM_CCOEFF_NORMED = 5
|
||||
|
||||
|
||||
class BeatType(Enum):
    """
    Dramaturgical role of a trailer beat (used by the dramaturgy / LLM side
    only). Values are assigned by ``auto()`` in declaration order.
    """

    HOOK = auto()         # attention grabber at the opening
    SETUP = auto()        # introduces world / characters
    CONFLICT = auto()     # inciting incident, tension rising
    CLIMAX = auto()       # peak of action / emotion
    RESOLUTION = auto()   # cool-down or tagline
    UNKNOWN = auto()      # not classified
|
||||
|
||||
|
||||
class ExportFormat(Enum):
    """Output formats; the values match the ``output_format`` config strings."""

    FCPXML = "fcpxml"
    EDL = "edl"
    BOTH = "both"   # write both formats
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Phase 0 — Source-movie scene index
|
||||
# ===========================================================================
|
||||
|
||||
@dataclass(frozen=True)
class DialogueLine:
    """One line of transcript as produced by the Whisper pass."""

    start_s: float                  # onset, seconds
    end_s: float                    # offset, seconds
    text: str                       # transcript text
    speaker: Optional[str] = None   # diarisation label, when one exists

    @property
    def duration_s(self) -> float:
        """Length of the line in seconds (``end_s - start_s``)."""
        return self.end_s - self.start_s
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Scene:
    """
    A single detected scene of the source movie.

    Per the module notes, scenes come from PySceneDetect and are later
    enriched with Whisper dialogue and, optionally, visual fingerprints
    from the Vibe Check phase.
    """

    scene_id: int        # zero-based index within the source movie
    source_path: Path    # absolute path of the source video
    start_s: float       # scene start, seconds
    end_s: float         # scene end, seconds
    start_frame: int     # first frame number
    end_frame: int       # last frame number

    # Fingerprints, filled in after Vibe Check fingerprinting.
    luma_hist: Optional[bytes] = None   # serialised np.ndarray (pickle)
    sat_hist: Optional[bytes] = None
    phash: Optional[str] = None         # 64-bit hash as hex string

    # Transcript lines, filled in after the Whisper pass.
    dialogue: tuple[DialogueLine, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Scene length in seconds (``end_s - start_s``)."""
        return self.end_s - self.start_s

    @property
    def midpoint_s(self) -> float:
        """Time halfway through the scene, in seconds."""
        half = self.duration_s / 2.0
        return self.start_s + half

    def __repr__(self) -> str:
        span = f"{self.start_s:.2f}s–{self.end_s:.2f}s"
        return f"Scene(id={self.scene_id}, {span}, dur={self.duration_s:.2f}s)"
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Phase 1 — Reference-trailer beat
|
||||
# ===========================================================================
|
||||
|
||||
@dataclass(frozen=True)
class TrailerBeat:
    """
    A single cut / segment of the reference trailer.

    The beat is the trailer's atomic unit: each one corresponds to exactly
    one clip that is later sourced from the original movie.
    """

    beat_id: int
    trailer_path: Path
    start_s: float       # seconds
    end_s: float         # seconds
    start_frame: int
    end_frame: int

    # Assigned by the LLM dramaturgy pass.
    beat_type: BeatType = BeatType.UNKNOWN

    # Fingerprints of the *middle* frame, filled in by the CV pipeline.
    luma_hist: Optional[bytes] = None
    sat_hist: Optional[bytes] = None
    phash: Optional[str] = None

    # Transcript lines extracted from this beat.
    dialogue: tuple[DialogueLine, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Beat length in seconds (``end_s - start_s``)."""
        return self.end_s - self.start_s

    @property
    def midpoint_s(self) -> float:
        """Time halfway through the beat, in seconds."""
        half = self.duration_s / 2.0
        return self.start_s + half

    def __repr__(self) -> str:
        span = f"{self.start_s:.2f}s–{self.end_s:.2f}s"
        return f"TrailerBeat(id={self.beat_id}, {self.beat_type.name}, {span})"
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Phase 2 — CV match result
|
||||
# ===========================================================================
|
||||
|
||||
@dataclass(frozen=True)
class VibeHit:
    """
    Candidate produced by the Vibe Check (histogram / pHash) filter.

    A hit marks a scene that survived the coarse filter for a beat. It is
    not a confirmed match yet and is handed on to Deep Scan.
    """

    beat_id: int
    scene_id: int
    hist_score: float       # histogram similarity [0.0, 1.0] (CORREL method)
    phash_distance: int     # Hamming distance [0, 64]; smaller is closer
    combined_score: float   # weighted aggregate used for ranking
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MatchSegment:
    """
    A source-backed visual island within one trailer beat.

    A beat can hold several shots split by fades or title frames; one
    continuous source in/out range cannot describe such a beat, so it is
    represented as several segments.
    """

    trailer_offset_s: float   # seconds
    duration_s: float         # seconds
    scene_id: int
    in_point_s: float         # seconds
    out_point_s: float        # seconds
    match_score: float
    is_confirmed: bool = True
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MatchResult:
    """
    Outcome of the Deep Scan (template matching) phase for one beat.

    There is one MatchResult per TrailerBeat, describing the best
    frame-accurate hit located in the source movie.
    """

    beat_id: int          # trailer beat that was matched
    scene_id: int         # source scene holding the match
    source_path: Path     # absolute path of the source video

    # In / out points, expressed in SOURCE movie time.
    in_point_s: float     # onset of the matched frame, seconds
    out_point_s: float    # in-point plus beat duration
    in_point_frame: int   # frame number of the matched frame

    # Quality of the match.
    match_score: float    # cv2.matchTemplate peak value [0.0, 1.0]
    # (x, y) pixel position of the best match inside the source frame.
    match_location: tuple[int, int] = field(default_factory=lambda: (0, 0))

    # Provenance.
    vibe_hit: Optional[VibeHit] = None   # candidate that led to this result
    is_confirmed: bool = True
    segments: tuple[MatchSegment, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Length of the matched range (``out_point_s - in_point_s``)."""
        return self.out_point_s - self.in_point_s

    def __repr__(self) -> str:
        target = f"beat={self.beat_id} → scene={self.scene_id}"
        quality = f"in={self.in_point_s:.3f}s, score={self.match_score:.3f}"
        return f"MatchResult({target}, {quality})"
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Phase 3 — Edit timeline (pre-export)
|
||||
# ===========================================================================
|
||||
|
||||
@dataclass(frozen=True)
class EditClip:
    """
    A clip placed on the final edit timeline, ready for FCPXML / EDL export.

    It pairs the trailer beat (dramaturgy) with the CV-confirmed source
    in/out points of its match.
    """

    clip_index: int   # 0-based position on the timeline
    beat: TrailerBeat
    match: MatchResult

    # Placement in the OUTPUT trailer, seconds.
    timeline_start_s: float
    timeline_end_s: float
    source_duration_s: float | None = None
    trailer_tail_s: float = 0.0

    # Optional audio override (e.g. VO or music).
    audio_path: Optional[Path] = None
    audio_offset_s: float = 0.0

    @property
    def timeline_duration_s(self) -> float:
        """Span occupied on the timeline (``timeline_end_s - timeline_start_s``)."""
        return self.timeline_end_s - self.timeline_start_s

    @property
    def source_timeline_duration_s(self) -> float:
        """
        Duration taken from the source: ``source_duration_s`` clamped at
        zero when it is set, otherwise the timeline duration.
        """
        if self.source_duration_s is None:
            return self.timeline_duration_s
        return max(0.0, self.source_duration_s)

    def __repr__(self) -> str:
        placement = f"tl={self.timeline_start_s:.2f}s–{self.timeline_end_s:.2f}s"
        return f"EditClip(#{self.clip_index}, {placement}, src={self.match.in_point_s:.3f}s)"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EditTimeline:
    """
    Ordered sequence of EditClips making up the whole trailer.

    This is the object handed to the export layer (FCPXML / EDL writer).
    """

    title: str
    frame_rate: float             # e.g. 23.976
    clips: tuple[EditClip, ...]   # ordered by clip_index

    @property
    def total_duration_s(self) -> float:
        """Largest ``timeline_end_s`` among the clips; 0.0 when there are none."""
        return max((clip.timeline_end_s for clip in self.clips), default=0.0)

    @property
    def clip_count(self) -> int:
        """Number of clips on the timeline."""
        return len(self.clips)
|
||||
@@ -0,0 +1 @@
|
||||
# src.cv package — Computer Vision engine
|
||||
@@ -0,0 +1,240 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image, ImageFilter, ImageOps
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import TrailerBeat
|
||||
|
||||
|
||||
def _run(cmd: list[str]) -> None:
|
||||
result = subprocess.run(cmd, capture_output=True)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(result.stderr.decode(errors="replace"))
|
||||
|
||||
|
||||
def _extract_frames(
    video_path: Path,
    start_s: float,
    duration_s: float,
    fps: float,
    out_dir: Path,
    prefix: str,
) -> None:
    """
    Render a span of *video_path* to numbered 640x360 PNGs using ffmpeg.

    Args:
        video_path: Video to decode.
        start_s:    Seek position in seconds (negative values are clamped to 0).
        duration_s: Span to render in seconds (values below 0.04 are raised to 0.04).
        fps:        Output frame rate passed to ffmpeg's ``fps`` filter.
        out_dir:    Destination directory; created when missing.
        prefix:     File name prefix; frames are written as ``{prefix}_%04d.png``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero code (raised by ``_run``).
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    seek_s = max(0.0, start_s)
    span_s = max(0.04, duration_s)
    frame_pattern = out_dir / f"{prefix}_%04d.png"

    command = ["ffmpeg", "-y", "-loglevel", "error"]
    command += ["-ss", str(seek_s)]
    command += ["-i", str(video_path)]
    command += ["-t", str(span_s)]
    command += ["-vf", f"scale=640:360,fps={fps}"]
    command.append(str(frame_pattern))
    _run(command)
|
||||
|
||||
|
||||
def _cropped_image(path: Path, cfg: AppConfig) -> Image.Image:
    """
    Load *path* as greyscale, trim dark borders and drop 5 % of rows at top and bottom.

    *cfg* is accepted for signature parity with the other feature helpers
    but is not read here.
    """
    gray = _trim_dark_borders(Image.open(path).convert("L"))
    width, height = gray.size
    # Final validation should see the composition. The broader text-safe crop
    # used for coarse search can remove bodies, furniture and lower-frame
    # spatial cues that distinguish otherwise similar face/window shots.
    upper = int(height * 0.05)
    lower = int(height * 0.95)
    return gray.crop((0, upper, width, lower))
|
||||
|
||||
|
||||
def _trim_dark_borders(image: Image.Image) -> Image.Image:
    """
    Remove encoded black matte/pillarbox borders before content scoring.

    A column (row) counts as active when the 90th percentile of its
    greyscale values exceeds 18. The crop spans the first to the last active
    column/row with a small pad (2 px before, 3 px after, clamped to the
    image). An axis is left untouched when it has too few active lines, and
    the original image is returned unchanged when the crop would keep less
    than 35 % of either dimension.
    """
    pixels = np.asarray(image.convert("L"), dtype=np.float32)
    if pixels.size == 0:
        return image

    height, width = pixels.shape[:2]
    min_width = int(width * 0.35)
    min_height = int(height * 0.35)

    lit_cols = np.flatnonzero(np.percentile(pixels, 90, axis=0) > 18.0)
    lit_rows = np.flatnonzero(np.percentile(pixels, 90, axis=1) > 18.0)

    # Horizontal extent: only trusted when enough columns carry signal.
    left, right = 0, width
    if lit_cols.size >= max(8, min_width):
        left = max(0, int(lit_cols[0]) - 2)
        right = min(width, int(lit_cols[-1]) + 3)

    # Vertical extent: same rule applied to rows.
    top, bottom = 0, height
    if lit_rows.size >= max(8, min_height):
        top = max(0, int(lit_rows[0]) - 2)
        bottom = min(height, int(lit_rows[-1]) + 3)

    if right - left < min_width or bottom - top < min_height:
        return image
    return image.crop((left, top, right, bottom))
|
||||
|
||||
|
||||
def _feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """
    Build the standardised edge-map descriptor for one frame image.

    The cropped greyscale frame loses a further 10 % on every side, is
    histogram-equalised, edge-filtered and resized to 160x62; the result is
    shifted to zero mean and divided by its standard deviation.
    """
    base = _cropped_image(path, cfg)
    width, height = base.size
    inner_box = (
        int(width * 0.10),
        int(height * 0.10),
        int(width * 0.90),
        int(height * 0.90),
    )
    edges = ImageOps.equalize(base.crop(inner_box)).filter(ImageFilter.FIND_EDGES)
    values = np.asarray(edges.resize((160, 62)), dtype=np.float32)
    centred = values - values.mean()
    return centred / (values.std() + 1e-6)   # epsilon guards flat images
|
||||
|
||||
|
||||
def _luma_feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """
    Build the standardised luminance descriptor for one frame image.

    The cropped greyscale frame is histogram-equalised, resized to 160x80,
    then shifted to zero mean and divided by its standard deviation.
    """
    equalised = ImageOps.equalize(_cropped_image(path, cfg))
    values = np.asarray(equalised.resize((160, 80)), dtype=np.float32)
    centred = values - values.mean()
    return centred / (values.std() + 1e-6)   # epsilon guards flat images
|
||||
|
||||
|
||||
def _hist_feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """
    Build a global RGB histogram descriptor for one frame image.

    The border-trimmed frame loses 5 % of rows at top and bottom and is
    resized to 160x80. Each of the three channels contributes a 32-bin
    histogram normalised to sum to ~1, giving 96 values in total.
    *cfg* is not read here.
    """
    rgb = _trim_dark_borders(Image.open(path).convert("RGB"))
    width, height = rgb.size
    band = rgb.crop((0, int(height * 0.05), width, int(height * 0.95)))
    pixels = np.asarray(band.resize((160, 80)), dtype=np.float32)

    def _channel_hist(channel: int) -> np.ndarray:
        counts, _ = np.histogram(pixels[:, :, channel], bins=32, range=(0, 255))
        counts = counts.astype(np.float32)
        return counts / (counts.sum() + 1e-6)

    return np.concatenate([_channel_hist(channel) for channel in range(3)])
|
||||
|
||||
|
||||
def _spatial_hist_feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """
    Build a grid-localised RGB histogram descriptor for one frame image.

    The frame is prepared like the global histogram (border trim, 5 % rows
    off top and bottom, resize to 160x80) and then split into a 4x4 grid.
    Every cell contributes one normalised 16-bin histogram per channel, in
    row-major cell order, for 768 values in total. *cfg* is not read here.
    """
    rgb = _trim_dark_borders(Image.open(path).convert("RGB"))
    width, height = rgb.size
    band = rgb.crop((0, int(height * 0.05), width, int(height * 0.95)))
    pixels = np.asarray(band.resize((160, 80)), dtype=np.float32)

    rows, cols = 4, 4
    tile_h = pixels.shape[0] // rows
    tile_w = pixels.shape[1] // cols

    parts = []
    for tile_no in range(rows * cols):
        row, col = divmod(tile_no, cols)
        tile = pixels[row * tile_h:(row + 1) * tile_h, col * tile_w:(col + 1) * tile_w, :]
        for channel in range(3):
            counts = np.histogram(tile[:, :, channel], bins=16, range=(0, 255))[0]
            counts = counts.astype(np.float32)
            parts.append(counts / (counts.sum() + 1e-6))
    return np.concatenate(parts)
|
||||
|
||||
|
||||
def _is_dark(path: Path, cfg: AppConfig) -> bool:
    """
    Report whether a frame image is too dark to be a useful template.

    After border trimming and removal of 5 % of rows at top and bottom, the
    frame is dark when its mean grey level is below 28 and its 90th
    percentile is below 58. *cfg* is not read here.
    """
    gray = _trim_dark_borders(Image.open(path).convert("L"))
    width, height = gray.size
    band = gray.crop((0, int(height * 0.05), width, int(height * 0.95)))
    luma = np.asarray(band, dtype=np.float32)

    if not float(luma.mean()) < 28.0:
        return False
    return float(np.percentile(luma, 90)) < 58.0
|
||||
|
||||
|
||||
def _corr(a: np.ndarray, b: np.ndarray) -> float:
|
||||
return float((a * b).mean())
|
||||
|
||||
|
||||
def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float:
|
||||
return float(np.minimum(a, b).sum() / (np.maximum(a, b).sum() + 1e-6))
|
||||
|
||||
|
||||
def _paired_frame_score(ref_path: Path, src_path: Path, cfg: AppConfig) -> float:
    """
    Score how well one reference frame matches one source frame.

    Four similarities are blended with fixed weights: edge correlation
    (0.24), luminance correlation (0.24), global colour-histogram overlap
    (0.14) and grid-localised colour-histogram overlap (0.38).
    """
    weighted_edge = _corr(_feature(ref_path, cfg), _feature(src_path, cfg)) * 0.24
    weighted_luma = _corr(_luma_feature(ref_path, cfg), _luma_feature(src_path, cfg)) * 0.24
    weighted_hist = _hist_intersection(
        _hist_feature(ref_path, cfg),
        _hist_feature(src_path, cfg),
    ) * 0.14
    weighted_spatial = _hist_intersection(
        _spatial_hist_feature(ref_path, cfg),
        _spatial_hist_feature(src_path, cfg),
    ) * 0.38
    return weighted_edge + weighted_luma + weighted_hist + weighted_spatial
|
||||
|
||||
|
||||
def align_cached_match_by_content(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
    fps: float = 25.0,
) -> tuple[float, float]:
    """
    Measure the local source offset directly from rendered frame content.

    This is intentionally independent from the global OpenCV matcher: it only
    needs FFmpeg, Pillow and numpy, and it scans a small window around an
    already plausible candidate.

    Args:
        beat:                 Trailer beat whose frames act as the reference.
        estimated_in_point_s: Candidate in-point in the source movie (seconds).
        cfg:                  Application configuration (paths and
                              ``cv.deep_scan.content_align_*`` settings).
        search_window_s:      Half-width of the search window in seconds;
                              ``None`` falls back to the configured value.
        fps:                  Frame rate at which both clips are rendered.

    Returns:
        ``(in_point_s, score)``. ``(estimated_in_point_s, 0.0)`` when either
        clip yields no frames. When no candidate offset gathers enough frame
        pairs, the estimate snapped to the frame grid is returned with
        score 0.0.

    Raises:
        RuntimeError: If ffmpeg fails while rendering frames.
    """
    align_cfg = cfg.cv.deep_scan
    window_s = (
        search_window_s
        if search_window_s is not None
        else align_cfg.content_align_window_seconds
    )
    sample_step_s = max(1.0 / fps, align_cfg.content_align_sample_step_s)
    source_start_s = max(0.0, estimated_in_point_s - window_s)
    # Beat length plus the window on both sides, with half a second of slack.
    source_duration_s = beat.duration_s + (2.0 * window_s) + 0.5

    tmp = cfg.paths.output_dir / "align_tmp" / f"beat_{beat.beat_id:03d}"
    shutil.rmtree(tmp, ignore_errors=True)   # drop leftovers from an earlier run
    tmp.mkdir(parents=True, exist_ok=True)
    try:
        ref_dir = tmp / "ref"
        src_dir = tmp / "src"
        _extract_frames(beat.trailer_path, beat.start_s, beat.duration_s, fps, ref_dir, "ref")
        _extract_frames(cfg.paths.source_movie, source_start_s, source_duration_s, fps, src_dir, "src")

        ref_frames = sorted(ref_dir.glob("ref_*.png"))
        src_frames = sorted(src_dir.glob("src_*.png"))
        if not ref_frames or not src_frames:
            return estimated_in_point_s, 0.0

        sample_frame_step = max(1, int(round(sample_step_s * fps)))
        # The last ~0.24 s of the beat is left out of the comparison.
        min_matchable_frames = max(1, len(ref_frames) - int(round(0.24 * fps)))
        sampled_offsets = list(range(0, min_matchable_frames, sample_frame_step))

        # Prefer non-dark reference frames; with fewer than three of them,
        # fall back to every sampled frame. Only the offsets are needed here:
        # scoring re-derives all features from the frame files.
        template_offsets = [
            idx for idx in sampled_offsets if not _is_dark(ref_frames[idx], cfg)
        ]
        if len(template_offsets) < 3:
            template_offsets = sampled_offsets

        # A candidate needs at least 3 pairs and 65 % of the templates.
        # NOTE(review): with fewer than 3 templates no candidate can qualify,
        # so very short beats always return the estimate — confirm intended.
        required_pairs = max(3, math.ceil(len(template_offsets) * 0.65))

        search_end_frame = max(0, len(src_frames) - min_matchable_frames)
        estimated_frame = int(round((estimated_in_point_s - source_start_s) * fps))
        best_frame = estimated_frame
        best_score = -1.0

        for candidate_frame in range(search_end_frame + 1):
            scores: list[float] = []
            for offset_frame in template_offsets:
                src_idx = candidate_frame + offset_frame
                if src_idx >= len(src_frames):
                    break   # offsets ascend, so later ones are out of range too
                scores.append(
                    _paired_frame_score(ref_frames[offset_frame], src_frames[src_idx], cfg)
                )
            if len(scores) < required_pairs:
                continue

            avg_score = sum(scores) / len(scores)
            min_score = min(scores)
            # Blend mean and worst pair so one bad frame still pulls the score down.
            score = (avg_score * 0.68) + (min_score * 0.32)
            if score > best_score + 0.003:
                best_score = score
                best_frame = candidate_frame
            elif (
                score >= best_score - 0.003
                and abs(candidate_frame - estimated_frame) < abs(best_frame - estimated_frame)
            ):
                # Near-tie: take the frame closer to the estimate, keep the score.
                best_frame = candidate_frame

        return source_start_s + (best_frame / fps), max(0.0, best_score)
    finally:
        shutil.rmtree(tmp, ignore_errors=True)
|
||||
@@ -0,0 +1,253 @@
|
||||
"""
|
||||
src/cv/deep_scan.py — Phase 2: Frame-accurate template matching (Deep Scan)
|
||||
|
||||
Responsibility:
|
||||
Given a TrailerBeat and a ranked list of VibeHit candidates, open the
|
||||
source video and scan each candidate scene in two passes:
|
||||
|
||||
1. Coarse pass: step through at coarse_step_seconds intervals,
|
||||
comparing via cv2.matchTemplate.
|
||||
2. Refine pass: if coarse score > threshold, zoom in ± refine_window_seconds
|
||||
at refine_step_seconds resolution to pin the exact in-point.
|
||||
|
||||
Returns a MatchResult if a confident hit is found, otherwise None.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Sequence
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import MatchResult, Scene, TrailerBeat, VibeHit
|
||||
from src.cv.fingerprinting import text_safe_crop
|
||||
from src.cv.frame_extractor import (
|
||||
grab_frame_at,
|
||||
grab_frame_at_path,
|
||||
iter_frames_stepped,
|
||||
open_video,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Template preparation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _prepare_template(
    trailer_beat: TrailerBeat,
    cfg: AppConfig,
    proxy_w: int,
    proxy_h: int,
) -> np.ndarray | None:
    """
    Extract, crop, and resize the representative frame from the trailer beat.

    This frame becomes the cv2.matchTemplate "needle".

    Args:
        trailer_beat: Beat whose midpoint frame is used.
        cfg:          Application configuration (crop fractions come from
                      ``cfg.cv.vibe_check``).
        proxy_w:      Proxy frame width in pixels.
        proxy_h:      Proxy frame height in pixels.

    Returns:
        The proxy-sized frame with 10 % trimmed from every side, or None when
        the midpoint frame cannot be decoded.
    """
    vc = cfg.cv.vibe_check

    beat_frame = grab_frame_at_path(
        trailer_beat.trailer_path,
        trailer_beat.midpoint_s,
    )
    if beat_frame is None:
        logger.warning("Beat %d: cannot decode midpoint frame.", trailer_beat.beat_id)
        return None

    cropped = text_safe_crop(beat_frame, vc.crop_top_fraction, vc.crop_bottom_fraction)
    resized = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA)

    # Crop the template by 10% on all sides to allow sliding window (translation invariance)
    # when matching against the source movie, which might have slight pan/scan shifts.
    margin_y = int(proxy_h * 0.10)
    margin_x = int(proxy_w * 0.10)
    return resized[margin_y : proxy_h - margin_y, margin_x : proxy_w - margin_x]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Single-frame match
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _match_frame(
    source_frame: np.ndarray,
    template: np.ndarray,
    method: int,
    proxy_w: int,
    proxy_h: int,
    crop_top: float,
    crop_bottom: float,
) -> tuple[float, tuple[int, int]]:
    """
    Run cv2.matchTemplate between *source_frame* and *template*.

    The source frame gets the same text-safe crop and proxy resize as the
    template source, so the slightly smaller template can slide inside it.

    Args:
        source_frame: Frame from the source movie.
        template:     Needle produced by ``_prepare_template``.
        method:       cv2.TM_* matching method.
        proxy_w:      Proxy frame width in pixels.
        proxy_h:      Proxy frame height in pixels.
        crop_top:     Fraction of height removed from the top.
        crop_bottom:  Fraction of height removed from the bottom.

    Returns:
        (score, (x, y)) where higher scores are always better. For
        TM_CCOEFF_NORMED the score lies in [-1, 1]. For the squared-difference
        methods (TM_SQDIFF, TM_SQDIFF_NORMED) the best match is the minimum
        of the response map, so the score is ``1 - min`` and (x, y) is the
        minimum's location.
    """
    cropped = text_safe_crop(source_frame, crop_top, crop_bottom)
    haystack = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA)

    # Match the slightly smaller template inside the full proxy frame
    result = cv2.matchTemplate(haystack, template, method)
    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)

    if method in (cv2.TM_SQDIFF, cv2.TM_SQDIFF_NORMED):
        # Lower is better here; invert so callers can keep using "score > best".
        score, loc = 1.0 - min_val, min_loc
    else:
        score, loc = max_val, max_loc
    return float(score), (int(loc[0]), int(loc[1]))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Deep Scan core
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def scan_scene(
    beat: TrailerBeat,
    scene: Scene,
    template: np.ndarray,
    cfg: AppConfig,
) -> tuple[float, float, tuple[int, int]] | None:
    """
    Scan one source scene in two passes (coarse, then refine).

    The coarse pass samples the whole scene at ``coarse_step_seconds``. If
    its best score reaches ``match_threshold``, the refine pass re-samples
    ± ``refine_window_seconds`` around that hit (clamped to the scene) at
    ``refine_step_seconds``.

    Returns:
        (best_timestamp_s, best_score, best_location) or None if no hit.
    """
    ds = cfg.cv.deep_scan
    vc = cfg.cv.vibe_check
    proxy_w = cfg.video.proxy_width
    proxy_h = cfg.video.proxy_height

    def _sweep(cap, window_start, window_end, step_s, seed):
        """Return the best (t, score, loc) in the window, starting from *seed*."""
        top_t, top_score, top_loc = seed
        for t, frame in iter_frames_stepped(cap, window_start, window_end, step_s):
            score, loc = _match_frame(
                frame, template, ds.match_method,
                proxy_w, proxy_h, vc.crop_top_fraction, vc.crop_bottom_fraction,
            )
            if score > top_score:
                top_t, top_score, top_loc = t, score, loc
        return top_t, top_score, top_loc

    with open_video(scene.source_path) as cap:
        # ---- Coarse pass ----------------------------------------------------
        coarse = _sweep(
            cap, scene.start_s, scene.end_s, ds.coarse_step_seconds,
            (scene.start_s, 0.0, (0, 0)),
        )
        coarse_t, coarse_score, _ = coarse
        if coarse_score < ds.match_threshold:
            return None  # scene doesn't contain a match worth refining

        # ---- Refine pass ----------------------------------------------------
        window_start = max(scene.start_s, coarse_t - ds.refine_window_seconds)
        window_end = min(scene.end_s, coarse_t + ds.refine_window_seconds)
        refined_t, refined_score, refined_loc = _sweep(
            cap, window_start, window_end, ds.refine_step_seconds, coarse,
        )

        logger.debug(
            "Beat %d → Scene %d: coarse=%.3f refined=%.3f @%.3fs",
            beat.beat_id, scene.scene_id, coarse_score, refined_score, refined_t,
        )
        return refined_t, refined_score, refined_loc
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_deep_scan(
    beat: TrailerBeat,
    candidates: Sequence[VibeHit],
    scenes_by_id: dict[int, Scene],
    cfg: AppConfig,
) -> MatchResult | None:
    """
    Phase 2 Deep Scan: iterate over Vibe Check candidates and template-match.

    Args:
        beat:         The trailer beat to source.
        candidates:   Ranked VibeHit list from Phase 1 (best first).
        scenes_by_id: Lookup dict: scene_id → Scene.
        cfg:          Application configuration.

    Returns:
        The best MatchResult above threshold, or None if no match found
        (also None when the beat's template frame cannot be prepared).
    """
    early_exit_score = 0.90   # a hit this strong ends the candidate loop

    template = _prepare_template(beat, cfg, cfg.video.proxy_width, cfg.video.proxy_height)
    if template is None:
        return None

    # FPS per source file, so the video is opened for metadata once per file
    # rather than once per hit.
    fps_by_source: dict[Path, float] = {}
    best_result: MatchResult | None = None

    for vibe_hit in candidates:
        scene = scenes_by_id.get(vibe_hit.scene_id)
        if scene is None:
            logger.warning("VibeHit references unknown scene_id=%d", vibe_hit.scene_id)
            continue

        hit = scan_scene(beat, scene, template, cfg)
        if hit is None:
            continue
        in_point_s, match_score, match_loc = hit

        # Frame number: approximate via FPS (refined later if needed)
        fps = fps_by_source.get(scene.source_path)
        if fps is None:
            from src.cv.frame_extractor import get_video_info

            # A reported FPS of 0 falls back to 24.
            fps = float(get_video_info(scene.source_path)["fps"]) or 24.0
            fps_by_source[scene.source_path] = fps
        in_point_frame = int(in_point_s * fps)

        candidate_result = MatchResult(
            beat_id=beat.beat_id,
            scene_id=scene.scene_id,
            source_path=scene.source_path,
            in_point_s=in_point_s,
            out_point_s=in_point_s + beat.duration_s,
            in_point_frame=in_point_frame,
            match_score=match_score,
            match_location=match_loc,
            vibe_hit=vibe_hit,
        )

        if best_result is None or match_score > best_result.match_score:
            best_result = candidate_result

        # Early exit: if score is very high, no need to check other candidates
        if match_score >= early_exit_score:
            logger.info(
                "Beat %d: early-exit match (score=%.3f) in scene %d @%.3fs",
                beat.beat_id, match_score, scene.scene_id, in_point_s,
            )
            break

    if best_result is not None:
        logger.info("Beat %d → MATCH scene=%d score=%.3f in=%.3fs",
                    beat.beat_id, best_result.scene_id,
                    best_result.match_score, best_result.in_point_s)
    else:
        logger.warning("Beat %d → NO MATCH found in %d candidates.",
                       beat.beat_id, len(candidates))

    return best_result
|
||||
@@ -0,0 +1,228 @@
|
||||
"""
|
||||
src/cv/fingerprinting.py — Image fingerprinting for the Vibe Check phase
|
||||
|
||||
Responsibilities (Single Responsibility Principle):
|
||||
- Text-Safe Crop: strip top/bottom fractions to hide logos & letterbox
|
||||
- Luma + Saturation histogram extraction (scale-invariant)
|
||||
- Perceptual hash (pHash) via imagehash
|
||||
|
||||
This module is PURELY functional — no file I/O, no video decoding,
|
||||
no search logic. It takes numpy arrays and returns numeric descriptors.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pickle
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
import imagehash
|
||||
from PIL import Image as PilImage
|
||||
_HAS_IMAGEHASH = True
|
||||
except ImportError:
|
||||
_HAS_IMAGEHASH = False
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from src.core.config import VibeCheckConfig
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Text-Safe Crop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def text_safe_crop(
    frame: np.ndarray,
    crop_top: float,
    crop_bottom: float,
) -> np.ndarray:
    """
    Slice away the top and bottom fractions of a frame.

    Dropping these bands removes title cards and logos (top) as well as
    letterbox bars and subtitles (bottom) before any colour analysis,
    which prevents false positives.

    Args:
        frame:       BGR or greyscale frame as (H, W[, C]) ndarray.
        crop_top:    Fraction [0, 1) of height to remove from the top.
        crop_bottom: Fraction [0, 1) of height to remove from the bottom.

    Returns:
        Cropped view (no copy — avoids memory overhead).

    Raises:
        ValueError: If crop fractions are out of range or overlap.
    """
    if not (0.0 <= crop_top < 1.0):
        raise ValueError(f"crop_top must be in [0, 1); got {crop_top}")
    if not (0.0 <= crop_bottom < 1.0):
        raise ValueError(f"crop_bottom must be in [0, 1); got {crop_bottom}")
    if 1.0 <= crop_top + crop_bottom:
        raise ValueError(
            f"crop_top ({crop_top}) + crop_bottom ({crop_bottom}) must be < 1.0"
        )

    height = frame.shape[0]
    kept_rows = slice(int(height * crop_top), int(height * (1.0 - crop_bottom)))
    return frame[kept_rows]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Histogram extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def extract_hs_histograms(
    frame_bgr: np.ndarray,
    bins_luma: int | None = None,
    bins_sat: int | None = None,
    *,
    bins_hue: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute normalised luma and saturation histograms from a BGR frame.

    Luma is the greyscale conversion of the frame and saturation is the S
    channel of its HSV conversion. Hue is ignored because it is highly
    sensitive to colour-grading differences between the trailer and the
    source movie.

    Args:
        frame_bgr: BGR frame (H, W, 3) uint8.
        bins_luma: Number of histogram bins for the luma channel over [0, 256).
        bins_sat:  Number of histogram bins for the saturation channel over [0, 256).
        bins_hue:  Backwards-compatible alias for bins_luma.

    Returns:
        (luma_hist, sat_hist) — each a 1-D float32 ndarray, L2-normalised.

    Raises:
        ValueError: If bins_luma and bins_hue are both given and differ.
        TypeError:  If the luma bin count (either spelling) or bins_sat is missing.
    """
    # Resolve the legacy alias before validating.
    if bins_luma is None:
        bins_luma = bins_hue
    elif bins_hue is not None and bins_hue != bins_luma:
        raise ValueError("bins_hue is an alias for bins_luma; pass only one value")
    if bins_luma is None or bins_sat is None:
        raise TypeError("bins_luma/bins_hue and bins_sat are required")

    hsv = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV)
    # Use perceptual grayscale luma rather than HSV Value. Value would make
    # saturated red and blue look identical, weakening the scene-level filter.
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)

    luma_raw = cv2.calcHist([gray], [0], None, [bins_luma], [0, 256])
    luma_hist = luma_raw.flatten().astype(np.float32)
    sat_raw = cv2.calcHist([hsv], [1], None, [bins_sat], [0, 256])
    sat_hist = sat_raw.flatten().astype(np.float32)

    # L2-normalise in place so scene size doesn't affect scores
    for hist in (luma_hist, sat_hist):
        cv2.normalize(hist, hist, alpha=1.0, norm_type=cv2.NORM_L2)

    return luma_hist, sat_hist
|
||||
|
||||
|
||||
def compare_histograms(
    hist_a: np.ndarray,
    hist_b: np.ndarray,
    method: int,
) -> float:
    """
    Score two histograms against each other with cv2.compareHist.

    Args:
        hist_a, hist_b: 1-D float32 ndarrays of identical shape.
        method:         cv2.HISTCMP_* constant (e.g. cv2.HISTCMP_CORREL = 0).

    Returns:
        Raw score from cv2.compareHist (range depends on method).
        For CORREL:        [-1, 1], higher = more similar.
        For BHATTACHARYYA: [0, 1],  lower  = more similar.
    """
    raw_score = cv2.compareHist(hist_a, hist_b, method)
    return float(raw_score)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Perceptual Hash
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def compute_phash(frame_bgr: np.ndarray, hash_size: int = 8) -> str:
    """
    Compute a perceptual hash (pHash) of a BGR frame.

    The frame is converted to RGB, wrapped in a PIL image and hashed with
    ``imagehash.phash``.

    Args:
        frame_bgr: BGR frame (H, W, 3) uint8.
        hash_size: Size passed to ``imagehash.phash``; 8 → 64-bit hash (default).

    Returns:
        Hex string representation of the hash (e.g. "f8e0e0e0...").

    Raises:
        RuntimeError: If imagehash is not installed.
    """
    if _HAS_IMAGEHASH:
        pil_image = PilImage.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
        return str(imagehash.phash(pil_image, hash_size=hash_size))
    raise RuntimeError(
        "imagehash is not installed. Run: pip install imagehash"
    )
|
||||
|
||||
|
||||
def phash_distance(hash_a: str, hash_b: str) -> int:
    """
    Return the Hamming distance between two pHash hex strings.

    Args:
        hash_a, hash_b: Hex strings as returned by compute_phash().

    Returns:
        Integer Hamming distance; 0 = identical. For the default 64-bit
        hashes the range is [0, 64].

    Raises:
        RuntimeError: If imagehash is not installed.
    """
    if _HAS_IMAGEHASH:
        left = imagehash.hex_to_hash(hash_a)
        right = imagehash.hex_to_hash(hash_b)
        return int(left - right)
    raise RuntimeError("imagehash is not installed.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Serialisation helpers (histograms ↔ bytes for caching)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def hist_to_bytes(hist: np.ndarray) -> bytes:
    """Pickle a numpy histogram array (highest protocol) for storage in a Scene/Beat model."""
    payload = pickle.dumps(hist, protocol=pickle.HIGHEST_PROTOCOL)
    return payload
|
||||
|
||||
|
||||
def bytes_to_hist(data: bytes) -> np.ndarray:
    """
    Unpickle a numpy histogram array from bytes.

    Unpickling executes whatever the payload encodes, so *data* must come
    from the trusted internal cache only.
    """
    restored = pickle.loads(data)  # noqa: S301  (trusted internal cache only)
    return restored
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# High-level convenience: fingerprint one frame using config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fingerprint_frame(
    frame_bgr: np.ndarray,
    cfg: "VibeCheckConfig",
) -> tuple[bytes, bytes, str]:
    """
    Run the text-safe crop, histogram extraction and pHash in one call.

    Args:
        frame_bgr: Full BGR frame (H, W, 3) uint8.
        cfg:       VibeCheckConfig carrying crop fractions and bin counts
                   (``hist_bins_hue`` supplies the luma bin count).

    Returns:
        (luma_hist_bytes, sat_hist_bytes, phash_hex)
    """
    safe_area = text_safe_crop(frame_bgr, cfg.crop_top_fraction, cfg.crop_bottom_fraction)
    histograms = extract_hs_histograms(safe_area, cfg.hist_bins_hue, cfg.hist_bins_saturation)
    phash_hex = compute_phash(safe_area)

    luma_bytes, sat_bytes = (hist_to_bytes(hist) for hist in histograms)
    return luma_bytes, sat_bytes, phash_hex
|
||||
@@ -0,0 +1,172 @@
|
||||
"""
|
||||
src/cv/frame_extractor.py — Low-level video frame access
|
||||
|
||||
Responsibility:
|
||||
Provide a thin, testable wrapper around cv2.VideoCapture for:
|
||||
- seeking to an exact timestamp and returning one BGR frame
|
||||
- iterating frames with a configurable step size
|
||||
- extracting the "representative" middle frame of a Scene / TrailerBeat
|
||||
|
||||
No fingerprinting, no matching — only raw frame delivery.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Generator, Iterator
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Context-managed VideoCapture
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@contextmanager
def open_video(path: Path) -> Generator[cv2.VideoCapture, None, None]:
    """
    Open a VideoCapture for the duration of a ``with`` block and always release it.

    Args:
        path: Absolute path to the video file.

    Raises:
        FileNotFoundError: If the file does not exist.
        RuntimeError:      If OpenCV cannot open the file.
    """
    if not path.exists():
        raise FileNotFoundError(f"Video not found: {path}")

    capture = cv2.VideoCapture(str(path))
    if not capture.isOpened():
        raise RuntimeError(f"OpenCV could not open video: {path}")

    try:
        yield capture
    finally:
        # Runs on normal exit and when the with-body raises.
        capture.release()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Video metadata
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_video_info(path: Path) -> dict[str, float | int]:
    """
    Read basic metadata; the file is closed again before returning.

    Returns:
        dict with keys: fps, frame_count, duration_s, width, height.
        duration_s is frame_count / fps, or 0.0 when fps is not positive.
    """
    with open_video(path) as cap:
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count, width, height = (
            int(cap.get(prop))
            for prop in (
                cv2.CAP_PROP_FRAME_COUNT,
                cv2.CAP_PROP_FRAME_WIDTH,
                cv2.CAP_PROP_FRAME_HEIGHT,
            )
        )

    if fps > 0:
        duration_s = frame_count / fps
    else:
        duration_s = 0.0

    return dict(
        fps=fps,
        frame_count=frame_count,
        duration_s=duration_s,
        width=width,
        height=height,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Single frame extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def grab_frame_at(cap: cv2.VideoCapture, timestamp_s: float) -> np.ndarray | None:
    """
    Position *cap* at *timestamp_s* and read the next frame.

    The seek is done through CAP_PROP_POS_MSEC (milliseconds).

    Args:
        cap:         An already-open VideoCapture.
        timestamp_s: Target time in seconds.

    Returns:
        BGR ndarray (H, W, 3) or None if seeking / decoding failed.
    """
    cap.set(cv2.CAP_PROP_POS_MSEC, timestamp_s * 1000.0)
    ok, frame = cap.read()
    if ok and frame is not None:
        return frame
    logger.debug("grab_frame_at: failed at %.3fs", timestamp_s)
    return None
|
||||
|
||||
|
||||
def grab_frame_at_path(path: Path, timestamp_s: float) -> np.ndarray | None:
    """
    One-shot convenience: open, seek, grab, release.

    Prefer open_video() when grabbing multiple frames from the same file.

    Returns:
        The frame at *timestamp_s*, or None if it could not be decoded.
    """
    with open_video(path) as cap:
        frame = grab_frame_at(cap, timestamp_s)
    return frame
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Middle-frame extraction (representative frame for fingerprinting)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def grab_midpoint_frame(
    cap: cv2.VideoCapture,
    start_s: float,
    end_s: float,
) -> np.ndarray | None:
    """
    Grab the frame halfway through the [start_s, end_s] interval.

    Args:
        cap:     Open VideoCapture for the source video.
        start_s: Interval start in seconds.
        end_s:   Interval end in seconds.

    Returns:
        BGR frame or None if decoding failed.
    """
    half_span_s = (end_s - start_s) / 2.0
    return grab_frame_at(cap, start_s + half_span_s)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stepped-frame iterator (used by Deep Scan coarse pass)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def iter_frames_stepped(
    cap: cv2.VideoCapture,
    start_s: float,
    end_s: float,
    step_s: float,
) -> Iterator[tuple[float, np.ndarray]]:
    """
    Yield (timestamp_s, frame) for every *step_s* increment in [start_s, end_s].

    The first sample is taken at exactly *start_s*; sample ``i`` (i >= 1) is
    taken at ``round(start_s + i * step_s, 6)``. Frames that fail to decode
    are silently skipped.

    Args:
        cap:     Open VideoCapture.
        start_s: Scan window start in seconds.
        end_s:   Scan window end in seconds (inclusive).
        step_s:  Step between samples in seconds.

    Yields:
        (timestamp_s, frame)

    Raises:
        ValueError: If *step_s* is not strictly positive. Because this is a
            generator, the error surfaces on the first iteration, not at
            call time.
    """
    if step_s <= 0:
        raise ValueError(f"step_s must be > 0; got {step_s}")

    index = 0
    t = start_s
    while t <= end_s:
        frame = grab_frame_at(cap, t)
        if frame is not None:
            yield t, frame
        index += 1
        # Derive every timestamp from the window start and an integer index.
        # Repeatedly rounding a running sum (t = round(t + step_s, 6)) adds a
        # systematic error whenever step_s is not a multiple of 1 µs, and
        # never advances at all when step_s is below half a microsecond.
        t = round(start_s + index * step_s, 6)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,229 @@
|
||||
"""
|
||||
src/cv/scene_indexer.py — Source-movie scene segmentation + fingerprinting
|
||||
|
||||
Responsibility:
|
||||
1. Run PySceneDetect on the source movie → list of raw scene boundaries
|
||||
2. For each scene, extract the midpoint frame and fingerprint it
|
||||
3. Optionally run Whisper dialogue on each scene (injected as dependency)
|
||||
4. Persist results to .cache/ as JSON for fast re-runs
|
||||
|
||||
Returns: list[Scene] with luma_hist, sat_hist, phash populated.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from typing import Callable, Sequence
|
||||
|
||||
import numpy as np
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import Scene
|
||||
from src.cv.fingerprinting import fingerprint_frame
|
||||
from src.cv.frame_extractor import grab_midpoint_frame, open_video
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Type alias for an optional dialogue-injection callback
|
||||
DialogueCallback = Callable[[Scene], Scene]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cache helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cache_path(cfg: AppConfig) -> Path:
    """Return the scene-index JSON path, creating the cache directory if needed."""
    cache_dir = cfg.paths.cache_dir
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir / "scene_index.json"
|
||||
|
||||
|
||||
def _scene_to_dict(s: Scene) -> dict:
    """
    Flatten a Scene into a JSON-serialisable dict.

    The source path is stored as a string and both histograms as hex text
    so that ``json.dumps`` can handle them. A missing or empty histogram is
    stored as None.
    """
    def _as_hex(blob):
        # Falsy values (None, b"") are written as null.
        return blob.hex() if blob else None

    record = {
        "scene_id": s.scene_id,
        "source_path": str(s.source_path),
    }
    for field in ("start_s", "end_s", "start_frame", "end_frame"):
        record[field] = getattr(s, field)
    record["luma_hist"] = _as_hex(s.luma_hist)
    record["sat_hist"] = _as_hex(s.sat_hist)
    record["phash"] = s.phash
    return record
|
||||
|
||||
|
||||
def _scene_from_dict(d: dict) -> Scene:
    """
    Rebuild a Scene from a dict produced by the cache serialiser.

    Hex-encoded histograms are decoded back to bytes; absent or empty
    entries become None. ``phash`` is optional, every other key is required.

    Raises:
        KeyError: If one of the required keys is missing from *d*.
    """
    def _from_hex(key):
        encoded = d.get(key)
        return bytes.fromhex(encoded) if encoded else None

    return Scene(
        scene_id=d["scene_id"],
        source_path=Path(d["source_path"]),
        start_s=d["start_s"],
        end_s=d["end_s"],
        start_frame=d["start_frame"],
        end_frame=d["end_frame"],
        luma_hist=_from_hex("luma_hist"),
        sat_hist=_from_hex("sat_hist"),
        phash=d.get("phash"),
    )
|
||||
|
||||
|
||||
def _save_cache(scenes: list[Scene], cfg: AppConfig) -> None:
    """Serialise *scenes* to the scene-index cache file as indented UTF-8 JSON."""
    payload = json.dumps([_scene_to_dict(scene) for scene in scenes], indent=2)
    target = _cache_path(cfg)
    target.write_text(payload, encoding="utf-8")
    logger.info("Scene index cached → %s (%d scenes)", target, len(scenes))
|
||||
|
||||
|
||||
def _load_cache(cfg: AppConfig) -> list[Scene] | None:
    """
    Load the cached scene index, if a usable one exists.

    Returns:
        The cached scenes, or None when the cache file is missing, cannot
        be parsed, or was built for a source movie other than
        ``cfg.paths.source_movie``. None tells the caller to re-index.
    """
    p = _cache_path(cfg)
    if not p.exists():
        return None

    try:
        data = json.loads(p.read_text(encoding="utf-8"))
        scenes = [_scene_from_dict(d) for d in data]
    except Exception as exc:
        # Best effort: any read or parse problem just means "re-index".
        logger.warning("Cache corrupt, re-indexing: %s", exc)
        return None

    # The cache file name does not depend on the source movie, so an index
    # left over from a different movie would otherwise be returned as valid.
    expected_source = Path(cfg.paths.source_movie)
    if any(Path(scene.source_path) != expected_source for scene in scenes):
        logger.warning(
            "Cache %s was built for a different source movie, re-indexing.", p
        )
        return None

    logger.info("Loaded %d scenes from cache (%s)", len(scenes), p)
    return scenes
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# PySceneDetect integration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _detect_scenes_pyscenedetect(cfg: AppConfig) -> list[tuple[float, float, int, int]]:
    """
    Run PySceneDetect ContentDetector on the source movie.

    The detector threshold comes from ``cfg.scene_detection.content_threshold``.
    The minimum scene length is ``cfg.scene_detection.min_scene_duration_s``
    converted to frames using the video's frame rate (truncated to int).

    Returns:
        List of (start_s, end_s, start_frame, end_frame) tuples, one per
        detected scene, in the order PySceneDetect reports them.

    Raises:
        ImportError: If the ``scenedetect`` package cannot be imported.
    """
    try:
        from scenedetect import open_video as sd_open_video, SceneManager
        from scenedetect.detectors import ContentDetector
    except ImportError as exc:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError(
            "scenedetect is not installed. Run: pip install scenedetect[opencv]"
        ) from exc

    video = sd_open_video(str(cfg.paths.source_movie))
    manager = SceneManager()
    manager.add_detector(
        ContentDetector(
            threshold=cfg.scene_detection.content_threshold,
            min_scene_len=int(
                cfg.scene_detection.min_scene_duration_s
                * video.frame_rate
            ),
        )
    )

    logger.info("Detecting scenes in %s …", cfg.paths.source_movie.name)
    manager.detect_scenes(video=video, show_progress=True)

    result: list[tuple[float, float, int, int]] = [
        (
            start_tc.get_seconds(),
            end_tc.get_seconds(),
            start_tc.get_frames(),
            end_tc.get_frames(),
        )
        for start_tc, end_tc in manager.get_scene_list()
    ]

    logger.info("PySceneDetect found %d scenes.", len(result))
    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fingerprint enrichment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _fingerprint_scenes(
    raw_scenes: list[tuple[float, float, int, int]],
    cfg: AppConfig,
) -> list[Scene]:
    """
    Turn raw scene boundaries into Scene objects, fingerprinting each one.

    For every boundary the midpoint frame is grabbed from the source movie
    and passed to fingerprint_frame(). If the frame cannot be read, the
    Scene is still emitted, just without fingerprint fields, and a warning
    is logged. Scene ids are the positions in *raw_scenes*.
    """
    vibe_cfg = cfg.cv.vibe_check
    total = len(raw_scenes)
    fingerprinted: list[Scene] = []

    logger.info("Fingerprinting %d scenes …", total)

    with open_video(cfg.paths.source_movie) as cap:
        for idx, (start_s, end_s, start_frame, end_frame) in enumerate(raw_scenes):
            # Fields shared by fingerprinted and non-fingerprinted scenes.
            base_fields = dict(
                scene_id=idx,
                source_path=cfg.paths.source_movie,
                start_s=start_s,
                end_s=end_s,
                start_frame=start_frame,
                end_frame=end_frame,
            )

            frame = grab_midpoint_frame(cap, start_s, end_s)
            if frame is None:
                logger.warning("Scene %d: midpoint frame decode failed, skipping fingerprint.", idx)
                fingerprinted.append(Scene(**base_fields))
                continue

            luma, sat, phash = fingerprint_frame(frame, vibe_cfg)
            fingerprinted.append(
                Scene(**base_fields, luma_hist=luma, sat_hist=sat, phash=phash)
            )

            # Progress is only reported on the success path.
            done = idx + 1
            if done % 50 == 0:
                logger.info(" … %d / %d scenes fingerprinted", done, total)

    return fingerprinted
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_scene_index(
    cfg: AppConfig,
    force_reindex: bool = False,
    dialogue_callback: DialogueCallback | None = None,
) -> list[Scene]:
    """
    Return the scene index for the source movie, from cache when possible.

    Flow:
        1. Unless *force_reindex* is set, try the on-disk cache first.
        2. On a cache miss (or forced re-index): detect scenes, fingerprint
           them, and write the result to the cache.
        3. If a *dialogue_callback* is given, apply it to every scene.
           This happens after caching, so callback results are not cached.

    Args:
        cfg:               Application configuration.
        force_reindex:     Skip the cache lookup and rebuild the index.
        dialogue_callback: Optional Scene → Scene function applied to each
                           scene before returning.

    Returns:
        List of Scene objects.
    """
    scenes = None if force_reindex else _load_cache(cfg)

    if scenes is None:
        boundaries = _detect_scenes_pyscenedetect(cfg)
        scenes = _fingerprint_scenes(boundaries, cfg)
        _save_cache(scenes, cfg)

    if dialogue_callback:
        scenes = list(map(dialogue_callback, scenes))

    return scenes
|
||||
@@ -0,0 +1,190 @@
|
||||
"""
|
||||
src/cv/vibe_check.py — Phase 1: Scene-level histogram / pHash filter
|
||||
|
||||
Responsibility:
|
||||
Given ONE TrailerBeat (with pre-computed fingerprints) and a list of
|
||||
source Scenes (also fingerprinted), return the Top-K candidates ranked
|
||||
by a combined histogram + pHash score.
|
||||
|
||||
This module contains ZERO file I/O and ZERO frame decoding — those live
|
||||
in the pipeline layer. Input = model objects, output = sorted VibeHit list.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import replace
|
||||
from typing import Sequence
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from src.core.models import Scene, TrailerBeat, VibeHit
|
||||
from src.cv.fingerprinting import bytes_to_hist, phash_distance
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scoring
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Weight applied to histogram score vs pHash score in the combined metric.
|
||||
# pHash gets less weight because it's sensitive to text overlays on source.
|
||||
_HIST_WEIGHT = 0.70
|
||||
_PHASH_WEIGHT = 0.30
|
||||
_PHASH_MAX_BITS = 64 # maximum possible Hamming distance
|
||||
|
||||
|
||||
def _hist_combined_score(
    beat: TrailerBeat,
    scene: Scene,
    hist_method: int,
) -> float:
    """
    Mean of the luma and saturation histogram comparison scores.

    Both histogram pairs are compared with ``cv2.compareHist`` using
    *hist_method*. When the method is HISTCMP_BHATTACHARYYA the distance is
    turned into a similarity via ``1 - distance``; every other method's
    value is used as returned.

    NOTE(review): other distance-style methods are not inverted here;
    confirm callers only pass similarity methods or BHATTACHARYYA.

    Returns:
        The averaged score, or 0.0 if any of the four histograms is None.
    """
    pairs = (
        (beat.luma_hist, scene.luma_hist),
        (beat.sat_hist, scene.sat_hist),
    )
    if any(hist is None for pair in pairs for hist in pair):
        return 0.0

    scores = [
        cv2.compareHist(bytes_to_hist(beat_hist), bytes_to_hist(scene_hist), hist_method)
        for beat_hist, scene_hist in pairs
    ]

    if hist_method == cv2.HISTCMP_BHATTACHARYYA:
        # Distance → similarity.
        scores = [1.0 - float(value) for value in scores]

    luma_score, sat_score = scores
    return float((luma_score + sat_score) / 2.0)
|
||||
|
||||
|
||||
def _phash_score(beat: TrailerBeat, scene: Scene) -> float:
    """
    Map the pHash Hamming distance between beat and scene to a similarity.

    The result is ``1 - distance / _PHASH_MAX_BITS``: distance 0 gives 1.0
    and a distance equal to _PHASH_MAX_BITS gives 0.0. If either side has
    no pHash the score is 0.0.
    """
    if beat.phash is not None and scene.phash is not None:
        hamming = phash_distance(beat.phash, scene.phash)
        return 1.0 - (hamming / _PHASH_MAX_BITS)
    return 0.0
|
||||
|
||||
|
||||
def _combined_score(
    beat: TrailerBeat,
    scene: Scene,
    hist_method: int,
) -> float:
    """Blend histogram and pHash similarity using the module-level weights."""
    weighted_hist = _HIST_WEIGHT * _hist_combined_score(beat, scene, hist_method)
    weighted_phash = _PHASH_WEIGHT * _phash_score(beat, scene)
    return weighted_hist + weighted_phash
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_vibe_check(
    beat: TrailerBeat,
    scenes: Sequence[Scene],
    top_k: int,
    hist_method: int,
    phash_max_distance: int,
) -> list[VibeHit]:
    """
    Phase 1: Score all source scenes against one trailer beat and return
    the top-K candidates for Deep Scan.

    Args:
        beat:               The trailer beat to match (must have fingerprints).
        scenes:             All detected scenes from the source movie.
        top_k:              Maximum number of candidates to return. Values
                            below zero are treated as zero.
        hist_method:        cv2.HISTCMP_* constant (e.g. 0 = CORREL).
        phash_max_distance: Scenes with pHash Hamming distance > this value
                            are excluded before ranking (hard filter). The
                            filter only applies when both beat and scene
                            carry a pHash.

    Returns:
        List of VibeHit, sorted by combined_score descending, length ≤ top_k.
        Empty list if beat has no fingerprints or no scenes pass the filter.
    """
    if beat.luma_hist is None and beat.phash is None:
        logger.warning(
            "Beat %d has no fingerprints — skipping Vibe Check.", beat.beat_id
        )
        return []

    candidates: list[VibeHit] = []

    for scene in scenes:
        # Compute the Hamming distance once and reuse it for the hard filter,
        # the pHash score and the reported distance, so all three agree.
        dist = (
            phash_distance(beat.phash, scene.phash)
            if beat.phash and scene.phash
            else None
        )

        # Hard pHash filter: skip scenes that are too visually distant.
        if dist is not None and dist > phash_max_distance:
            continue  # fast rejection — avoids full histogram compare

        hist = _hist_combined_score(beat, scene, hist_method)
        phash = 0.0 if dist is None else 1.0 - (dist / _PHASH_MAX_BITS)
        combined = _HIST_WEIGHT * hist + _PHASH_WEIGHT * phash

        candidates.append(VibeHit(
            beat_id=beat.beat_id,
            scene_id=scene.scene_id,
            hist_score=round(hist, 4),
            # With no pHash on either side, report the maximum distance.
            phash_distance=_PHASH_MAX_BITS if dist is None else dist,
            combined_score=round(combined, 4),
        ))

    # Sort by combined score, descending; return top-K.
    candidates.sort(key=lambda h: h.combined_score, reverse=True)
    # Clamp so a negative top_k cannot drop trailing items via slice semantics.
    top = candidates[:max(top_k, 0)]

    logger.info(
        "Vibe Check beat=%d: %d scenes scored, %d candidates forwarded to Deep Scan. "
        "Best score: %.3f (scene %s)",
        beat.beat_id,
        len(candidates),
        len(top),
        top[0].combined_score if top else 0.0,
        top[0].scene_id if top else "—",
    )

    return top
|
||||
|
||||
|
||||
def batch_vibe_check(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    top_k: int,
    hist_method: int,
    phash_max_distance: int,
) -> dict[int, list[VibeHit]]:
    """
    Apply run_vibe_check() to each beat and collect the results by beat id.

    Returns:
        Mapping beat_id → list of VibeHit. If two beats share an id, the
        later one overwrites the earlier entry.
    """
    hits_by_beat: dict[int, list[VibeHit]] = {}
    for beat in beats:
        hits_by_beat[beat.beat_id] = run_vibe_check(
            beat, scenes, top_k, hist_method, phash_max_distance
        )
    return hits_by_beat
|
||||
@@ -0,0 +1 @@
|
||||
# src.export package — FCPXML / EDL export
|
||||
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
src/export/edl_writer.py — EditTimeline → CMX 3600 EDL
|
||||
|
||||
Generates a standard CMX 3600 Edit Decision List compatible with
|
||||
Avid, DaVinci Resolve, Premiere Pro, and most NLEs.
|
||||
|
||||
CMX 3600 format reference:
|
||||
https://en.wikipedia.org/wiki/Edit_decision_list#CMX_3600
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import EditClip, EditTimeline
|
||||
from src.export.timecode import seconds_to_smpte
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# EDL line builders
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _edl_header(title: str) -> str:
    """Return the two-line EDL preamble: the title and the non-drop-frame marker."""
    header_lines = (f"TITLE: {title}", "FCM: NON-DROP FRAME")
    return "".join(line + "\n" for line in header_lines)
|
||||
|
||||
|
||||
def _edl_event(
    event_num: int,
    clip: EditClip,
    fps: float,
) -> str:
    """
    Render one clip as a three-line EDL event block.

    Line 1 is the event itself (zero-padded number, reel "AX", video track,
    cut, then source in/out and record in/out timecodes). Line 2 names the
    source file and line 3 carries beat id, beat type and match score as a
    comment. Source and record ranges both span the clip's
    ``source_timeline_duration_s``. The block ends with a newline.
    """
    def _tc(seconds: float) -> str:
        return seconds_to_smpte(seconds, fps)

    source_in_s = clip.match.in_point_s
    matched_s = clip.source_timeline_duration_s
    record_in_s = clip.timeline_start_s

    timecodes = " ".join((
        _tc(source_in_s),
        _tc(source_in_s + matched_s),
        _tc(record_in_s),
        _tc(record_in_s + matched_s),
    ))

    rows = (
        f"{event_num:03d} AX V C {timecodes}",
        f"* FROM CLIP NAME: {clip.match.source_path.name}",
        f"* BEAT {clip.beat.beat_id:03d} | {clip.beat.beat_type.name} | "
        f"score={clip.match.match_score:.3f}",
    )
    return "".join(f"{row}\n" for row in rows)
|
||||
|
||||
|
||||
def _edl_black_tail_event(event_num: int, clip: EditClip, fps: float) -> str:
    """
    Render a black ("BL") event covering the trailer-only tail of *clip*.

    The record range runs from the end of the matched source portion
    (timeline start + ``source_timeline_duration_s``) to the clip's
    ``timeline_end_s``; the source timecodes are fixed at zero. The block
    ends with a newline.
    """
    tail_start_s = clip.timeline_start_s + clip.source_timeline_duration_s
    record_in = seconds_to_smpte(tail_start_s, fps)
    record_out = seconds_to_smpte(clip.timeline_end_s, fps)

    rows = (
        f"{event_num:03d} BL V C 00:00:00:00 00:00:00:00 {record_in} {record_out}",
        "* FROM CLIP NAME: BLACK",
        f"* BEAT {clip.beat.beat_id:03d} TRAILER-ONLY TAIL | "
        "add fade/dissolve to black",
    )
    return "".join(f"{row}\n" for row in rows)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def write_edl(
    timeline: EditTimeline,
    cfg: AppConfig,
    output_path: Path | None = None,
) -> Path:
    """
    Write the EditTimeline as a CMX 3600 EDL file.

    Clips are emitted in ``clip_index`` order. A clip whose
    ``trailer_tail_s`` is positive is followed by an extra black event, so
    the file can contain more events than the timeline has clips.

    Args:
        timeline:    EditTimeline from build_timeline().
        cfg:         Application configuration.
        output_path: Override destination. Defaults to
                     <output_dir>/<timeline.title>.edl.

    Returns:
        Path to the written .edl file.
    """
    if output_path is None:
        output_path = cfg.paths.output_dir / f"{timeline.title}.edl"

    output_path.parent.mkdir(parents=True, exist_ok=True)

    fps = timeline.frame_rate
    lines = [_edl_header(timeline.title), "\n"]

    event_num = 1
    for clip in sorted(timeline.clips, key=lambda c: c.clip_index):
        lines.append(_edl_event(event_num, clip, fps))
        event_num += 1
        if clip.trailer_tail_s > 0:
            lines.append("\n")
            lines.append(_edl_black_tail_event(event_num, clip, fps))
            event_num += 1
        lines.append("\n")

    edl_text = "\n".join(lines)
    output_path.write_text(edl_text, encoding="utf-8")

    # event_num is the next free number, so events written = event_num - 1.
    # This differs from the clip count whenever black-tail events were added.
    logger.info("EDL written → %s (%d events)", output_path, event_num - 1)
    return output_path
|
||||
@@ -0,0 +1,222 @@
|
||||
"""
|
||||
src/export/fcpxml_writer.py — EditTimeline → Final Cut Pro XML (FCPXML 1.10)
|
||||
|
||||
Generates a standards-compliant FCPXML file that can be imported directly
|
||||
into Final Cut Pro X, DaVinci Resolve, or Premiere Pro (via FCPXML plugin).
|
||||
|
||||
Spec reference: https://developer.apple.com/documentation/professional_video_applications/fcpxml_reference
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
from xml.etree import ElementTree as ET
|
||||
from xml.etree.ElementTree import Element, SubElement
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import EditClip, EditTimeline
|
||||
from src.export.timecode import (
|
||||
fcpxml_format_name,
|
||||
fcpxml_frame_duration,
|
||||
seconds_to_fcpxml,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Asset registry — one <asset> per unique source file
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _AssetRegistry:
    """Hands out one stable resource id ("r2", "r3", …) per distinct source path."""

    def __init__(self) -> None:
        self._assets: dict[Path, str] = {}  # path → asset id
        self._counter = 2  # r1 reserved for format

    def get_or_create(self, path: Path) -> str:
        """Return the id already assigned to *path*, assigning the next one if new."""
        try:
            return self._assets[path]
        except KeyError:
            asset_id = f"r{self._counter}"
            self._counter += 1
            self._assets[path] = asset_id
            return asset_id

    @property
    def items(self) -> dict[Path, str]:
        """A shallow copy of the path → id mapping, in registration order."""
        return self._assets.copy()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Builder
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _path_to_url(path: Path) -> str:
    """
    Build a ``file://`` URL from *path*.

    The POSIX form of the path is percent-encoded, keeping ``/``, ``:`` and
    ``@`` literal. A path whose POSIX form does not start with a slash
    (e.g. a Windows drive path such as ``C:/foo``) gets one prepended.
    """
    location = path.as_posix()
    prefix = "" if location.startswith("/") else "/"
    return f"file://{quote(prefix + location, safe='/:@')}"
|
||||
|
||||
|
||||
def build_fcpxml(
    timeline: EditTimeline,
    cfg: AppConfig,
    source_duration_s: float = 7200.0,  # 2-hour fallback if not probed
) -> ET.ElementTree:
    """
    Build a complete FCPXML ElementTree from an EditTimeline.

    Document layout: one <format> (id "r1", fixed 1920x1080 Rec. 709), one
    <asset> per distinct source path, then library → event → project →
    sequence → spine. Each clip becomes a <clip> on the spine; a clip with
    a positive ``trailer_tail_s`` is followed by a <gap> carrying a marker.

    Args:
        timeline:          Ordered sequence of EditClips.
        cfg:               Application configuration.
        source_duration_s: Fallback duration for an <asset> whose real
                           duration cannot be probed.

    Returns:
        xml.etree.ElementTree.ElementTree — call .write() to serialise.
    """
    fps = timeline.frame_rate

    # ---- root ---------------------------------------------------------------
    root = Element("fcpxml", version=cfg.export.fcpxml_version)
    root.set("xmlns", "http://www.apple.com/dt/FCPXML/1_10")

    # ---- resources ----------------------------------------------------------
    resources = SubElement(root, "resources")

    format_id = "r1"
    SubElement(resources, "format",
        id=format_id,
        name=fcpxml_format_name(fps),
        frameDuration=fcpxml_frame_duration(fps),
        width="1920",
        height="1080",
        colorSpace="1-1-1 (Rec. 709)",
    )

    registry = _AssetRegistry()

    # Register every source path up front so all <asset> elements are
    # written into <resources> before the <library> block is created.
    for clip in timeline.clips:
        registry.get_or_create(clip.match.source_path)

    for path, rid in registry.items.items():
        # Probe the real duration when possible. The import stays inside the
        # try so a missing CV dependency also falls back instead of failing.
        try:
            from src.cv.frame_extractor import get_video_info
            dur_s = float(get_video_info(path)["duration_s"])
        except Exception as exc:
            logger.warning(
                "Could not probe duration of %s (%s); using fallback %.1fs",
                path, exc, source_duration_s,
            )
            dur_s = source_duration_s

        SubElement(resources, "asset",
            id=rid,
            name=path.stem,
            src=_path_to_url(path),
            start="0s",
            duration=seconds_to_fcpxml(dur_s, fps),
            hasVideo="1",
            hasAudio="1",
            format=format_id,
        )

    # ---- library / event / project ------------------------------------------
    library = SubElement(root, "library")
    event = SubElement(library, "event", name=timeline.title)
    project = SubElement(event, "project", name=timeline.title)
    sequence = SubElement(project, "sequence",
        duration=seconds_to_fcpxml(timeline.total_duration_s, fps),
        format=format_id,
        tcStart="0s",
        tcFormat="NDF",
        audioLayout="stereo",
        audioRate="48k",
    )
    spine = SubElement(sequence, "spine")

    # ---- clips --------------------------------------------------------------
    for clip in sorted(timeline.clips, key=lambda c: c.clip_index):
        asset_id = registry.get_or_create(clip.match.source_path)

        # Separate local so the source_duration_s parameter is not overwritten.
        matched_s = clip.source_timeline_duration_s
        clip_elem = SubElement(spine, "clip",
            name=f"Beat_{clip.beat.beat_id:03d}_{clip.beat.beat_type.name}",
            ref=asset_id,
            # offset = position on the timeline
            offset=seconds_to_fcpxml(clip.timeline_start_s, fps),
            # duration = matched source part only; trailer-only tails become gaps.
            duration=seconds_to_fcpxml(matched_s, fps),
            # start = in-point inside the source asset
            start=seconds_to_fcpxml(clip.match.in_point_s, fps),
        )

        # Inline audio role
        SubElement(clip_elem, "audio",
            role="dialogue",
            srcCh="1, 2",
            outCh="L, R",
        )

        if clip.trailer_tail_s > 0:
            gap = SubElement(spine, "gap",
                name=f"Beat_{clip.beat.beat_id:03d}_TRAILER_TAIL_BLACK_FADE",
                offset=seconds_to_fcpxml(clip.timeline_start_s + matched_s, fps),
                duration=seconds_to_fcpxml(clip.trailer_tail_s, fps),
                start="0s",
            )
            SubElement(gap, "marker",
                start="0s",
                value="Trailer-only tail: add fade/dissolve to black here",
                completed="0",
            )

    return ET.ElementTree(root)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Writer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def write_fcpxml(
    timeline: EditTimeline,
    cfg: AppConfig,
    output_path: Path | None = None,
) -> Path:
    """
    Build the FCPXML document for *timeline* and write it to disk.

    The XML declaration and a bare ``<!DOCTYPE fcpxml>`` line are prepended
    by hand because ElementTree cannot emit a DOCTYPE.

    Args:
        timeline:    EditTimeline from build_timeline().
        cfg:         Application configuration.
        output_path: Override destination. Defaults to
                     <output_dir>/<timeline.title>.fcpxml.

    Returns:
        Path to the written .fcpxml file.
    """
    destination = output_path
    if destination is None:
        destination = cfg.paths.output_dir / f"{timeline.title}.fcpxml"
    destination.parent.mkdir(parents=True, exist_ok=True)

    root = build_fcpxml(timeline, cfg).getroot()
    # encoding="unicode" makes tostring() return str, not bytes.
    body = ET.tostring(root, encoding="unicode", xml_declaration=False)
    prolog = '<?xml version="1.0" encoding="UTF-8"?>\n' + '<!DOCTYPE fcpxml>\n'

    destination.write_text(prolog + body, encoding="utf-8")

    logger.info("FCPXML written → %s (%d clips)", destination, timeline.clip_count)
    return destination
|
||||
@@ -0,0 +1,146 @@
|
||||
"""
|
||||
src/export/timecode.py — Timecode / rational-time conversion helpers
|
||||
|
||||
FCPXML uses rational fractions ("1001/24000s") for all time values.
|
||||
EDL uses SMPTE timecode strings ("HH:MM:SS:FF").
|
||||
|
||||
All conversion functions are pure — no I/O, no state.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from fractions import Fraction
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Common frame-rate denominators
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_FPS_RATIONAL: dict[float, tuple[int, int]] = {
    23.976: (24000, 1001),
    24.0: (24, 1),
    25.0: (25, 1),
    29.97: (30000, 1001),
    30.0: (30, 1),
    50.0: (50, 1),
    59.94: (60000, 1001),
    60.0: (60, 1),
}

_TOLERANCE = 0.01  # fps match tolerance


def _fps_to_rational(fps: float) -> tuple[int, int]:
    """
    Return fps as a (numerator, denominator) pair.

    Rates within _TOLERANCE of an entry in _FPS_RATIONAL map to that entry's
    exact rational. Anything else is approximated with a denominator of at
    most 1001.
    """
    for ref_fps, rational in _FPS_RATIONAL.items():
        if abs(fps - ref_fps) < _TOLERANCE:
            return rational
    # Fallback: closest fraction with a bounded denominator.
    f = Fraction(fps).limit_denominator(1001)
    return f.numerator, f.denominator


# ---------------------------------------------------------------------------
# Seconds → FCPXML rational string
# ---------------------------------------------------------------------------

def seconds_to_fcpxml(seconds: float, fps: float) -> str:
    """
    Convert *seconds* to an FCPXML rational time string.

    The time is snapped to the nearest whole frame at *fps* and expressed
    as a reduced fraction of seconds.
    Example: 10.0s @23.976fps → 240 frames → "1001/100s".

    Args:
        seconds: Time in seconds (float).
        fps:     Project frame rate.

    Returns:
        FCPXML time string, e.g. "1001/100s". Returns "0s" when *seconds*
        is zero or rounds to zero frames.
    """
    if seconds == 0.0:
        return "0s"

    num, den = _fps_to_rational(fps)  # frames per second = num/den
    # seconds × (num/den) = frames (float); round to nearest frame
    frames = round(seconds * num / den)
    if frames == 0:
        # Sub-frame durations would otherwise be emitted as "0/1s".
        return "0s"

    # frames ÷ (num/den) = frames × den/num → rational seconds
    total_num = frames * den
    total_den = num
    g = math.gcd(total_num, total_den)
    return f"{total_num // g}/{total_den // g}s"
|
||||
|
||||
|
||||
def seconds_to_frame_count(seconds: float, fps: float) -> int:
    """Return the frame count nearest to ``seconds * fps``."""
    exact_frames = seconds * fps
    return round(exact_frames)


# ---------------------------------------------------------------------------
# Seconds → SMPTE timecode (for EDL)
# ---------------------------------------------------------------------------

def seconds_to_smpte(seconds: float, fps: float, drop_frame: bool = False) -> str:
    """
    Format *seconds* as a non-drop-frame SMPTE timecode "HH:MM:SS:FF".

    The time is first converted to a frame count at *fps*. That count is
    then split into fields using the rounded (nominal) rate, so 23.976 is
    counted as 24 frames per timecode second.

    Args:
        seconds:    Time in float seconds.
        fps:        Frame rate (23.976, 24, 25, etc.).
        drop_frame: Accepted but ignored; output is always NDF (":").

    Returns:
        "HH:MM:SS:FF" string.
    """
    frame_total = seconds_to_frame_count(seconds, fps)
    frames_per_tc_second = round(fps)  # e.g. 23.976 → 24

    whole_seconds, ff = divmod(frame_total, frames_per_tc_second)
    whole_minutes, ss = divmod(whole_seconds, 60)
    hh, mm = divmod(whole_minutes, 60)

    return f"{hh:02d}:{mm:02d}:{ss:02d}:{ff:02d}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FCPXML format ID helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fcpxml_format_name(fps: float, width: int = 1920, height: int = 1080) -> str:
    """
    Build the FCPXML format name for a frame rate and resolution.

    Known frame rates map to a fixed suffix; any other rate falls back to
    ``int(fps * 100)`` rendered as text. *width* is accepted but does not
    influence the result.

    Example: fps=23.976, 1080p → "FFVideoFormat1080p2398"
    """
    known_rate_suffixes = {
        23.976: "2398",
        24.0: "24",
        25.0: "25",
        29.97: "2997",
        30.0: "30",
        50.0: "50",
        59.94: "5994",
        60.0: "60",
    }
    if fps in known_rate_suffixes:
        rate_suffix = known_rate_suffixes[fps]
    else:
        rate_suffix = str(int(fps * 100))

    resolution = f"{height}p"
    return "FFVideoFormat" + resolution + rate_suffix
|
||||
|
||||
|
||||
def fcpxml_frame_duration(fps: float) -> str:
    """
    Return the FCPXML ``frameDuration`` attribute value for *fps*.

    One frame lasts 1/fps seconds. With fps expressed as num/den, that is
    den/num seconds, emitted as a reduced fraction.

    Example: 23.976fps → num=24000, den=1001 → "1001/24000s"
    """
    rate_num, rate_den = _fps_to_rational(fps)  # fps = rate_num / rate_den
    divisor = math.gcd(rate_den, rate_num)
    duration_num = rate_den // divisor
    duration_den = rate_num // divisor
    return f"{duration_num}/{duration_den}s"
|
||||
@@ -0,0 +1 @@
|
||||
# src.llm package — Thematic segmentation / dramaturgy (NO vision matching)
|
||||
@@ -0,0 +1,202 @@
|
||||
"""
|
||||
src/llm/dramaturg.py — LLM-based thematic beat classification (OpenRouter)
|
||||
|
||||
Responsibility:
|
||||
- Receive a list of TrailerBeat objects (with dialogue lines attached)
|
||||
- Send a single structured prompt to the LLM
|
||||
- Parse the JSON response to assign BeatType to each beat
|
||||
|
||||
IMPORTANT: This module does ZERO visual analysis.
|
||||
It classifies narrative dramaturgy from dialogue text only.
|
||||
Visual matching is handled exclusively by the CV engine.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import replace
|
||||
from typing import Sequence
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import BeatType, TrailerBeat
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prompt builder
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_SYSTEM_PROMPT = """You are a film trailer editor and narrative analyst.
|
||||
Your task is to classify each beat of a trailer into one of these dramatic roles:
|
||||
HOOK - Opening attention grabber (first impression, shocking image, logo)
|
||||
SETUP - World/character introduction
|
||||
CONFLICT - Inciting incident, rising tension, threat revealed
|
||||
CLIMAX - Peak action/emotion, highest stakes
|
||||
RESOLUTION - Cool-down, tagline, final title card
|
||||
|
||||
You will receive a JSON array of beats with their index and dialogue text.
|
||||
Respond ONLY with a valid JSON array, one object per beat, with keys:
|
||||
"beat_id" (int) and "beat_type" (one of the strings above).
|
||||
Do NOT include any explanation or markdown fences."""
|
||||
|
||||
_USER_TEMPLATE = """Classify the following {n} trailer beats:
|
||||
|
||||
{beats_json}"""
|
||||
|
||||
|
||||
def _build_beats_payload(beats: Sequence[TrailerBeat]) -> str:
    """
    Serialise *beats* into the JSON array embedded in the user prompt.

    Each entry carries the beat id, its duration rounded to two decimals and
    the beat's dialogue lines joined with " / " (or a placeholder when the
    beat has no dialogue text).
    """

    def _entry(beat) -> dict:
        joined = " / ".join(line.text for line in beat.dialogue)
        return {
            "beat_id": beat.beat_id,
            "duration": round(beat.duration_s, 2),
            "dialogue": joined if joined else "(no dialogue)",
        }

    entries = [_entry(beat) for beat in beats]
    return json.dumps(entries, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OpenRouter / OpenAI-compatible HTTP client
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _call_llm(prompt_user: str, cfg: AppConfig) -> str:
    """
    Send a chat completion request to the configured LLM provider.

    Supports: openrouter, openai, ollama (all use the OpenAI-compatible API).

    Args:
        prompt_user: Text sent as the user message; the system message is
                     always ``_SYSTEM_PROMPT``.
        cfg:         Application configuration; only ``cfg.llm`` is read
                     (provider, api_key, model, temperature, max_tokens,
                     base_url, timeout_seconds).

    Returns:
        The raw text content of the first assistant message.

    Raises:
        RuntimeError: On a missing API key, HTTP errors, connection-level
            failures, or a response without ``choices[0].message.content``.
    """
    import urllib.error
    import urllib.request

    llm = cfg.llm

    if llm.provider in ("openrouter", "openai") and not llm.api_key:
        raise RuntimeError(
            f"LLM provider is '{llm.provider}' but no API key found. "
            "Set OPENROUTER_API_KEY (or OPENAI_API_KEY) in your .env file."
        )

    headers = {"Content-Type": "application/json"}
    # Key-less providers (e.g. a local ollama) must not receive a bogus
    # "Bearer None" / "Bearer " credential.
    if llm.api_key:
        headers["Authorization"] = f"Bearer {llm.api_key}"
    if llm.provider == "openrouter":
        headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
        headers["X-Title"] = "AI Trailer Generator v2"

    body = json.dumps({
        "model": llm.model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": prompt_user},
        ],
        "temperature": llm.temperature,
        "max_tokens": llm.max_tokens,
    }).encode("utf-8")

    url = f"{llm.base_url.rstrip('/')}/chat/completions"

    req = urllib.request.Request(url, data=body, headers=headers, method="POST")

    try:
        with urllib.request.urlopen(req, timeout=llm.timeout_seconds) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        # HTTPError is a URLError subclass, so it has to be handled first.
        body_text = exc.read().decode(errors="replace")
        raise RuntimeError(
            f"LLM HTTP {exc.code} from {url}:\n{body_text}"
        ) from exc
    except urllib.error.URLError as exc:
        # Connection refused, DNS failure, TLS problems, ...
        raise RuntimeError(f"LLM request to {url} failed: {exc.reason}") from exc

    try:
        return data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        # A 200 response that does not follow the chat-completions shape.
        raise RuntimeError(
            f"Unexpected LLM response shape from {url}: {str(data)[:300]}"
        ) from exc
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Response parser
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_BEAT_TYPE_MAP: dict[str, BeatType] = {bt.name: bt for bt in BeatType}
|
||||
|
||||
|
||||
def _parse_response(raw: str, beats: Sequence[TrailerBeat]) -> dict[int, BeatType]:
    """
    Parse the LLM JSON array response into a beat_id → BeatType mapping.

    Every beat in *beats* starts as BeatType.UNKNOWN. If the response is not
    a JSON array, that default mapping is returned unchanged. Individual
    malformed entries (not an object, missing or non-numeric "beat_id") are
    skipped with a warning so the remaining entries are still applied.
    Unrecognised "beat_type" names map to BeatType.UNKNOWN.

    Args:
        raw:   Raw assistant message text, optionally wrapped in ``` fences.
        beats: Beats that were sent for classification.

    Returns:
        Mapping of beat id to the parsed BeatType. This function does not
        raise on malformed model output.
    """
    # Strip accidental markdown fences
    clean = raw.strip()
    if clean.startswith("```"):
        clean = "\n".join(clean.split("\n")[1:])
    if clean.endswith("```"):
        clean = clean[: clean.rfind("```")]
    clean = clean.strip()

    result: dict[int, BeatType] = {b.beat_id: BeatType.UNKNOWN for b in beats}

    try:
        parsed = json.loads(clean)
        if not isinstance(parsed, list):
            raise ValueError("Expected JSON array at top level.")
    except ValueError as exc:  # includes json.JSONDecodeError
        logger.warning("LLM response parse error (%s) — all beats → UNKNOWN.\nRaw: %s", exc, raw[:300])
        return result

    for item in parsed:
        try:
            bid = int(item["beat_id"])
            name = str(item.get("beat_type", "UNKNOWN")).strip().upper()
        except (KeyError, TypeError, ValueError, AttributeError) as exc:
            # One bad entry must not discard the entries that follow it.
            logger.warning("Skipping malformed LLM beat entry %r (%s).", item, exc)
            continue
        result[bid] = _BEAT_TYPE_MAP.get(name, BeatType.UNKNOWN)

    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def classify_beats(
    beats: Sequence[TrailerBeat],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """
    Ask the LLM for a BeatType per TrailerBeat and return updated copies.

    Args:
        beats: TrailerBeat list (dialogue should be populated for best results).
        cfg:   Application configuration (llm section + api key).

    Returns:
        New list of TrailerBeat objects with beat_type set from the LLM
        answer (BeatType.UNKNOWN where the answer has no usable entry).
        If the LLM call raises, the error is logged and the input beats are
        returned unchanged in a new list — no exception is propagated.
    """
    if not beats:
        return list(beats)

    beat_count = len(beats)
    logger.info(
        "Classifying %d beats via %s / %s …",
        beat_count, cfg.llm.provider, cfg.llm.model,
    )

    beats_json = _build_beats_payload(beats)
    prompt = _USER_TEMPLATE.format(n=beat_count, beats_json=beats_json)

    try:
        raw_response = _call_llm(prompt, cfg)
    except Exception as exc:
        logger.error("LLM classification failed: %s — keeping BeatType.UNKNOWN.", exc)
        return list(beats)

    type_map = _parse_response(raw_response, beats)

    enriched: list[TrailerBeat] = []
    classified = 0
    for beat in beats:
        assigned = type_map.get(beat.beat_id, BeatType.UNKNOWN)
        updated = replace(beat, beat_type=assigned)
        enriched.append(updated)
        if updated.beat_type != BeatType.UNKNOWN:
            classified += 1

    logger.info("Beat classification done: %d / %d classified.", classified, len(beats))
    return enriched
|
||||
@@ -0,0 +1,316 @@
|
||||
"""
|
||||
Cached vision descriptions for ambiguous trailer/source matching.
|
||||
|
||||
This module is deliberately conservative: it never writes a final match and it
|
||||
does not replace CV. It describes a small number of 3-frame beat/scene samples,
|
||||
caches those descriptions, and returns extra source in-point seeds for the CV
|
||||
scanner to verify.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import Sequence
|
||||
|
||||
import cv2
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import Scene, TrailerBeat
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_CACHE_VERSION = 1
|
||||
_STOPWORDS = {
|
||||
"the", "and", "with", "from", "that", "this", "there", "their", "into",
|
||||
"scene", "frame", "image", "shot", "video", "visible", "looks", "appears",
|
||||
"eine", "einer", "einem", "einen", "und", "oder", "mit", "der", "die", "das",
|
||||
}
|
||||
|
||||
_SYSTEM_PROMPT = """You describe film shots for automatic matching.
|
||||
Return only compact JSON with these keys:
|
||||
subject, setting, composition, action_phase, distinctive_objects, lighting_color, negatives.
|
||||
Focus on stable visual facts and spatial layout. Ignore timecode overlays, subtitles, logos, compression, aspect ratio, and color grading differences."""
|
||||
|
||||
|
||||
def _cache_path(cfg: AppConfig) -> Path:
    """Return the JSON file under ``cfg.paths.cache_dir`` that stores vision descriptions."""
    cache_dir = cfg.paths.cache_dir
    return cache_dir / "vision_descriptions.json"
|
||||
|
||||
|
||||
def _load_cache(cfg: AppConfig) -> dict:
    """
    Load the vision description cache, falling back to an empty cache.

    An empty cache ``{"version": _CACHE_VERSION, "items": {}}`` is returned
    when the file does not exist, cannot be read or decoded, is not valid
    JSON, is not a JSON object, has a different version, or has a
    non-dict "items" entry.

    Args:
        cfg: Application configuration; used to locate the cache file.

    Returns:
        The cache dict with "version" and "items" keys.
    """
    path = _cache_path(cfg)
    if not path.exists():
        return {"version": _CACHE_VERSION, "items": {}}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Vision cache is unreadable; rebuilding: %s", path)
        return {"version": _CACHE_VERSION, "items": {}}
    # Valid JSON that is not an object (e.g. a list or null) has no .get().
    if not isinstance(data, dict):
        return {"version": _CACHE_VERSION, "items": {}}
    if data.get("version") != _CACHE_VERSION or not isinstance(data.get("items"), dict):
        return {"version": _CACHE_VERSION, "items": {}}
    return data
|
||||
|
||||
|
||||
def _save_cache(cfg: AppConfig, cache: dict) -> None:
    """Write *cache* as indented UTF-8 JSON to the cache file, creating parent directories."""
    target = _cache_path(cfg)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(cache, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def _sample_times(start_s: float, end_s: float) -> list[float]:
    """
    Return three sample timestamps (near start, middle, near end) of a span.

    The span length is clamped to at least 0.04 s. The first sample sits 12%
    into the span (but no later than 0.04 s before its end); the last sample
    sits 12% of the span — at most 0.20 s — before its end.
    """
    span_s = max(0.04, end_s - start_s)
    edge_margin_s = span_s * 0.12

    head_offset = min(edge_margin_s, max(0.0, span_s - 0.04))
    mid_offset = span_s * 0.50
    tail_offset = max(0.0, span_s - min(edge_margin_s, 0.20))

    return [start_s + offset for offset in (head_offset, mid_offset, tail_offset)]
|
||||
|
||||
|
||||
def _frame_data_url(video_path: Path, t_s: float) -> str | None:
    """
    Grab the frame at *t_s* seconds and return it as a JPEG ``data:`` URL.

    Frames wider than 640 px are scaled down to 640 px width (height scaled
    by the same factor) before JPEG encoding at quality 72.

    Returns:
        "data:image/jpeg;base64,…" or None if the video cannot be opened,
        the frame cannot be read, or JPEG encoding fails.
    """
    capture = cv2.VideoCapture(str(video_path))
    try:
        if not capture.isOpened():
            return None

        position_ms = max(0.0, t_s) * 1000.0  # negative times clamp to 0
        capture.set(cv2.CAP_PROP_POS_MSEC, position_ms)
        grabbed, image = capture.read()
        if not grabbed or image is None:
            return None

        height, width = image.shape[:2]
        if width > 640:
            scaled_height = int(height * (640 / width))
            image = cv2.resize(image, (640, scaled_height), interpolation=cv2.INTER_AREA)

        jpeg_params = [int(cv2.IMWRITE_JPEG_QUALITY), 72]
        encoded_ok, jpeg = cv2.imencode(".jpg", image, jpeg_params)
        if not encoded_ok:
            return None

        b64_text = base64.b64encode(jpeg.tobytes()).decode("ascii")
        return f"data:image/jpeg;base64,{b64_text}"
    finally:
        # Always release the capture handle, whichever path returned.
        capture.release()
|
||||
|
||||
|
||||
def _call_vision_model(label: str, image_urls: list[str], cfg: AppConfig) -> str:
    """
    Ask the configured vision model to describe a multi-frame sample.

    Args:
        label:      Short identifier embedded in the text part of the prompt.
        image_urls: Image URLs (data URLs in this module) attached to the
                    user message with ``detail="low"``.
        cfg:        Application configuration; only ``cfg.vision`` is read
                    (provider, api_key, model, temperature, max_tokens,
                    base_url, timeout_seconds).

    Returns:
        The first assistant message's content, converted to ``str`` and
        stripped.

    Raises:
        RuntimeError: On a missing API key, HTTP errors, connection-level
            failures, or a response without ``choices[0].message.content``.
    """
    vision = cfg.vision
    if vision.provider in ("openai", "openrouter") and not vision.api_key:
        raise RuntimeError(
            "Vision is enabled but no API key is available. Set VISION_API_KEY, "
            "OPENROUTER_API_KEY, OPENAI_API_KEY, or LLM_API_KEY."
        )

    content: list[dict] = [{
        "type": "text",
        "text": (
            f"Describe this 3-frame sample for matching. Label: {label}. "
            "The frames are start, middle, and end of the same beat/scene."
        ),
    }]
    content.extend({
        "type": "image_url",
        "image_url": {"url": url, "detail": "low"},
    } for url in image_urls)

    headers = {"Content-Type": "application/json"}
    # Key-less providers must not receive a bogus "Bearer None" credential.
    if vision.api_key:
        headers["Authorization"] = f"Bearer {vision.api_key}"
    if vision.provider == "openrouter":
        headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
        headers["X-Title"] = "AI Trailer Generator v2"

    body = json.dumps({
        "model": vision.model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": content},
        ],
        "temperature": vision.temperature,
        "max_tokens": vision.max_tokens,
    }).encode("utf-8")

    url = f"{vision.base_url.rstrip('/')}/chat/completions"
    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
    try:
        with urllib.request.urlopen(req, timeout=vision.timeout_seconds) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        # HTTPError is a URLError subclass, so it has to be handled first.
        body_text = exc.read().decode(errors="replace")
        raise RuntimeError(f"Vision HTTP {exc.code} from {url}:\n{body_text}") from exc
    except urllib.error.URLError as exc:
        # Connection refused, DNS failure, TLS problems, ...
        raise RuntimeError(f"Vision request to {url} failed: {exc.reason}") from exc

    try:
        return str(data["choices"][0]["message"]["content"]).strip()
    except (KeyError, IndexError, TypeError) as exc:
        # A 200 response that does not follow the chat-completions shape.
        raise RuntimeError(
            f"Unexpected vision response shape from {url}: {str(data)[:300]}"
        ) from exc
|
||||
|
||||
|
||||
def _description_key(kind: str, item_id: int, start_s: float, end_s: float, cfg: AppConfig) -> str:
    """
    Build the cache key for one described sample.

    The key combines kind, id, the time window (millisecond precision), the
    vision provider/model and the integer mtime of the underlying video —
    the reference trailer when *kind* is "beat", the source movie otherwise.
    If the file cannot be stat'ed the mtime component is 0.
    """
    if kind == "beat":
        media_path = cfg.paths.reference_trailer
    else:
        media_path = cfg.paths.source_movie

    try:
        mtime_stamp = int(media_path.stat().st_mtime)
    except OSError:
        mtime_stamp = 0

    window = f"{start_s:.3f}:{end_s:.3f}"
    model_tag = f"{cfg.vision.provider}:{cfg.vision.model}"
    return f"{kind}:{item_id}:{window}:{model_tag}:{mtime_stamp}"
|
||||
|
||||
|
||||
def _describe_sample(
    *,
    kind: str,
    item_id: int,
    label: str,
    video_path: Path,
    start_s: float,
    end_s: float,
    cfg: AppConfig,
    cache: dict,
    budget: list[int],
) -> str | None:
    """
    Return a description for one beat/scene sample, using the cache first.

    A truthy cache entry is returned without any model call. Otherwise, if
    ``budget[0]`` is still positive and at least two of the three sample
    frames can be extracted, the vision model is called, the result is
    stored in ``cache["items"]`` and ``budget[0]`` is decremented.

    Returns:
        The description text, or None when the budget is exhausted or fewer
        than two frames could be extracted.
    """
    cache_key = _description_key(kind, item_id, start_s, end_s, cfg)
    hit = cache["items"].get(cache_key)
    if hit:
        return str(hit.get("description", ""))

    # budget is a one-element list so callers share a mutable counter.
    if budget[0] <= 0:
        return None

    image_urls = []
    for sample_t in _sample_times(start_s, end_s):
        frame_url = _frame_data_url(video_path, sample_t)
        if frame_url is not None:
            image_urls.append(frame_url)
    if len(image_urls) < 2:
        return None

    description = _call_vision_model(label, image_urls, cfg)
    cache["items"][cache_key] = {
        "kind": kind,
        "item_id": item_id,
        "start_s": start_s,
        "end_s": end_s,
        "label": label,
        "description": description,
    }
    budget[0] -= 1
    return description
|
||||
|
||||
|
||||
def _terms(text: str) -> set[str]:
    """
    Extract the comparison vocabulary from *text*.

    Tokens are lower-cased runs of at least three characters that start with
    an ASCII letter; entries of ``_STOPWORDS`` are removed.
    """
    lowered = text.lower()
    tokens = re.findall(r"[a-zA-Z][a-zA-Z0-9_'-]{2,}", lowered)
    return set(tokens) - _STOPWORDS
|
||||
|
||||
|
||||
def _text_similarity(a: str, b: str) -> float:
    """
    Score the term overlap between two descriptions.

    The score is ``shared_terms / max(8, smaller_vocabulary_size)``; the
    floor of 8 keeps very short descriptions from scoring high on a couple
    of shared words. Returns 0.0 if either text yields no terms.
    """
    terms_a = _terms(a)
    terms_b = _terms(b)
    if not (terms_a and terms_b):
        return 0.0
    shared = len(terms_a.intersection(terms_b))
    denominator = max(8, min(len(terms_a), len(terms_b)))
    return float(shared / denominator)
|
||||
|
||||
|
||||
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
    """
    Spread up to *max_points* in-point candidates evenly across *scene*.

    Points run from the scene start to 0.2 s before its end. Only the scene
    start is returned when a single point is requested, the scene has no
    positive duration, or it is too short to leave room before the end.
    """
    first_point = scene.start_s
    if max_points <= 1 or scene.duration_s <= 0:
        return [first_point]

    last_point = max(scene.start_s, scene.end_s - 0.2)
    if last_point <= scene.start_s:
        return [scene.start_s]

    spacing = (last_point - scene.start_s) / max(1, max_points - 1)
    seed_points = []
    for idx in range(max_points):
        seed_points.append(scene.start_s + spacing * idx)
    return seed_points
|
||||
|
||||
|
||||
def build_vision_seed_in_points(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    cfg: AppConfig,
) -> dict[int, list[tuple[float, float]]]:
    """
    Return extra in-point seeds from cached vision descriptions.

    The function is intentionally small-budget: for each beat it describes the
    beat once and only a few top scene-level candidates. Existing descriptions
    are read from cache and cost nothing.

    The cache is written back even when a vision request fails midway, so
    descriptions obtained earlier in the run are kept; the exception
    then propagates to the caller.

    Args:
        beats:  Trailer beats to find seeds for.
        scenes: Indexed source scenes that serve as candidates.
        cfg:    Application configuration (``vision`` and ``cv`` sections).

    Returns:
        Mapping of beat id to a sorted list of ``(in_point_s, score)``
        tuples. Empty when vision is disabled or there are no beats/scenes;
        beats without any qualifying scene are omitted.
    """
    if not cfg.vision.enabled:
        return {}
    if not beats or not scenes:
        return {}

    from src.cv.vibe_check import run_vibe_check

    cache = _load_cache(cfg)
    # One-element list: a mutable counter shared with _describe_sample.
    budget = [cfg.vision.max_new_descriptions_per_run]
    scenes_by_id = {scene.scene_id: scene for scene in scenes}
    seeds: dict[int, list[tuple[float, float]]] = {}

    try:
        for beat in beats:
            beat_desc = _describe_sample(
                kind="beat",
                item_id=beat.beat_id,
                label=f"trailer beat {beat.beat_id}",
                video_path=beat.trailer_path,
                start_s=beat.start_s,
                end_s=beat.end_s,
                cfg=cfg,
                cache=cache,
                budget=budget,
            )
            if not beat_desc:
                continue

            hits = run_vibe_check(
                beat,
                scenes,
                top_k=cfg.vision.scene_candidate_top_k,
                hist_method=cfg.cv.vibe_check.hist_compare_method,
                phash_max_distance=64,
            )

            # Keep only candidate scenes whose description is similar enough.
            ranked: list[tuple[float, Scene]] = []
            for hit in hits:
                scene = scenes_by_id.get(hit.scene_id)
                if scene is None:
                    continue
                scene_desc = _describe_sample(
                    kind="scene",
                    item_id=scene.scene_id,
                    label=f"source scene {scene.scene_id}",
                    video_path=scene.source_path,
                    start_s=scene.start_s,
                    end_s=scene.end_s,
                    cfg=cfg,
                    cache=cache,
                    budget=budget,
                )
                if not scene_desc:
                    continue
                score = _text_similarity(beat_desc, scene_desc)
                if score >= cfg.vision.similarity_threshold:
                    ranked.append((score, scene))

            ranked.sort(key=lambda item: item[0], reverse=True)
            points: list[tuple[float, float]] = []
            for score, scene in ranked[:cfg.vision.max_seed_scenes]:
                logger.info(
                    "Beat %d: vision seed scene=%d score=%.3f",
                    beat.beat_id,
                    scene.scene_id,
                    score,
                )
                # Scale the configured seed score by text similarity, capped
                # at 0.98 and floored at the coarse candidate threshold.
                weighted_score = max(
                    cfg.cv.deep_scan.coarse_candidate_threshold,
                    min(0.98, cfg.vision.seed_score * (0.75 + min(1.0, score) * 0.25)),
                )
                points.extend(
                    (point, weighted_score)
                    for point in _scene_seed_points(scene, cfg.vision.seed_points_per_scene)
                )

            if points:
                # Deduplicate on the millisecond-rounded time, keeping the best score.
                merged: dict[float, float] = {}
                for point, weighted_score in points:
                    key = round(max(0.0, point), 3)
                    merged[key] = max(weighted_score, merged.get(key, 0.0))
                seeds[beat.beat_id] = sorted(merged.items())
    finally:
        # Persist whatever was described, even if a later request raised.
        _save_cache(cfg, cache)

    return seeds
|
||||
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
src/pipeline/__init__.py — Orchestration layer
|
||||
"""
|
||||
@@ -0,0 +1,291 @@
|
||||
"""
|
||||
src/pipeline/matcher.py — Top-level CV matching orchestrator
|
||||
|
||||
This is the single entry point for the full 2-phase CV pipeline:
|
||||
|
||||
Phase 0: Load / build scene index (PySceneDetect + fingerprinting)
|
||||
Phase 1: Vibe Check — histogram + pHash filter → Top-K candidates per beat
|
||||
Phase 2: Deep Scan — template matching → frame-accurate MatchResult per beat
|
||||
|
||||
Usage:
|
||||
from src.core.config import load_config
|
||||
from src.pipeline.matcher import run_matching
|
||||
|
||||
cfg = load_config()
|
||||
beats = [...] # list[TrailerBeat] from trailer analysis
|
||||
results = run_matching(cfg, beats)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Sequence
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import MatchResult, Scene, TrailerBeat
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
SeedPoint = float | tuple[float, float]
|
||||
|
||||
|
||||
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
    """
    Return evenly spaced in-point candidates inside *scene*.

    Candidates span from the scene start to 0.2 s before the scene end.
    The scene start alone is returned when at most one point is requested,
    the scene duration is not positive, or the usable range is empty.
    """
    scene_start = scene.start_s
    if max_points <= 1 or scene.duration_s <= 0:
        return [scene_start]

    usable_end = max(scene.start_s, scene.end_s - 0.2)
    if not usable_end > scene.start_s:
        return [scene.start_s]

    intervals = max(1, max_points - 1)
    step = (usable_end - scene.start_s) / intervals
    return [scene.start_s + step * position for position in range(max_points)]
|
||||
|
||||
|
||||
def _build_scene_seed_in_points(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    cfg: AppConfig,
) -> dict[int, list[float]]:
    """
    Derive scene-level in-point seeds for every beat from the vibe check.

    For each beat the top ``scene_seed_top_k`` vibe-check hits are expanded
    into evenly spaced points per scene. Points are clamped to >= 0, rounded
    to milliseconds, deduplicated and sorted. Beats that yield no points are
    absent from the result.
    """
    from src.cv.vibe_check import run_vibe_check

    scene_lookup = {scene.scene_id: scene for scene in scenes}
    seeds: dict[int, list[float]] = {}

    for beat in beats:
        hits = run_vibe_check(
            beat,
            scenes,
            top_k=cfg.cv.deep_scan.scene_seed_top_k,
            hist_method=cfg.cv.vibe_check.hist_compare_method,
            phash_max_distance=64,
        )

        unique_points: set[float] = set()
        for hit in hits:
            scene = scene_lookup.get(hit.scene_id)
            if scene is None:
                continue
            for raw_point in _scene_seed_points(scene, cfg.cv.deep_scan.scene_seed_points_per_scene):
                unique_points.add(round(max(0.0, raw_point), 3))

        if not unique_points:
            continue

        seeds[beat.beat_id] = sorted(unique_points)
        logger.info(
            "Beat %d: added %d scene-level seed candidates from %d source scenes.",
            beat.beat_id,
            len(seeds[beat.beat_id]),
            len(hits),
        )

    return seeds
|
||||
|
||||
|
||||
def _merge_seed_in_points(
    *seed_maps: dict[int, Sequence[SeedPoint]] | None,
) -> dict[int, list[SeedPoint]]:
    """
    Combine several beat_id → seed-point maps into one.

    Seed points are either a bare time or a ``(time, score)`` tuple. Times
    are clamped to >= 0 and rounded to milliseconds; points landing on the
    same time are merged, keeping the highest score (a scored point wins
    over an unscored one). Falsy maps are ignored.

    Returns:
        beat_id → time-sorted list; scored entries are ``(time, score)``
        tuples, unscored entries are bare floats.
    """
    collected: dict[int, dict[float, float | None]] = {}

    for seed_map in seed_maps:
        if not seed_map:
            continue
        for beat_id, points in seed_map.items():
            per_beat = collected.setdefault(beat_id, {})
            for point in points:
                if isinstance(point, tuple):
                    t_sec = round(max(0.0, float(point[0])), 3)
                    score = float(point[1])
                else:
                    t_sec = round(max(0.0, float(point)), 3)
                    score = None

                previous = per_beat.get(t_sec)
                if previous is None:
                    # Time not seen yet, or seen only without a score.
                    per_beat[t_sec] = score
                elif score is not None:
                    per_beat[t_sec] = max(previous, score)

    result: dict[int, list[SeedPoint]] = {}
    for beat_id, per_beat in collected.items():
        ordered: list[SeedPoint] = []
        for t_sec in sorted(per_beat):
            score = per_beat[t_sec]
            ordered.append(t_sec if score is None else (t_sec, score))
        result[beat_id] = ordered
    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Beat fingerprinting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fingerprint_beats(
    beats: Sequence[TrailerBeat],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """
    Attach a visual fingerprint (histograms + pHash) to every TrailerBeat.

    For each beat the frame at ``beat.midpoint_s`` is read from
    ``beat.trailer_path`` and passed to ``fingerprint_frame`` together with
    the vibe-check configuration.

    Args:
        beats: TrailerBeat list (fingerprints will be None initially).
        cfg:   Application configuration.

    Returns:
        New list, in input order. Beats whose midpoint frame could not be
        decoded are passed through unchanged; all others are copies with
        luma_hist, sat_hist and phash set.
    """
    from dataclasses import replace
    from src.cv.fingerprinting import fingerprint_frame
    from src.cv.frame_extractor import grab_frame_at_path

    vibe_cfg = cfg.cv.vibe_check
    enriched: list[TrailerBeat] = []

    for beat in beats:
        frame = grab_frame_at_path(beat.trailer_path, beat.midpoint_s)
        if frame is not None:
            luma_hist, sat_hist, phash = fingerprint_frame(frame, vibe_cfg)
            enriched.append(replace(beat, luma_hist=luma_hist, sat_hist=sat_hist, phash=phash))
        else:
            logger.warning("Beat %d: cannot decode midpoint frame, leaving unfingerpinted.", beat.beat_id)
            enriched.append(beat)

    fingerprinted = len([b for b in enriched if b.phash])
    logger.info("Fingerprinted %d / %d beats.", fingerprinted, len(beats))
    return enriched
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main pipeline entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_matching(
    cfg: AppConfig,
    beats: Sequence[TrailerBeat],
    force_reindex: bool = False,
    seed_in_points: dict[int, Sequence[SeedPoint]] | None = None,
) -> list[MatchResult]:
    """
    Execute the full CV matching pipeline.

    Steps: build (or load) the scene index, fingerprint the beats, collect
    in-point seeds (caller-supplied, scene-level, and — if enabled —
    vision-based), then hand everything to the global scan.

    Args:
        cfg:            Application configuration (loaded from config.toml).
        beats:          All trailer beats to source (must have trailer_path set).
        force_reindex:  If True, ignore the scene cache and re-run PySceneDetect.
        seed_in_points: Optional caller-supplied seeds per beat id. Each seed
                        is a time in seconds or a ``(time, score)`` tuple;
                        they are merged with the automatically built seeds.

    Returns:
        The MatchResult list produced by the global scan.
        # NOTE(review): the original docstring states one result per matched
        # beat, in input order — confirm against run_global_scan.
    """
    from src.cv.scene_indexer import build_scene_index

    logger.info("=" * 60)
    logger.info("AI Trailer Generator v2 — CV Matching Pipeline")
    logger.info("Source : %s", cfg.paths.source_movie.name)
    logger.info("Trailer: %s", cfg.paths.reference_trailer.name)
    logger.info("Beats  : %d", len(beats))
    logger.info("=" * 60)

    # ------------------------------------------------------------------
    # Phase 0: Scene index
    # ------------------------------------------------------------------
    logger.info("[Phase 0] Building scene index …")
    scenes: list[Scene] = build_scene_index(cfg, force_reindex=force_reindex)
    logger.info("[Phase 0] %d scenes indexed.", len(scenes))

    # ------------------------------------------------------------------
    # Phase 0b: Fingerprint the beats
    # ------------------------------------------------------------------
    logger.info("[Phase 0b] Fingerprinting %d trailer beats …", len(beats))
    beats = fingerprint_beats(beats, cfg)

    # ------------------------------------------------------------------
    # Phase 1 & 2: Global Scan. The scene index and vibe check are used
    # here only to produce seed in-points for the scan.
    # ------------------------------------------------------------------
    logger.info("[Phase 1 & 2] Running FFmpeg Global Scan for %d beats ...", len(beats))
    from src.cv.global_scan import run_global_scan

    scene_seed_in_points = _build_scene_seed_in_points(beats, scenes, cfg)
    vision_seed_in_points = {}
    if cfg.vision.enabled:
        # Vision seeding is best-effort: any failure falls back to CV-only seeds.
        try:
            from src.llm.vision_cache import build_vision_seed_in_points

            vision_seed_in_points = build_vision_seed_in_points(beats, scenes, cfg)
        except Exception as exc:
            logger.error("Vision seeding failed: %s — continuing with CV-only seeds.", exc)
    results = run_global_scan(
        beats,
        cfg,
        scenes=scenes,
        seed_in_points=_merge_seed_in_points(seed_in_points, scene_seed_in_points, vision_seed_in_points),
    )

    logger.info("[Phase 1 & 2] Done. %d / %d beats matched.", len(results), len(beats))
    logger.info("=" * 60)

    return results
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Convenience: build an EditTimeline from match results
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_timeline(
    beats: Sequence[TrailerBeat],
    results: Sequence[MatchResult],
    cfg: AppConfig,
) -> "src.core.models.EditTimeline":  # type: ignore[name-defined]
    """
    Combine beats + match results into an ordered EditTimeline.

    Beats are laid out back to back using each beat's own duration. A beat
    without a match produces no clip but still advances the timeline cursor,
    leaving a gap. For matched beats the source duration is the shorter of
    the beat and the match (or the full beat duration when the match
    duration is not positive); any remainder is recorded as trailer tail.

    Args:
        beats:   All trailer beats (defines order + durations).
        results: MatchResult list from run_matching().
        cfg:     Application configuration.

    Returns:
        EditTimeline titled after the reference trailer's file stem, using
        ``cfg.export.edl_frame_rate`` as frame rate.
    """
    from src.core.models import EditClip, EditTimeline

    match_for_beat: dict[int, MatchResult] = {result.beat_id: result for result in results}

    clips: list[EditClip] = []
    cursor = 0.0

    for beat in beats:
        match = match_for_beat.get(beat.beat_id)
        if match is None:
            logger.warning("Beat %d has no match — gap in timeline.", beat.beat_id)
            cursor += beat.duration_s
            continue

        match_duration = max(0.0, match.duration_s)
        if match_duration > 0:
            source_duration = min(beat.duration_s, match_duration)
        else:
            source_duration = beat.duration_s

        trailer_tail_s = max(0.0, beat.duration_s - source_duration)
        if trailer_tail_s > 0:
            logger.warning(
                "Beat %d uses %.2fs source + %.2fs generated trailer tail.",
                beat.beat_id,
                source_duration,
                trailer_tail_s,
            )

        clips.append(
            EditClip(
                clip_index=len(clips),
                beat=beat,
                match=match,
                timeline_start_s=cursor,
                timeline_end_s=cursor + beat.duration_s,
                source_duration_s=source_duration,
                trailer_tail_s=trailer_tail_s,
            )
        )
        cursor += beat.duration_s

    timeline = EditTimeline(
        title=cfg.paths.reference_trailer.stem,
        frame_rate=cfg.export.edl_frame_rate,
        clips=tuple(clips),
    )

    logger.info(
        "Timeline built: %d clips, total duration %.2fs",
        timeline.clip_count, timeline.total_duration_s,
    )
    return timeline
|
||||
@@ -0,0 +1,427 @@
|
||||
"""
|
||||
src/pipeline/reporter.py — Visual Match Report Generator
|
||||
|
||||
Generates an HTML file containing side-by-side video clips of:
|
||||
Left: The original beat from the reference trailer
|
||||
Right: The matched scene from the source movie
|
||||
|
||||
This allows instant visual verification of the CV pipeline's results.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from src.core.config import AppConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_clip(video_path: Path, start_s: float, duration_s: float, out_path: Path) -> None:
    """Use ffmpeg to extract a silent, low-res preview clip.

    Args:
        video_path: Video to cut the preview from.
        start_s: Start of the clip in seconds. Negative values are clamped
            to 0 so ffmpeg never receives a negative seek offset.
        duration_s: Length of the clip in seconds (passed to ffmpeg ``-t``).
        out_path: Destination file; its parent directory is created if
            missing and an existing file is overwritten (``-y``).

    Failures are best-effort: a non-zero ffmpeg exit code is logged with the
    captured stderr and the function returns normally.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Fast input seek close to the target, then accurate output seek for
    # frame-faithful preview clips. A plain "-ss before -i" can land on a
    # nearby keyframe and make the report look several frames out of sync.
    # The pre-roll is "up to 2 s": for clips starting inside the first two
    # seconds the input seek is 0 and the whole offset is handled by the
    # accurate output seek, instead of silently dropping the pre-roll.
    start_s = max(0.0, start_s)
    input_seek_s = max(0.0, start_s - 2.0)
    accurate_seek_s = start_s - input_seek_s

    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-ss", str(input_seek_s),
        "-i", str(video_path),
        "-ss", str(accurate_seek_s),
        "-t", str(duration_s),
        "-map", "0:v:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-vf", "scale=640:-2",  # scale down for lightweight report
        "-an",  # no audio
        "-movflags", "+faststart",
        str(out_path),
    ]

    result = subprocess.run(cmd, capture_output=True)
    if result.returncode != 0:
        logger.error(
            "ffmpeg clip extraction failed for %s:\n%s",
            out_path.name, result.stderr.decode(errors="replace")
        )
|
||||
|
||||
|
||||
def _extract_clip_with_black_tail(
    video_path: Path,
    start_s: float,
    source_duration_s: float,
    total_duration_s: float,
    out_path: Path,
) -> None:
    """Extract a source preview and append black frames for trailer-only tails.

    Args:
        video_path: Video to cut the source portion from.
        start_s: Start of the source portion in seconds (clamped to >= 0).
        source_duration_s: Length of real source footage to render.
        total_duration_s: Desired length of the finished preview; the
            difference to ``source_duration_s`` is filled with black.
        out_path: Destination file (overwritten if it exists).

    When the tail is 0.02 s or shorter the work is delegated to
    ``_extract_clip``. When there is no source footage at all
    (``source_duration_s <= 0``) only the black clip is rendered, straight to
    ``out_path``. Every ffmpeg failure is logged and the function returns
    without raising; the intermediate files are removed on every exit path.
    """
    tail_s = max(0.0, total_duration_s - source_duration_s)
    if tail_s <= 0.02:
        _extract_clip(video_path, start_s, source_duration_s, out_path)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)
    codec_args = ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28"]
    mux_args = ["-an", "-movflags", "+faststart"]

    def run_step(cmd: list[str], failure_msg: str) -> bool:
        """Run one ffmpeg command; log stderr and return False on failure."""
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            logger.error(
                failure_msg,
                out_path.name,
                result.stderr.decode(errors="replace"),
            )
            return False
        return True

    def black_cmd(duration_s: float, target: Path) -> list[str]:
        """Build the ffmpeg command that renders *duration_s* of black."""
        return [
            "ffmpeg", "-y", "-loglevel", "error",
            "-f", "lavfi",
            "-i", f"color=c=black:s=640x360:r=25:d={duration_s}",
            *codec_args,
            *mux_args,
            str(target),
        ]

    if source_duration_s <= 0:
        # Nothing to take from the source: a "-t 0" source render would give
        # the concat step an empty input, so emit the placeholder directly.
        run_step(
            black_cmd(tail_s, out_path),
            "ffmpeg black tail render failed for %s:\n%s",
        )
        return

    source_tmp = out_path.with_name(f"{out_path.stem}_source_tmp.mp4")
    tail_tmp = out_path.with_name(f"{out_path.stem}_tail_tmp.mp4")

    # Same seek strategy as _extract_clip(): fast input seek up to 2 s before
    # the target, then an accurate output seek for the remainder.
    start_s = max(0.0, start_s)
    input_seek_s = max(0.0, start_s - 2.0)
    accurate_seek_s = start_s - input_seek_s

    # First render the matched source portion with the same accurate seek path
    # as _extract_clip(). Using trim=start=... after an input seek is brittle
    # because FFmpeg may preserve non-zero packet timestamps around keyframes.
    source_cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-ss", str(input_seek_s),
        "-i", str(video_path),
        "-ss", str(accurate_seek_s),
        "-t", str(source_duration_s),
        "-map", "0:v:0",
        *codec_args,
        "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
        *mux_args,
        str(source_tmp),
    ]
    concat_cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", str(source_tmp),
        "-i", str(tail_tmp),
        "-filter_complex", "[0:v][1:v]concat=n=2:v=1:a=0[v]",
        "-map", "[v]",
        *codec_args,
        *mux_args,
        str(out_path),
    ]

    try:
        if not run_step(source_cmd, "ffmpeg source preview extraction failed for %s:\n%s"):
            return
        if not run_step(black_cmd(tail_s, tail_tmp), "ffmpeg black tail render failed for %s:\n%s"):
            return
        run_step(concat_cmd, "ffmpeg tailed preview concat failed for %s:\n%s")
    finally:
        # Runs on the early-return failure paths too, so partially written
        # intermediates never pile up next to the report.
        for tmp in (source_tmp, tail_tmp):
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless
|
||||
|
||||
|
||||
def _extract_segmented_clip(
    video_path: Path,
    segments: list,
    total_duration_s: float,
    out_path: Path,
) -> None:
    """Render a beat-length source preview from multiple matched source islands.

    Args:
        video_path: Source video the islands are cut from.
        segments: Objects exposing ``trailer_offset_s``, ``in_point_s`` and
            ``duration_s``; they are processed in ``trailer_offset_s`` order.
        total_duration_s: Target length of the preview; gaps before, between
            and after the islands are filled with black.
        out_path: Destination file (overwritten if it exists).

    Parts shorter than or equal to 0.02 s are skipped. ffmpeg failures are
    logged and never raised. If no part could be rendered, an error is logged
    and ``out_path`` is left untouched. All intermediate part files — including
    those from failed renders — are removed on every exit path.
    """
    if not segments:
        _extract_clip_with_black_tail(video_path, 0.0, 0.0, total_duration_s, out_path)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)
    codec_args = ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28"]
    mux_args = ["-an", "-movflags", "+faststart"]
    tmp_paths: list[Path] = []  # successfully rendered parts, in playback order
    attempted: list[Path] = []  # every part file ffmpeg may have created

    def render_part(kind: str, head_args: list[str], filter_args: list[str], failure_msg: str) -> None:
        """Render one part to a temp file and record it if ffmpeg succeeded."""
        tmp = out_path.with_name(f"{out_path.stem}_part_{len(tmp_paths):03d}_{kind}.mp4")
        attempted.append(tmp)
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            *head_args,
            *codec_args,
            *filter_args,
            *mux_args,
            str(tmp),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode == 0 and tmp.exists():
            tmp_paths.append(tmp)
        else:
            logger.error(failure_msg, result.stderr.decode(errors="replace"))

    def add_black(duration_s: float) -> None:
        """Append a black filler part of *duration_s* seconds."""
        if duration_s <= 0.02:
            return
        render_part(
            "black",
            ["-f", "lavfi", "-i", f"color=c=black:s=640x360:r=25:d={duration_s}"],
            [],
            "ffmpeg black segment render failed:\n%s",
        )

    def add_source(start_s: float, duration_s: float) -> None:
        """Append *duration_s* seconds of source footage starting at *start_s*."""
        if duration_s <= 0.02:
            return
        # Fast input seek up to 2 s before the target, accurate output seek
        # for the remainder; negative starts are clamped to 0.
        start_s = max(0.0, start_s)
        input_seek_s = max(0.0, start_s - 2.0)
        accurate_seek_s = start_s - input_seek_s
        render_part(
            "src",
            [
                "-ss", str(input_seek_s),
                "-i", str(video_path),
                "-ss", str(accurate_seek_s),
                "-t", str(duration_s),
                "-map", "0:v:0",
            ],
            ["-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS"],
            "ffmpeg source segment render failed:\n%s",
        )

    try:
        cursor = 0.0
        for segment in sorted(segments, key=lambda s: s.trailer_offset_s):
            offset_s = max(0.0, float(segment.trailer_offset_s))
            duration_s = max(0.0, float(segment.duration_s))
            add_black(offset_s - cursor)  # gap before this island, if any
            add_source(float(segment.in_point_s), duration_s)
            cursor = max(cursor, offset_s + duration_s)

        add_black(total_duration_s - cursor)  # trailing unmatched tail

        if not tmp_paths:
            # concat=n=0 is not a valid filter graph; report the real problem.
            logger.error("No preview parts could be rendered for %s.", out_path.name)
            return

        if len(tmp_paths) == 1:
            # A single part already is the finished preview; just move it.
            tmp_paths[0].replace(out_path)
            return

        inputs: list[str] = []
        for tmp in tmp_paths:
            inputs.extend(["-i", str(tmp)])
        labels = "".join(f"[{idx}:v]" for idx in range(len(tmp_paths)))
        filter_complex = labels + f"concat=n={len(tmp_paths)}:v=1:a=0[v]"
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            *inputs,
            "-filter_complex", filter_complex,
            "-map", "[v]",
            *codec_args,
            *mux_args,
            str(out_path),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            logger.error("ffmpeg segmented preview concat failed:\n%s", result.stderr.decode(errors="replace"))
    finally:
        for tmp in attempted:
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless
|
||||
|
||||
|
||||
def _build_frame_locked_compare(ref_path: Path, src_path: Path, out_path: Path) -> None:
    """Render reference and source into one side-by-side video stream.

    Both inputs go through the same normalisation chain (25 fps, fitted into
    640x360 with padding, square pixels, timestamps reset to zero) and are
    then stacked horizontally with the reference on the left. A failing
    ffmpeg run is logged with its stderr; nothing is raised.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    per_input_chain = ",".join([
        "fps=25",
        "scale=640:360:force_original_aspect_ratio=decrease",
        "pad=640:360:(ow-iw)/2:(oh-ih)/2",
        "setsar=1",
        "setpts=PTS-STARTPTS",
    ])
    graph = ";".join([
        f"[0:v]{per_input_chain}[ref]",
        f"[1:v]{per_input_chain}[src]",
        "[ref][src]hstack=inputs=2[v]",
    ])

    ffmpeg_args = ["ffmpeg", "-y", "-loglevel", "error"]
    ffmpeg_args += ["-i", str(ref_path), "-i", str(src_path)]
    ffmpeg_args += ["-filter_complex", graph, "-map", "[v]"]
    ffmpeg_args += ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28"]
    ffmpeg_args += ["-an", "-movflags", "+faststart", str(out_path)]

    proc = subprocess.run(ffmpeg_args, capture_output=True)
    if proc.returncode == 0:
        return
    logger.error(
        "ffmpeg compare render failed for %s:\n%s",
        out_path.name,
        proc.stderr.decode(errors="replace"),
    )
|
||||
|
||||
|
||||
def generate_report(beats: list, results: list, cfg: AppConfig) -> Path:
    """
    Generate an HTML side-by-side report.

    For every beat a reference preview clip is rendered; for matched beats a
    source preview and a combined "frame-locked" compare clip are rendered as
    well. All clips and the HTML file are written to
    ``cfg.paths.output_dir / "report"``.

    Args:
        beats:   Trailer beats to report on, in display order.
        results: Match results; looked up by their ``beat_id``.
        cfg:     Application configuration (only ``paths.output_dir`` is read).

    Returns the path to the .html file.
    """
    # Local import: the page buffer below was historically called ``html``,
    # so only the one function that is needed is brought into scope.
    from html import escape

    report_dir = cfg.paths.output_dir / "report"
    report_dir.mkdir(parents=True, exist_ok=True)

    html_path = report_dir / "match_report.html"
    results_by_beat = {r.beat_id: r for r in results}

    logger.info("Generating report clips in %s (this might take a moment) ...", report_dir)

    page = [
        "<!DOCTYPE html>",
        "<html><head><meta charset='utf-8'><title>AI Trailer Match Report</title>",
        "<style>",
        "body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: #0f0f0f; color: #e0e0e0; margin: 40px; }",
        "h1 { color: #fff; border-bottom: 1px solid #333; padding-bottom: 10px; }",
        ".stats { font-size: 1.2em; margin-bottom: 30px; color: #aaa; }",
        ".beat-row { display: flex; margin-bottom: 30px; background: #1a1a1a; padding: 20px; border-radius: 12px; border: 1px solid #333; }",
        ".info { width: 250px; padding-right: 20px; flex-shrink: 0; }",
        ".info h3 { margin-top: 0; color: #fff; }",
        ".video-container { display: flex; gap: 20px; flex-grow: 1; }",
        ".videos { flex-grow: 1; }",
        ".compare { margin-bottom: 18px; }",
        ".video-col { flex: 1; }",
        ".video-col p { margin-top: 0; font-weight: bold; color: #888; }",
        "video { width: 100%; border-radius: 6px; box-shadow: 0 4px 6px rgba(0,0,0,0.5); background: #000; }",
        ".status-match { color: #4ade80; font-weight: bold; font-size: 1.1em; }",
        ".status-miss { color: #f87171; font-weight: bold; font-size: 1.1em; }",
        ".score { font-family: monospace; font-size: 1.1em; color: #60a5fa; }",
        ".code-hint { background: #000; padding: 10px; border-radius: 4px; font-family: monospace; font-size: 0.9em; margin-top: 15px; color: #a3e635; }",
        "</style></head><body>",
        "<h1>AI Trailer Generator — Match Report</h1>",
        f"<div class='stats'>Total Beats: {len(beats)} | Matched: {len(results)}</div>",
        "<script>",
        "function syncBeat(row) {",
        " const vids = row.querySelectorAll('video');",
        " if (vids.length < 2) return;",
        " const ref = vids[0];",
        " const src = vids[1];",
        " let syncing = false;",
        " function align() {",
        " if (syncing) return;",
        " syncing = true;",
        " const target = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02));",
        " if (Math.abs(src.currentTime - target) > 0.035) src.currentTime = target;",
        " if (ref.paused && !src.paused) src.pause();",
        " if (!ref.paused && src.paused) src.play().catch(() => {});",
        " syncing = false;",
        " }",
        " ref.addEventListener('play', () => { src.currentTime = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02)); src.play().catch(() => {}); });",
        " ref.addEventListener('pause', () => src.pause());",
        " ref.addEventListener('seeked', () => { src.currentTime = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02)); });",
        " ref.addEventListener('timeupdate', align);",
        "}",
        "document.addEventListener('DOMContentLoaded', () => document.querySelectorAll('.beat-row').forEach(syncBeat));",
        "</script>",
    ]

    for beat in beats:
        res = results_by_beat.get(beat.beat_id)

        # Extract Reference Clip
        ref_mp4 = report_dir / f"beat_{beat.beat_id:03d}_ref.mp4"
        _extract_clip(beat.trailer_path, beat.start_s, beat.duration_s, ref_mp4)

        page.append("<div class='beat-row'>")

        # Info Panel
        page.append("<div class='info'>")
        page.append(f"<h3>Beat {beat.beat_id:03d}</h3>")
        page.append(f"<p><b>Type:</b> {escape(str(beat.beat_type.name))}</p>")
        page.append(f"<p><b>Trailer:</b> {beat.start_s:.2f}s → {beat.end_s:.2f}s</p>")

        src_mp4 = None
        compare_mp4 = None

        if res is not None:
            segments = list(getattr(res, "segments", ()) or [])
            source_duration = sum(max(0.0, float(s.duration_s)) for s in segments)
            if not segments:
                source_duration = max(0.0, res.out_point_s - res.in_point_s)
            # Never preview more source than the beat lasts; with no usable
            # source duration fall back to the full beat length.
            preview_duration = min(beat.duration_s, source_duration) if source_duration > 0 else beat.duration_s
            last_segment_end = max(
                (float(s.trailer_offset_s) + float(s.duration_s) for s in segments),
                default=preview_duration,
            )
            trailer_tail_s = max(0.0, beat.duration_s - last_segment_end)

            if getattr(res, "is_confirmed", True):
                page.append("<p class='status-match'>MATCHED</p>")
            else:
                page.append("<p style='color: #fbbf24; font-weight: bold; font-size: 1.1em;'>PROVISIONAL MATCH</p>")
            # scene_id comes from upstream data, so it is escaped before it
            # is placed into markup.
            page.append(f"<p><b>Scene ID:</b> {escape(str(res.scene_id))}</p>")
            page.append(f"<p><b>Movie In:</b> {res.in_point_s:.2f}s</p>")
            page.append(f"<p><b>Source Dur:</b> {source_duration:.2f}s</p>")
            if len(segments) > 1:
                page.append(f"<p><b>Segments:</b> {len(segments)} matched visual islands</p>")
            if trailer_tail_s > 0:
                page.append(f"<p><b>Unmatched Tail:</b> {trailer_tail_s:.2f}s placeholder</p>")
            page.append(f"<p><b>Score:</b> <span class='score'>{res.match_score:.3f}</span></p>")
            if trailer_tail_s > 0:
                page.append("<p style='color: #fbbf24; font-size: 0.9em;'>Some trailer frames are still unmatched; report fills only those gaps with placeholder black.</p>")

            # Warn if score is low
            if res.match_score < 0.80:
                page.append("<p style='color: #fbbf24; font-size: 0.9em;'>⚠️ Score below 0.80. Verify visually.</p>")

            # Extract Source Clip
            src_mp4 = report_dir / f"beat_{beat.beat_id:03d}_src.mp4"
            compare_mp4 = report_dir / f"beat_{beat.beat_id:03d}_compare.mp4"
            if segments:
                _extract_segmented_clip(res.source_path, segments, beat.duration_s, src_mp4)
            else:
                _extract_clip_with_black_tail(
                    res.source_path,
                    res.in_point_s,
                    preview_duration,
                    beat.duration_s,
                    src_mp4,
                )
            _build_frame_locked_compare(ref_mp4, src_mp4, compare_mp4)
        else:
            page.append("<p class='status-miss'>NO MATCH</p>")

        page.append(f"<div class='code-hint'>python cli.py rematch --beat {beat.beat_id}</div>")
        page.append("</div>")  # /info

        # Video Panel
        page.append("<div class='videos'>")
        if compare_mp4 is not None and compare_mp4.exists():
            page.append(f"<div class='compare'><p>Frame-Locked Compare</p><video src='{compare_mp4.name}' controls loop muted autoplay></video></div>")
        else:
            # Either there is no match, or the compare render failed (the
            # ffmpeg helpers only log). Fall back to separate columns rather
            # than embedding a <video> that points at a missing file.
            page.append("<div class='video-container'>")
            page.append(f"<div class='video-col'><p>Reference Trailer</p><video src='{ref_mp4.name}' controls loop muted autoplay></video></div>")
            if src_mp4 is not None and src_mp4.exists():
                page.append(f"<div class='video-col'><p>Matched Source</p><video src='{src_mp4.name}' controls loop muted autoplay></video></div>")
            else:
                placeholder = "No Match" if res is None else "Preview unavailable"
                page.append(f"<div class='video-col'><p>Matched Source</p><div style='width: 100%; aspect-ratio: 16/9; background: #222; display: flex; align-items: center; justify-content: center; border-radius: 6px; color: #555;'>{placeholder}</div></div>")
            page.append("</div>")  # /video-container
        page.append("</div>")  # /videos
        page.append("</div>")  # /beat-row

    page.append("</body></html>")

    html_path.write_text("\n".join(page), encoding="utf-8")
    return html_path
|
||||
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
src/pipeline/trailer_analyzer.py — Reference trailer → list[TrailerBeat]
|
||||
|
||||
Responsibility:
|
||||
1. Run PySceneDetect on the REFERENCE TRAILER (not the source movie)
|
||||
to detect cut boundaries → raw beat intervals
|
||||
2. Fingerprint the midpoint frame of each beat (for Vibe Check)
|
||||
3. Transcribe dialogue per beat via Whisper (optional, injected)
|
||||
4. Optionally classify BeatType via the LLM dramaturg (injected)
|
||||
|
||||
Returns: list[TrailerBeat] ready to feed into run_matching().
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import replace
|
||||
from pathlib import Path
|
||||
from typing import Callable, Sequence
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import BeatType, DialogueLine, TrailerBeat
|
||||
from src.cv.fingerprinting import fingerprint_frame
|
||||
from src.cv.frame_extractor import grab_midpoint_frame, open_video
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Injection type aliases — keeps this module free of hard audio/LLM imports
|
||||
TranscribeCallback = Callable[[Path, float, float, float], list[DialogueLine]]
|
||||
ClassifyCallback = Callable[[list[TrailerBeat]], list[TrailerBeat]]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 1: Scene detection on the reference trailer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _detect_trailer_beats(cfg: AppConfig) -> list[tuple[float, float, int, int]]:
|
||||
"""
|
||||
Run PySceneDetect on the reference trailer.
|
||||
|
||||
Returns list of (start_s, end_s, start_frame, end_frame).
|
||||
Uses the same ContentDetector thresholds as the source movie.
|
||||
"""
|
||||
try:
|
||||
from scenedetect import open_video as sd_open_video, SceneManager
|
||||
from scenedetect.detectors import ContentDetector
|
||||
except ImportError:
|
||||
raise ImportError("pip install scenedetect[opencv]")
|
||||
|
||||
trailer_path = cfg.paths.reference_trailer
|
||||
video = sd_open_video(str(trailer_path))
|
||||
manager = SceneManager()
|
||||
manager.add_detector(
|
||||
ContentDetector(
|
||||
threshold=cfg.scene_detection.content_threshold,
|
||||
min_scene_len=int(
|
||||
cfg.scene_detection.min_scene_duration_s * video.frame_rate
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
logger.info("Detecting beats in reference trailer: %s …", trailer_path.name)
|
||||
manager.detect_scenes(video=video, show_progress=False)
|
||||
|
||||
raw = manager.get_scene_list()
|
||||
result = [
|
||||
(s.get_seconds(), e.get_seconds(), s.get_frames(), e.get_frames())
|
||||
for s, e in raw
|
||||
]
|
||||
logger.info("Detected %d beats in reference trailer.", len(result))
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 2: Fingerprint beats
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _fingerprint_beats(
    raw_beats: list[tuple[float, float, int, int]],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """Build one TrailerBeat per raw interval, fingerprinting its midpoint frame.

    The beat id is the interval's position in *raw_beats*. When the midpoint
    frame cannot be decoded a warning is logged and the beat is still emitted,
    just without the luma/saturation histograms and perceptual hash.
    """
    vibe_cfg = cfg.cv.vibe_check
    reference = cfg.paths.reference_trailer
    collected: list[TrailerBeat] = []

    with open_video(reference) as capture:
        for beat_id, (t_start, t_end, f_start, f_end) in enumerate(raw_beats):
            # Fields shared by fingerprinted and non-fingerprinted beats.
            fields = {
                "beat_id": beat_id,
                "trailer_path": reference,
                "start_s": t_start,
                "end_s": t_end,
                "start_frame": f_start,
                "end_frame": f_end,
            }

            midpoint = grab_midpoint_frame(capture, t_start, t_end)
            if midpoint is None:
                logger.warning("Beat %d: midpoint frame decode failed.", beat_id)
            else:
                luma, saturation, perceptual_hash = fingerprint_frame(midpoint, vibe_cfg)
                fields["luma_hist"] = luma
                fields["sat_hist"] = saturation
                fields["phash"] = perceptual_hash

            collected.append(TrailerBeat(**fields))

    return collected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def analyze_reference_trailer(
    cfg: AppConfig,
    transcribe_callback: TranscribeCallback | None = None,
    classify_callback: ClassifyCallback | None = None,
) -> list[TrailerBeat]:
    """
    Run the complete analysis of the reference trailer.

    Stages: cut detection, midpoint fingerprinting, then the two optional,
    injected stages (dialogue transcription and beat classification). Both
    optional stages are best-effort: a failure is logged as a warning and the
    affected beat(s) are kept as they were.

    Args:
        cfg:                  Application configuration.
        transcribe_callback:  Optional fn(path, start_s, end_s, offset_s)
                              → list[DialogueLine]. Called once per beat with
                              the beat start as offset; injected so this
                              module needs no faster-whisper import.
        classify_callback:    Optional fn(beats) → beats with BeatType set.
                              Injected so this module stays LLM-free.

    Returns:
        The TrailerBeat list, fingerprinted and, where the callbacks were
        given and succeeded, carrying dialogue and a BeatType.
    """
    # Steps 1 + 2 — cut detection feeding straight into fingerprinting
    beats = _fingerprint_beats(_detect_trailer_beats(cfg), cfg)

    # Step 3 — dialogue (optional)
    if transcribe_callback is not None:

        def attach_dialogue(beat: TrailerBeat) -> TrailerBeat:
            """Return *beat* with transcribed dialogue, or unchanged on failure."""
            try:
                spoken = transcribe_callback(
                    beat.trailer_path,
                    beat.start_s,
                    beat.end_s,
                    beat.start_s,  # time_offset so timestamps are absolute
                )
                return replace(beat, dialogue=tuple(spoken))
            except Exception as exc:
                logger.warning("Beat %d transcription failed: %s", beat.beat_id, exc)
                return beat

        beats = [attach_dialogue(beat) for beat in beats]

    # Step 4 — LLM dramaturgy (optional)
    if classify_callback is not None:
        try:
            beats = classify_callback(beats)
        except Exception as exc:
            logger.warning("Beat classification failed: %s — keeping UNKNOWN.", exc)

    total = len(beats)
    with_dialogue = len([b for b in beats if b.dialogue])
    classified = len([b for b in beats if b.beat_type != BeatType.UNKNOWN])
    logger.info(
        "Trailer analysis complete: %d beats, %d with dialogue, %d classified.",
        total,
        with_dialogue,
        classified,
    )
    return beats
|
||||
Reference in New Issue
Block a user