Initial project import

This commit is contained in:
Melbar
2026-05-02 09:07:41 +02:00
commit 8e1bcf142f
38 changed files with 7928 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
# src.core package
+387
View File
@@ -0,0 +1,387 @@
"""
src/core/config.py — Configuration loader for AI Trailer Generator v2
Loads config.toml and exposes typed, nested dataclasses.
All CV thresholds, paths, and model settings are sourced exclusively here.
API keys are NEVER stored in config.toml; they are loaded from .env.
"""
from __future__ import annotations
import os
import tomllib
try:
from dotenv import load_dotenv as _load_dotenv
_HAS_DOTENV = True
except ImportError: # dotenv optional — falls back to existing env vars
_HAS_DOTENV = False
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal
# ---------------------------------------------------------------------------
# Leaf sections
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class PathsConfig:
    """Filesystem locations taken from the ``[paths]`` table of config.toml.

    ``load_config`` resolves relative entries against the directory that
    contains the config file and leaves absolute entries unchanged, so every
    field holds a ``Path`` by the time this object is built.
    """

    source_movie: Path
    reference_trailer: Path
    output_dir: Path
    cache_dir: Path
    proxy_dir: Path
@dataclass(frozen=True)
class VideoConfig:
    """Values of the ``[video]`` table of config.toml.

    All three keys are required; ``load_config`` coerces ``extract_fps`` to
    ``float`` and the two proxy dimensions to ``int``.
    """

    extract_fps: float
    proxy_width: int
    proxy_height: int
@dataclass(frozen=True)
class VibeCheckConfig:
    """Values of the ``[cv.vibe_check]`` table of config.toml.

    Every key is required (``load_config`` indexes them without defaults) and
    is coerced to the annotated ``int`` / ``float`` type.
    """

    top_k_candidates: int
    # NOTE(review): presumably an OpenCV cv2.HISTCMP_* constant — confirm against the consumer.
    hist_compare_method: int
    hist_bins_hue: int
    hist_bins_saturation: int
    phash_max_distance: int
    crop_top_fraction: float
    crop_bottom_fraction: float
@dataclass(frozen=True)
class DeepScanConfig:
    """Values of the ``[cv.deep_scan]`` table of config.toml.

    Only ``coarse_step_seconds``, ``match_threshold``, ``match_method`` and
    ``refine_step_seconds`` must be present in the TOML file; every other
    field receives a built-in default inside ``load_config`` when its key is
    absent. ``coarse_candidate_threshold`` defaults to ``match_threshold``.
    """

    coarse_step_seconds: float
    match_threshold: float
    provisional_match_threshold: float
    coarse_candidate_threshold: float
    sequence_score_weight: float
    span_score_weight: float
    coarse_score_weight: float
    duration_score_weight: float
    duration_tie_break_score_delta: float
    min_duration_coverage: float
    # Stored as an immutable tuple; load_config converts the TOML array element-wise to float.
    continuity_seed_offsets_s: tuple[float, ...]
    scene_seed_top_k: int
    scene_seed_points_per_scene: int
    content_rerank_candidate_count: int
    skip_coarse_scan_with_weighted_seeds: bool
    max_refine_candidates: int
    # NOTE(review): presumably a cv2.TM_* template-matching constant — confirm against the consumer.
    match_method: int
    refine_window_seconds: float
    refine_step_seconds: float
    content_align_window_seconds: float
    content_align_sample_step_s: float
    content_validation_weight: float
    provisional_content_threshold: float
    start_tie_break_score_delta: float
    start_preroll_frames: int
    sequence_candidate_count: int
    sequence_min_distance_s: float
    span_sample_step_s: float
    trim_tail_frames: int
    scene_boundary_epsilon_s: float
    scoreable_luma_mean_min: float
    scoreable_luma_p90_min: float
    scoreable_contrast_min: float
@dataclass(frozen=True)
class CVConfig:
    """Container for the two sub-tables of ``[cv]``: vibe check and deep scan."""

    vibe_check: VibeCheckConfig
    deep_scan: DeepScanConfig
@dataclass(frozen=True)
class SceneDetectionConfig:
    """Values of the ``[scene_detection]`` table; both keys are required floats."""

    content_threshold: float
    min_scene_duration_s: float
@dataclass(frozen=True)
class WhisperConfig:
    """Values of the ``[whisper]`` table of config.toml.

    ``load_config`` passes these values through as read from TOML, without
    conversion; the ``Literal`` annotations are not enforced at runtime.
    """

    model: str
    language: str
    device: Literal["cuda", "cpu"]
    compute_type: Literal["float16", "int8", "float32"]
@dataclass(frozen=True)
class LLMConfig:
    """Values of the ``[llm]`` table of config.toml plus the resolved API key.

    ``provider`` is passed through from TOML unvalidated; the ``Literal``
    annotation is not enforced at runtime.
    """

    provider: Literal["ollama", "openai", "openrouter"]
    base_url: str
    model: str
    timeout_seconds: int
    temperature: float
    max_tokens: int
    # Loaded from .env — NEVER committed to version control.
    # repr=False keeps the secret out of repr()/str() output, so logging or
    # printing the config object cannot leak the key. Equality and hashing
    # still take the key into account.
    api_key: str = field(default="", repr=False)
@dataclass(frozen=True)
class VisionConfig:
    """Values of the optional ``[vision]`` table plus the resolved API key.

    The whole table may be absent from config.toml; ``load_config`` then
    fills every field from built-in defaults, with ``base_url``, ``model``
    and ``timeout_seconds`` falling back to the corresponding LLM settings.
    """

    enabled: bool
    provider: Literal["openai", "openrouter"]
    base_url: str
    model: str
    timeout_seconds: int
    temperature: float
    max_tokens: int
    scene_candidate_top_k: int
    max_new_descriptions_per_run: int
    max_seed_scenes: int
    seed_points_per_scene: int
    seed_score: float
    max_refine_candidates: int
    local_scan_step_s: float
    local_scan_max_points_per_scene: int
    local_scan_top_candidates: int
    local_scan_tie_break_score_delta: float
    multi_shot_cut_corr_threshold: float
    multi_shot_boundary_tolerance_s: float
    fullscan_fallback: bool
    content_threshold: float
    similarity_threshold: float
    # Read from the environment, never from config.toml. repr=False keeps the
    # secret out of repr()/str() output so the config can be logged safely.
    api_key: str = field(default="", repr=False)
@dataclass(frozen=True)
class ExportConfig:
    """Values of the ``[export]`` table of config.toml.

    ``load_config`` coerces ``fcpxml_version`` to ``str`` and
    ``edl_frame_rate`` to ``float``; ``output_format`` is passed through
    as read, without validation against the ``Literal`` choices.
    """

    fcpxml_version: str
    edl_frame_rate: float
    output_format: Literal["fcpxml", "edl", "both"]
# ---------------------------------------------------------------------------
# Root config — single object passed through the entire application
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class AppConfig:
    """Root configuration object returned by ``load_config``.

    ``project_name``, ``version`` and ``log_level`` come from the
    ``[project]`` table (keys ``name``, ``version``, ``log_level``); the
    remaining fields hold one nested section object each.
    """

    project_name: str
    version: str
    # Passed through from TOML as-is; the Literal choices are not enforced at runtime.
    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"]
    paths: PathsConfig
    video: VideoConfig
    cv: CVConfig
    scene_detection: SceneDetectionConfig
    whisper: WhisperConfig
    llm: LLMConfig
    vision: VisionConfig
    export: ExportConfig
# ---------------------------------------------------------------------------
# Loader
# ---------------------------------------------------------------------------
# Default file locations. parents[2] is two directories above the directory
# that contains this module, i.e. the project root when the module lives at
# <project_root>/src/core/config.py.
_DEFAULT_CONFIG_PATH = Path(__file__).parents[2] / "config.toml"
_DEFAULT_ENV_PATH = Path(__file__).parents[2] / ".env"
def load_config(
    config_path: Path | str = _DEFAULT_CONFIG_PATH,
    env_path: Path | str = _DEFAULT_ENV_PATH,
) -> AppConfig:
    """
    Parse config.toml and return a fully-typed, immutable AppConfig.

    API keys are read from the .env file (or existing environment variables);
    they are never stored in config.toml.

    Args:
        config_path: Absolute or relative path to the TOML file, given as a
                     ``Path`` or a plain string.
                     Defaults to <project_root>/config.toml.
        env_path:    Path to the .env file. Only consulted when python-dotenv
                     is installed; variables already present in the
                     environment are not overridden.
                     Defaults to <project_root>/.env.

    Returns:
        The populated AppConfig. Relative entries of the ``[paths]`` table are
        resolved against the directory containing the config file.

    Raises:
        FileNotFoundError: If the TOML file does not exist.
        tomllib.TOMLDecodeError: If the file is not valid TOML.
        KeyError: If a required table or key is missing.
        TypeError / ValueError: If a value cannot be converted to its
            numeric target type.

    Note:
        String fields annotated with ``Literal`` (log level, providers,
        whisper device, ...) are passed through without validation.
    """
    # Accept plain strings as well as Path objects; the code below relies on
    # Path methods (.exists, .open, .parent).
    config_path = Path(config_path)

    # Load .env first so os.environ is populated before we read it below.
    if _HAS_DOTENV:
        _load_dotenv(dotenv_path=env_path, override=False)

    if not config_path.exists():
        raise FileNotFoundError(
            f"Config file not found: {config_path}\n"
            "Copy config.toml.example to config.toml and adjust your paths."
        )

    with config_path.open("rb") as fh:
        raw: dict = tomllib.load(fh)

    project = raw["project"]
    paths_raw = raw["paths"]
    video_raw = raw["video"]
    cv_raw = raw["cv"]
    sd_raw = raw["scene_detection"]
    whisper_raw = raw["whisper"]
    llm_raw = raw["llm"]
    vision_raw = raw.get("vision", {})  # the whole [vision] table is optional
    export_raw = raw["export"]

    # Resolve paths relative to the config file's parent directory so the
    # project is relocatable, but keep absolute paths as-is.
    config_dir = config_path.parent

    def _resolve(p: str) -> Path:
        path = Path(p)
        return path if path.is_absolute() else (config_dir / path).resolve()

    paths = PathsConfig(
        source_movie=_resolve(paths_raw["source_movie"]),
        reference_trailer=_resolve(paths_raw["reference_trailer"]),
        output_dir=_resolve(paths_raw["output_dir"]),
        cache_dir=_resolve(paths_raw["cache_dir"]),
        proxy_dir=_resolve(paths_raw["proxy_dir"]),
    )

    video = VideoConfig(
        extract_fps=float(video_raw["extract_fps"]),
        proxy_width=int(video_raw["proxy_width"]),
        proxy_height=int(video_raw["proxy_height"]),
    )

    vibe_raw = cv_raw["vibe_check"]
    vibe_check = VibeCheckConfig(
        top_k_candidates=int(vibe_raw["top_k_candidates"]),
        hist_compare_method=int(vibe_raw["hist_compare_method"]),
        hist_bins_hue=int(vibe_raw["hist_bins_hue"]),
        hist_bins_saturation=int(vibe_raw["hist_bins_saturation"]),
        phash_max_distance=int(vibe_raw["phash_max_distance"]),
        crop_top_fraction=float(vibe_raw["crop_top_fraction"]),
        crop_bottom_fraction=float(vibe_raw["crop_bottom_fraction"]),
    )

    # Only coarse_step_seconds, match_threshold, match_method and
    # refine_step_seconds are mandatory; everything else has a default.
    scan_raw = cv_raw["deep_scan"]
    deep_scan = DeepScanConfig(
        coarse_step_seconds=float(scan_raw["coarse_step_seconds"]),
        match_threshold=float(scan_raw["match_threshold"]),
        provisional_match_threshold=float(scan_raw.get("provisional_match_threshold", 0.45)),
        # Falls back to the main match threshold when not configured.
        coarse_candidate_threshold=float(
            scan_raw.get("coarse_candidate_threshold", scan_raw["match_threshold"])
        ),
        sequence_score_weight=float(scan_raw.get("sequence_score_weight", 0.55)),
        span_score_weight=float(scan_raw.get("span_score_weight", 0.15)),
        coarse_score_weight=float(scan_raw.get("coarse_score_weight", 0.10)),
        duration_score_weight=float(scan_raw.get("duration_score_weight", 0.20)),
        duration_tie_break_score_delta=float(scan_raw.get("duration_tie_break_score_delta", 0.03)),
        min_duration_coverage=float(scan_raw.get("min_duration_coverage", 0.65)),
        continuity_seed_offsets_s=tuple(
            float(v)
            for v in scan_raw.get(
                "continuity_seed_offsets_s",
                [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0],
            )
        ),
        scene_seed_top_k=int(scan_raw.get("scene_seed_top_k", 30)),
        scene_seed_points_per_scene=int(scan_raw.get("scene_seed_points_per_scene", 6)),
        content_rerank_candidate_count=int(scan_raw.get("content_rerank_candidate_count", 100)),
        skip_coarse_scan_with_weighted_seeds=bool(
            scan_raw.get("skip_coarse_scan_with_weighted_seeds", False)
        ),
        max_refine_candidates=int(scan_raw.get("max_refine_candidates", 6)),
        match_method=int(scan_raw["match_method"]),
        refine_window_seconds=float(scan_raw.get("refine_window_seconds", 0.6)),
        refine_step_seconds=float(scan_raw["refine_step_seconds"]),
        content_align_window_seconds=float(scan_raw.get("content_align_window_seconds", 0.48)),
        content_align_sample_step_s=float(scan_raw.get("content_align_sample_step_s", 0.28)),
        content_validation_weight=float(scan_raw.get("content_validation_weight", 0.35)),
        provisional_content_threshold=float(scan_raw.get("provisional_content_threshold", 0.42)),
        start_tie_break_score_delta=float(scan_raw.get("start_tie_break_score_delta", 0.015)),
        start_preroll_frames=int(scan_raw.get("start_preroll_frames", 0)),
        sequence_candidate_count=int(scan_raw.get("sequence_candidate_count", 240)),
        sequence_min_distance_s=float(scan_raw.get("sequence_min_distance_s", 1.0)),
        span_sample_step_s=float(scan_raw.get("span_sample_step_s", 0.08)),
        trim_tail_frames=int(scan_raw.get("trim_tail_frames", 2)),
        scene_boundary_epsilon_s=float(scan_raw.get("scene_boundary_epsilon_s", 0.12)),
        scoreable_luma_mean_min=float(scan_raw.get("scoreable_luma_mean_min", 24.0)),
        scoreable_luma_p90_min=float(scan_raw.get("scoreable_luma_p90_min", 58.0)),
        scoreable_contrast_min=float(scan_raw.get("scoreable_contrast_min", 24.0)),
    )

    scene_detection = SceneDetectionConfig(
        content_threshold=float(sd_raw["content_threshold"]),
        min_scene_duration_s=float(sd_raw["min_scene_duration_s"]),
    )

    whisper = WhisperConfig(
        model=whisper_raw["model"],
        language=whisper_raw["language"],
        device=whisper_raw["device"],
        compute_type=whisper_raw["compute_type"],
    )

    # Resolve the LLM API key from the environment only.
    # Supported env vars (in priority order):
    #   OPENROUTER_API_KEY  → for provider = openrouter
    #   OPENAI_API_KEY      → for provider = openai
    #   LLM_API_KEY         → universal fallback (also used when the
    #                         provider-specific variable is unset or empty)
    llm_provider = llm_raw["provider"]
    if llm_provider == "openrouter":
        provider_key = os.environ.get("OPENROUTER_API_KEY", "")
    elif llm_provider == "openai":
        provider_key = os.environ.get("OPENAI_API_KEY", "")
    else:
        provider_key = ""
    llm_api_key = provider_key or os.environ.get("LLM_API_KEY", "")

    llm = LLMConfig(
        provider=llm_provider,
        base_url=llm_raw["base_url"],
        model=llm_raw["model"],
        timeout_seconds=int(llm_raw["timeout_seconds"]),
        temperature=float(llm_raw["temperature"]),
        max_tokens=int(llm_raw["max_tokens"]),
        api_key=llm_api_key,
    )

    # Vision inherits the LLM provider when that provider is one vision
    # supports; otherwise it defaults to openrouter.
    vision_provider = vision_raw.get(
        "provider",
        llm_provider if llm_provider in ("openai", "openrouter") else "openrouter",
    )
    # Key priority: provider-specific variable, then VISION_API_KEY, then LLM_API_KEY.
    vision_env_name = (
        "OPENROUTER_API_KEY" if vision_provider == "openrouter" else "OPENAI_API_KEY"
    )
    vision_api_key = (
        os.environ.get(vision_env_name, "")
        or os.environ.get("VISION_API_KEY", "")
        or os.environ.get("LLM_API_KEY", "")
    )

    vision = VisionConfig(
        enabled=bool(vision_raw.get("enabled", False)),
        provider=vision_provider,
        # Endpoint, model and timeout fall back to the LLM settings.
        base_url=str(vision_raw.get("base_url", llm.base_url)),
        model=str(vision_raw.get("model", llm.model)),
        timeout_seconds=int(vision_raw.get("timeout_seconds", llm.timeout_seconds)),
        temperature=float(vision_raw.get("temperature", 0.0)),
        max_tokens=int(vision_raw.get("max_tokens", 350)),
        scene_candidate_top_k=int(vision_raw.get("scene_candidate_top_k", 8)),
        max_new_descriptions_per_run=int(vision_raw.get("max_new_descriptions_per_run", 12)),
        max_seed_scenes=int(vision_raw.get("max_seed_scenes", 3)),
        seed_points_per_scene=int(vision_raw.get("seed_points_per_scene", 12)),
        seed_score=float(vision_raw.get("seed_score", 0.88)),
        max_refine_candidates=int(vision_raw.get("max_refine_candidates", 6)),
        local_scan_step_s=float(vision_raw.get("local_scan_step_s", 0.12)),
        local_scan_max_points_per_scene=int(vision_raw.get("local_scan_max_points_per_scene", 180)),
        local_scan_top_candidates=int(vision_raw.get("local_scan_top_candidates", 18)),
        local_scan_tie_break_score_delta=float(
            vision_raw.get("local_scan_tie_break_score_delta", 0.08)
        ),
        multi_shot_cut_corr_threshold=float(vision_raw.get("multi_shot_cut_corr_threshold", 0.20)),
        multi_shot_boundary_tolerance_s=float(
            vision_raw.get("multi_shot_boundary_tolerance_s", 0.20)
        ),
        fullscan_fallback=bool(vision_raw.get("fullscan_fallback", False)),
        content_threshold=float(vision_raw.get("content_threshold", 0.22)),
        similarity_threshold=float(vision_raw.get("similarity_threshold", 0.18)),
        api_key=vision_api_key,
    )

    export = ExportConfig(
        fcpxml_version=str(export_raw["fcpxml_version"]),
        edl_frame_rate=float(export_raw["edl_frame_rate"]),
        output_format=export_raw["output_format"],
    )

    return AppConfig(
        project_name=project["name"],
        version=project["version"],
        log_level=project["log_level"],
        paths=paths,
        video=video,
        cv=CVConfig(vibe_check=vibe_check, deep_scan=deep_scan),
        scene_detection=scene_detection,
        whisper=whisper,
        llm=llm,
        vision=vision,
        export=export,
    )
+287
View File
@@ -0,0 +1,287 @@
"""
src/core/models.py — Canonical data models for AI Trailer Generator v2
Rules:
- Every model is a frozen dataclass (immutable after creation).
- All fields are strictly typed; no bare dicts or untyped lists.
- Seconds are always float; frame numbers are always int.
- Confidence scores live in [0.0, 1.0].
"""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum, auto
from pathlib import Path
from typing import Optional
# ===========================================================================
# Enumerations
# ===========================================================================
class MatchMethod(Enum):
    """CV template matching method (mirrors cv2.TM_* constants).

    Member values are the plain integers 0-5, in declaration order.
    """

    TM_SQDIFF = 0
    TM_SQDIFF_NORMED = 1
    TM_CCORR = 2
    TM_CCORR_NORMED = 3
    TM_CCOEFF = 4
    TM_CCOEFF_NORMED = 5
class BeatType(Enum):
    """Narrative role of a trailer beat (for dramaturgy / LLM use only).

    Values are assigned by ``auto()``, so they depend on declaration order.
    """

    HOOK = auto()        # Opening attention grabber
    SETUP = auto()       # World / character introduction
    CONFLICT = auto()    # Inciting incident / rising tension
    CLIMAX = auto()      # Peak action / emotion
    RESOLUTION = auto()  # Cool-down / tagline
    UNKNOWN = auto()     # Default used by TrailerBeat
class ExportFormat(Enum):
    """Output format selector; member values are lowercase strings."""

    FCPXML = "fcpxml"
    EDL = "edl"
    BOTH = "both"
# ===========================================================================
# Phase 0 — Source-movie scene index
# ===========================================================================
@dataclass(frozen=True)
class DialogueLine:
    """One transcribed line of dialogue with its time span in seconds."""

    start_s: float                 # onset in seconds
    end_s: float                   # offset in seconds
    text: str                      # verbatim transcript
    speaker: Optional[str] = None  # diarisation label if available

    @property
    def duration_s(self) -> float:
        """Length of the line in seconds (offset minus onset)."""
        onset, offset = self.start_s, self.end_s
        return offset - onset
@dataclass(frozen=True)
class Scene:
    """
    One detected scene in the source movie.

    Produced by PySceneDetect; enriched by Whisper dialogue and
    (optionally) perceptual hashes during the Vibe Check phase.
    """

    scene_id: int       # zero-based index in source movie
    source_path: Path   # absolute path to the source video file
    start_s: float      # scene start in seconds
    end_s: float        # scene end in seconds
    start_frame: int    # first frame number
    end_frame: int      # last frame number

    # Populated after Vibe Check fingerprinting
    # NOTE(review): the histograms are pickled arrays — only unpickle data
    # that this application wrote itself.
    luma_hist: Optional[bytes] = None  # serialised np.ndarray (pickle)
    sat_hist: Optional[bytes] = None
    phash: Optional[str] = None        # 64-bit hex string

    # Populated after Whisper pass
    dialogue: tuple[DialogueLine, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Scene length in seconds."""
        return self.end_s - self.start_s

    @property
    def midpoint_s(self) -> float:
        """Timestamp halfway between scene start and end, in seconds."""
        return self.start_s + self.duration_s / 2.0

    def __repr__(self) -> str:
        # Compact form for logs; the arrow separates start and end so the
        # two timestamps do not run together.
        return (
            f"Scene(id={self.scene_id}, "
            f"{self.start_s:.2f}s→{self.end_s:.2f}s, "
            f"dur={self.duration_s:.2f}s)"
        )
# ===========================================================================
# Phase 1 — Reference-trailer beat
# ===========================================================================
@dataclass(frozen=True)
class TrailerBeat:
    """
    One cut / segment in the reference trailer.

    The 'beat' is the atomic unit of a trailer: it maps exactly to one
    clip that will later be sourced from the original movie.
    """

    beat_id: int
    trailer_path: Path
    start_s: float
    end_s: float
    start_frame: int
    end_frame: int
    beat_type: BeatType = BeatType.UNKNOWN  # set by LLM dramaturgy pass

    # Visual fingerprints of the *middle* frame (populated by CV pipeline)
    luma_hist: Optional[bytes] = None
    sat_hist: Optional[bytes] = None
    phash: Optional[str] = None

    # Dialogue extracted from this beat
    dialogue: tuple[DialogueLine, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Beat length in seconds."""
        return self.end_s - self.start_s

    @property
    def midpoint_s(self) -> float:
        """Timestamp halfway between beat start and end, in seconds."""
        return self.start_s + self.duration_s / 2.0

    def __repr__(self) -> str:
        # Compact form for logs; the arrow separates start and end so the
        # two timestamps do not run together.
        return (
            f"TrailerBeat(id={self.beat_id}, "
            f"{self.beat_type.name}, "
            f"{self.start_s:.2f}s→{self.end_s:.2f}s)"
        )
# ===========================================================================
# Phase 2 — CV match result
# ===========================================================================
@dataclass(frozen=True)
class VibeHit:
    """
    Intermediate result from Phase 1 (Vibe Check — histogram/pHash).

    Represents a *candidate* scene that passed the coarse filter.
    Not yet a confirmed match; forwarded to Deep Scan.
    """

    beat_id: int            # trailer beat this candidate belongs to
    scene_id: int           # candidate source scene
    hist_score: float       # histogram similarity [0.0, 1.0] (CORREL method)
    phash_distance: int     # Hamming distance [0, 64]; lower = more similar
    combined_score: float   # weighted aggregate used for ranking
@dataclass(frozen=True)
class MatchSegment:
    """
    One source-backed visual island inside a trailer beat.

    Some trailer beats contain multiple shots separated by fades/title frames.
    A single continuous source in/out cannot represent those beats accurately.
    """

    # NOTE(review): presumably the offset is measured from the start of the
    # owning trailer beat — confirm against the code that builds segments.
    trailer_offset_s: float
    duration_s: float
    scene_id: int
    in_point_s: float
    out_point_s: float
    match_score: float
    is_confirmed: bool = True
@dataclass(frozen=True)
class MatchResult:
    """
    Outcome of Phase 2 (Deep Scan — template matching) for one trailer beat.

    Holds the best frame-accurate hit found inside the source movie,
    together with its quality score and provenance.
    """

    beat_id: int        # which trailer beat was matched
    scene_id: int       # which source scene contains the match
    source_path: Path   # absolute path to source video

    # Frame-accurate in-point / out-point in the SOURCE movie
    in_point_s: float       # matched frame onset in source seconds
    out_point_s: float      # computed out-point (in_point + beat duration)
    in_point_frame: int     # matched frame number in source movie

    # Match quality
    match_score: float      # cv2.matchTemplate peak value [0.0, 1.0]
    # (x, y) pixel location of the best match within the source frame
    match_location: tuple[int, int] = field(default_factory=lambda: (0, 0))

    # Provenance
    vibe_hit: Optional[VibeHit] = None  # the candidate that led here
    is_confirmed: bool = True
    segments: tuple[MatchSegment, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Length of the matched source span in seconds."""
        begin, finish = self.in_point_s, self.out_point_s
        return finish - begin

    def __repr__(self) -> str:
        # Assemble the three parts separately, then join them.
        mapping = f"beat={self.beat_id} → scene={self.scene_id}"
        timing = f"in={self.in_point_s:.3f}s"
        quality = f"score={self.match_score:.3f}"
        return f"MatchResult({mapping}, {timing}, {quality})"
# ===========================================================================
# Phase 3 — Edit timeline (pre-export)
# ===========================================================================
@dataclass(frozen=True)
class EditClip:
    """
    One clip on the final edit timeline, ready for FCPXML / EDL export.

    Combines beat dramaturgy + the CV-confirmed source in/out points.
    """

    clip_index: int  # position on the timeline (0-based)
    beat: TrailerBeat
    match: MatchResult

    # Timeline position (in the OUTPUT trailer)
    timeline_start_s: float
    timeline_end_s: float
    # Explicit source length; None means "use the timeline duration"
    source_duration_s: float | None = None
    trailer_tail_s: float = 0.0

    # Optional audio override (e.g. VO or music)
    audio_path: Optional[Path] = None
    audio_offset_s: float = 0.0

    @property
    def timeline_duration_s(self) -> float:
        """Length of the clip on the output timeline, in seconds."""
        return self.timeline_end_s - self.timeline_start_s

    @property
    def source_timeline_duration_s(self) -> float:
        """Source length to use for this clip, in seconds.

        Returns ``source_duration_s`` clamped to be non-negative when it is
        set, otherwise the timeline duration.
        """
        if self.source_duration_s is not None:
            return max(0.0, self.source_duration_s)
        return self.timeline_duration_s

    def __repr__(self) -> str:
        # Compact form for logs; the arrow separates timeline start and end
        # so the two timestamps do not run together.
        return (
            f"EditClip(#{self.clip_index}, "
            f"tl={self.timeline_start_s:.2f}s→{self.timeline_end_s:.2f}s, "
            f"src={self.match.in_point_s:.3f}s)"
        )
@dataclass(frozen=True)
class EditTimeline:
    """
    Ordered collection of EditClips making up the finished trailer.

    This is the object handed to the export layer (FCPXML / EDL writer).
    """

    title: str
    frame_rate: float               # e.g. 23.976
    clips: tuple[EditClip, ...]     # ordered by clip_index

    @property
    def total_duration_s(self) -> float:
        """Latest ``timeline_end_s`` among all clips; 0.0 for an empty timeline."""
        return max((clip.timeline_end_s for clip in self.clips), default=0.0)

    @property
    def clip_count(self) -> int:
        """Number of clips on the timeline."""
        return len(self.clips)