# aitrailer/config.toml

# =============================================================================
# AI Trailer Generator v2 — Central Configuration
# =============================================================================
# All tunable parameters, thresholds, and file paths are defined here.
# NO hardcoded values are allowed in the Python source code.
# =============================================================================

[project]
name = "AI Trailer Generator v2"
version = "2.0.0"
# Accepted values: DEBUG | INFO | WARNING | ERROR
log_level = "INFO"

# -----------------------------------------------------------------------------
# [paths] — External video sources (read-only access)
# -----------------------------------------------------------------------------
[paths]
# External video sources (absolute paths; read-only access).
source_movie = "B:/Proxy/BehindTheRedDoor_FTR_1080P_2398_Fixed.mp4"
reference_trailer = "F:/Encodings/BehindTheRedDoor_Trailer_REFERENCE.mp4"

# Output destinations (inside the project sandbox).
output_dir = "output"
cache_dir = ".cache"
proxy_dir = "proxy"

# -----------------------------------------------------------------------------
# [video] — Decode / proxy settings
# -----------------------------------------------------------------------------
[video]
# Frame rate (fps) for internal frame extraction; 0 means "use the source FPS".
extract_fps = 1.0

# Proxy resolution used for template matching (width x height).
proxy_width = 640
proxy_height = 360

# -----------------------------------------------------------------------------
# [cv] — Computer Vision engine parameters
# Phase 1 — "Vibe Check" (histogram / perceptual hash scene-level filter)
# Phase 2 — "Deep Scan"  (template matching frame-level precision)
# -----------------------------------------------------------------------------
[cv]

[cv.vibe_check]
# How many of the top candidate scenes are forwarded to Deep Scan.
top_k_candidates = 100

# Histogram comparison method, given as its integer code:
#   0 = CORREL, 1 = CHISQR, 2 = INTERSECT, 3 = BHATTACHARYYA
hist_compare_method = 0

# Histogram bin counts per channel (hue, saturation).
hist_bins_hue = 50
hist_bins_saturation = 60

# pHash similarity threshold (range 0–64; lower = stricter).
# A value of 12 suits near-duplicate detection, but cross-video matching
# (trailer vs source movie with different grading/compression) needs 25–35.
# Start at 32 and tighten if you get false positives.
phash_max_distance = 32

# Text-safe crop: fractions of the frame height to EXCLUDE.
# Top band (e.g. logos, title cards).
crop_top_fraction = 0.15
# Bottom band (e.g. letterbox, subtitles).
crop_bottom_fraction = 0.30

[cv.deep_scan]
# Phase 2 ("Deep Scan"): frame-level template matching.
# NOTE(review): durations in this table use both `_seconds` and `_s` key
# suffixes; both presumably mean seconds — confirm against the consuming code.

# Step size in SECONDS between sampled frames during the coarse scan pass
coarse_step_seconds   = 0.5

# Minimum template match score (0.0–1.0) to accept a candidate as a hit
match_threshold       = 0.65

# Store/report lower-confidence automatic candidates for visual review instead
# of dropping them as "NO MATCH". Confirmed exports can still use match_threshold.
# As configured this sits below both coarse_candidate_threshold and
# match_threshold.
provisional_match_threshold = 0.45

# Lower gate for entering temporal multi-frame refinement. The final decision
# still uses sequence/span scoring; this only avoids rejecting real matches
# because one midpoint frame is weak.
coarse_candidate_threshold = 0.50

# Candidate ranking weights. Duration coverage matters when the same visual
# shot appears multiple times: prefer the occurrence that can cover the beat.
# The four *_score_weight values below currently sum to 1.00.
# NOTE(review): presumably they are meant to stay normalised — confirm in the
# ranking code before changing one of them in isolation.
sequence_score_weight = 0.55
span_score_weight     = 0.15
coarse_score_weight   = 0.10
duration_score_weight = 0.20
# NOTE(review): the keys from here to skip_coarse_scan_with_weighted_seeds are
# not described in this file; confirm their exact meaning in the consuming code.
# continuity_seed_offsets_s is presumably a list of offsets in seconds (per the
# `_s` suffix).
duration_tie_break_score_delta = 0.03
min_duration_coverage = 0.65
continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0]
scene_seed_top_k = 30
scene_seed_points_per_scene = 6
content_rerank_candidate_count = 100
skip_coarse_scan_with_weighted_seeds = false

# cv2.matchTemplate method, given as its integer code:
# TM_CCOEFF_NORMED=5 (recommended), TM_CCORR_NORMED=3
match_method          = 5

# If a coarse hit is found, refine by scanning ± this many seconds
refine_window_seconds = 0.6
refine_step_seconds   = 0.04  # ≈ 1 frame at 25 fps (a 23.976 fps frame is ≈ 0.0417 s)
# NOTE(review): the content_* / provisional_content_threshold keys below are
# not described in this file; confirm their meaning in the consuming code.
content_align_window_seconds = 0.48
content_align_sample_step_s  = 0.28
content_validation_weight    = 0.35
provisional_content_threshold = 0.42

# When several adjacent frame offsets score almost the same, prefer the earlier
# one. This avoids matches that are visually correct but start a few frames late.
start_tie_break_score_delta = 0.015
start_preroll_frames        = 0

# Automatic temporal verification after a coarse image hit.
# More candidates reduce false positives from visually similar shots.
sequence_candidate_count = 240
sequence_min_distance_s  = 1.0
max_refine_candidates    = 6

# Match-span detection: trim when the source starts drifting into a different shot.
span_sample_step_s       = 0.08
trim_tail_frames         = 4

# If a refined in-point lands this close to a detected scene end, treat it as
# the next scene. Scene detectors often place cuts a frame or two around the
# visible boundary.
scene_boundary_epsilon_s = 0.12
# NOTE(review): the scoreable_* minimums are not covered by the comment above.
# Presumably they are luma/contrast floors a frame must meet to be scored —
# confirm against the consuming code.
scoreable_luma_mean_min = 24.0
scoreable_luma_p90_min  = 58.0
scoreable_contrast_min  = 24.0

# -----------------------------------------------------------------------------
# [scene_detection] — PySceneDetect parameters (used to segment source movie)
# -----------------------------------------------------------------------------
[scene_detection]
# ContentDetector threshold; lower values make detection more sensitive.
content_threshold = 27.0
# Minimum scene duration, in seconds.
min_scene_duration_s = 1.5

# -----------------------------------------------------------------------------
# [whisper] — Dialogue / audio analysis
# -----------------------------------------------------------------------------
[whisper]
model = "large-v3"
language = "ar"
# Options: cuda | cpu
device = "cuda"
# Options: float16 | int8 | float32
compute_type = "float16"

# -----------------------------------------------------------------------------
# [llm] — Used ONLY for thematic segmentation / dramaturgy
# -----------------------------------------------------------------------------
[llm]
# Endpoint and model selection.
provider = "openrouter"
base_url = "https://openrouter.ai/api/v1"
model = "google/gemma-4-31b-it"

# Timeout and generation settings.
timeout_seconds = 120
temperature = 0.3
max_tokens = 4096

# -----------------------------------------------------------------------------
# [vision] — Optional cached visual descriptions for ambiguous matching
# -----------------------------------------------------------------------------
[vision]
# Disabled by default to avoid surprise API cost. Enable when you want the
# matcher to ask a vision-capable model for cached 3-frame scene descriptions.
enabled            = false
# provider, base_url and model currently hold the same values as in [llm].
# TOML has no cross-references, so the values are duplicated; update both
# tables if the two are meant to stay in sync.
provider           = "openrouter"
base_url           = "https://openrouter.ai/api/v1"
model              = "google/gemma-4-31b-it"
timeout_seconds    = 90
temperature        = 0.0
max_tokens         = 350

# Cost controls: per beat, only the top scene-level candidates are described,
# and cached descriptions in .cache/vision_descriptions.json are reused.
scene_candidate_top_k       = 8
max_new_descriptions_per_run = 12
# NOTE(review): the remaining keys are not described in this file; confirm
# their exact meaning in the consuming code. Keys ending in `_s` are presumably
# durations in seconds.
max_seed_scenes             = 3
seed_points_per_scene       = 12
seed_score                  = 0.88
# Same key name as [cv.deep_scan].max_refine_candidates, but a separate setting
# in a separate table.
max_refine_candidates       = 6
local_scan_step_s           = 0.12
local_scan_max_points_per_scene = 180
local_scan_top_candidates   = 18
local_scan_tie_break_score_delta = 0.08
multi_shot_cut_corr_threshold = 0.20
multi_shot_boundary_tolerance_s = 0.20
fullscan_fallback           = false
# Not to be confused with [scene_detection].content_threshold (27.0), which is
# a separate key on a different scale.
content_threshold           = 0.22
similarity_threshold        = 0.18

# -----------------------------------------------------------------------------
# [export] — FCPXML / EDL export settings
# -----------------------------------------------------------------------------
[export]
# Kept as a string: a bare 1.10 would be parsed as the float 1.1.
fcpxml_version = "1.10"
# Frame rate (fps) used in EDL timecode generation.
edl_frame_rate = 23.976
# Options: fcpxml | edl | both
output_format = "fcpxml"