commit 8e1bcf142fa042815b5b4ec412c00e3ebf89ff9d
Author: Melbar <tangshode@gmail.com>
Date:   Sat May 2 09:07:41 2026 +0200

    Initial project import

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..626846b
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,15 @@
+# =============================================================================
+# AI Trailer Generator v2 — Environment Variables
+# =============================================================================
+# Copy this file to .env and fill in your actual keys.
+# .env is listed in .gitignore and will NEVER be committed.
+# =============================================================================
+
+# OpenRouter API key (required when [llm] provider = "openrouter")
+OPENROUTER_API_KEY=sk-or-v1-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+
+# OpenAI API key (required when [llm] provider = "openai")
+# OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+
+# Universal fallback (used if provider-specific key is not set)
+# LLM_API_KEY=
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d3e1e3e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,44 @@
+# ---------------------------------------------------------------------------
+# AI Trailer Generator v2 — .gitignore
+# ---------------------------------------------------------------------------
+
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+*.egg-info/
+dist/
+build/
+*.whl
+.venv/
+venv/
+.mypy_cache/
+.ruff_cache/
+.pytest_cache/
+
+# Project-generated artefacts (potentially huge)
+.cache/
+output/
+proxy/
+*.mp4
+*.mov
+*.mxf
+*.wav
+*.mp3
+*.jpg
+*.jpeg
+*.png
+
+# IDE
+.vscode/
+.idea/
+*.swp
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Secrets / local overrides
+.env
+config.local.toml
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5323691
--- /dev/null
+++ b/README.md
@@ -0,0 +1,384 @@
+# AI Trailer Generator v2
+
+**Frame-accurate trailer reconstruction via pure Computer Vision**
+
+> Man gibt einen Reference-Trailer und den dazugehörigen Quellfilm hinein und bekommt eine fertige FCPXML/EDL heraus, die den Trailer framegenau aus dem Quellfilm nachbaut.
+
+---
+
+## Das Kernprinzip
+
+Standardmäßig kein LLM für visuelles Matching. Optional kann ein Vision-Layer
+gecachte 3-Frame-Beschreibungen als zusätzliche Suchanker liefern; der finale
+Match bleibt aber CV-verifiziert.
+
+| Phase | Was passiert | Technologie |
+|-------|-------------|-------------|
+| **0 — Prep** | Reference Trailer analysieren & Beats extrahieren | PySceneDetect + OpenCV |
+| **1 — Global Scan**| Gesamten Quellfilm via FFmpeg-Stream (2 FPS) gegen alle Beats scannen | FFmpeg Pipe + Luma-Histogramm |
+| **1b — Optional Vision Seeds** | Unsichere Top-K Szenen mit 3-Frame-Beschreibungen cachen | OpenAI-kompatibles Vision-LLM |
+| **2 — Refine** | Beste Treffer auf Frame-Ebene präzisieren | OpenCV `matchTemplate` |
+| **3 — Dramaturgie** | Narrative BeatType-Klassifikation aus Dialog-Text | OpenRouter LLM |
+| **4 — Export** | Timeline → FCPXML 1.10 oder CMX 3600 EDL | xml.etree + eigener Timecode-Layer |
+
+**Text-Safe Crop:** Obere 15% und untere 30% des Frames werden vor jedem Vergleich ausgeblendet, um Title Cards, Logos und Letterbox zu ignorieren.
+
+---
+
+## Voraussetzungen
+
+- Python **3.11+**
+- [ffmpeg](https://ffmpeg.org/download.html) im PATH (für den globalen FFmpeg-Scan, die Report-Preview-Clips und die Whisper-Audio-Extraktion)
+- CUDA-fähige GPU empfohlen (für faster-whisper; CPU funktioniert auch)
+
+---
+
+## Setup
+
+### 1. Virtual Environment erstellen & aktivieren
+
+```powershell
+# Im Projektordner
+python -m venv .venv
+.\.venv\Scripts\Activate.ps1
+
+# Falls ExecutionPolicy blockiert:
+# Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
+```
+
+### 2. Abhängigkeiten installieren
+
+```powershell
+pip install -r requirements.txt
+```
+
+### 3. API-Key konfigurieren
+
+```powershell
+# .env aus dem Template kopieren
+Copy-Item .env.example .env
+
+# Dann .env öffnen und den echten Key eintragen:
+# OPENROUTER_API_KEY=sk-or-v1-...
+```
+
+### 4. Videodateien eintragen
+
+`config.toml` öffnen und die Pfade anpassen:
+
+```toml
+[paths]
+source_movie      = "B:/Proxy/DeinFilm_FTR.mp4"
+reference_trailer = "F:/Encodings/DeinFilm_Trailer.mp4"
+```
+
+---
+
+## Verwendung
+
+```powershell
+# Vollständige Pipeline (analyze → match → report → export)
+python cli.py run
+
+# Ohne Whisper-Transkription (schneller)
+python cli.py run --no-audio
+
+# Ohne LLM-Klassifikation
+python cli.py run --no-audio --no-llm
+
+# Schrittweise
+python cli.py analyze            # Reference Trailer → Beats erkennen
+python cli.py match              # Globaler FFmpeg Scan (Szenen-unabhängig)
+python cli.py report             # HTML Report mit Video-Vergleich bauen
+python cli.py export --format both   # FCPXML + EDL ausgeben
+
+# Gezielt nur einen Beat bearbeiten (empfohlen für erste Iterationen)
+python cli.py match --beat 5
+python cli.py match --beat 5 --vision   # optionale gecachte Vision-Seeds
+python cli.py report --beat 5
+python cli.py export --beat 5 --format both
+
+# Fehlerhafte Matches korrigieren
+python cli.py rematch --beat 5 --threshold 0.50  # Schwelle anpassen (Globaler Scan wird für diesen Beat wiederholt)
+python cli.py rematch --beat 5 --refine          # Cached Match per lokalem Bildinhalt-Offset nachschärfen
+```
+
+Der HTML-Report regeneriert seine Preview-Clips bei jedem Lauf mit genauer
+FFmpeg-Nachsuche und synchronisiert die beiden Video-Player pro Beat. Dadurch
+ist der Report zur Frame-Prüfung geeignet und zeigt keine alten gecachten
+Preview-Clips.
+Source-Previews bekommen bei Trailer-only-Tails denselben schwarzen Tail wie der
+Export, damit der Browser nicht einen zu kurzen Source-Clip gegen den längeren
+Referenzbeat weiterspult oder loopt.
+Zur Synchronprüfung rendert der Report ein einzelnes Frame-Locked-Compare-Video
+mit Referenz und Source in demselben MP4-Stream. Dieses Compare-Video ist
+maßgeblich, weil zwei getrennte Browser-Videoelemente nie zuverlässig
+framegenau synchron bleiben.
+
+Wenn ein Trailer-Beat am Ende eine Blende, Schwarzfläche oder Textkarte enthält,
+die im Source-Film nicht als normaler Shot vorhanden ist, endet der Source-Match
+am letzten stabil passenden Frame. Exportierte Timelines behalten trotzdem die
+volle Beat-Länge und fügen danach automatisch einen schwarzen Trailer-Tail mit
+Marker für Fade/Dissolve ein.
+
+Gezielte Ein-Beat-Matches nutzen zusätzlich vorhandene automatische Nachbarbeats
+aus dem Cache als zeitliche Suchanker. Das hilft bei aufeinanderfolgenden Shots,
+ohne manuelle Szenen oder Timecodes zu kuratieren.
+Bei `match --beat N` wird ein alter Cache-Treffer für genau diesen Beat entfernt
+und nur ein neu gefundener automatischer Treffer wieder eingetragen. Ein
+fehlgeschlagener neuer Lauf kann dadurch keinen alten falschen Report-Treffer
+stehen lassen.
+
+Der globale Bildvergleich arbeitet auf kontrast-normalisierten Luma- und
+Kantenfeatures statt auf rohen Farb-Pixeln. Dadurch bleiben Schwarzweiß- oder
+anders gegradete Trailerbilder mit dem Source-Material vergleichbar, während
+unähnliche Farbshots schlechter ranken.
+Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen
+groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets
+verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller
+als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls.
+Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese
+Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst
+den Inpoint bestimmt.
+`rematch --refine` nutzt denselben lokalen FFmpeg/Pillow-Aligner und schreibt
+den korrigierten Inpoint direkt zurück in `.cache/match_results.json`.
+
+Zusätzlich werden aus den besten szenenweiten Luma/Histogramm-Kandidaten
+mehrere Inpoint-Suchanker erzeugt. Diese Scene-Seeds verwenden keine harte
+pHash-Sperre, weil pHash bei stark anders gegradeten Trailerbildern echte
+Matches zu früh ausschließen kann.
+Optional kann `python cli.py match --beat N --vision` einen Vision-Layer
+zuschalten. Dann werden pro Trailer-Beat und pro wenigen Scene-Level-Kandidaten
+je drei Frames (Anfang, Mitte, Ende) von einem visionfähigen OpenAI-kompatiblen
+Modell beschrieben. Die Beschreibungen liegen in
+`.cache/vision_descriptions.json` und werden wiederverwendet. Vision erzeugt
+nur zusätzliche Suchanker; der eigentliche Match muss weiterhin durch CV,
+Content-Reranking, Timing und Duration-Coverage bestätigt werden.
+Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
+FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
+Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
+Vision-Szenen echte Treffer nicht verdrängen. Für schnelle Experimente kann
+`skip_coarse_scan_with_weighted_seeds = true` gesetzt werden.
+Gewichtete Vision-Seeds werden nicht zuerst durch den alten Midpoint-Template
+Refine verschoben; sie gehen direkt in die lokale Content-Alignment-Prüfung.
+Das schützt wiederholte Gesprächseinstellungen, bei denen ähnliche Momente
+mehrfach in derselben Szene vorkommen.
+Innerhalb der automatisch von Vision vorgeschlagenen Szenen läuft zusätzlich
+eine dichte lokale Bildsequenzsuche. Sie misst den Phasenversatz in kleinen
+Zeitschritten direkt am Bildinhalt und bevorzugt Kandidaten mit genügend
+Restdauer in derselben Source-Szene. Das ist kein manueller Override: Vision
+grenzt nur Suchbereiche ein, die Auswahl bleibt Content-, Timing- und
+Coverage-getrieben.
+Nach einem dichten Vision-Treffer darf der spätere lokale Aligner nur noch im
+Bereich dieses Scan-Schritts nachjustieren. So kann ein korrekt gefundener
+Bewegungsmoment nicht wieder um viele Frames in eine ähnlich aussehende Phase
+derselben Szene verschoben werden.
+Wenn mehrere Vision-Kandidaten in derselben Source-Szene ähnlich gut scoren
+und die Beat-Dauer abdecken, bevorzugt der Matcher die frühere Phase. Das
+verhindert, dass ein späterer, minimal stärkerer Standbildtreffer die
+Bewegungsphase des Trailers sichtbar überholt.
+Enthält ein Trailerbeat selbst einen harten Umschnitt, werden Kandidaten an
+angrenzenden Source-Szenengrenzen zusätzlich als zusammenhängender Multi-Shot-
+Span geprüft. Ein Match darf dann über eine Source-Szenengrenze laufen, aber
+nur wenn die relative Source-Grenze zeitlich zu einem erkannten Trailer-Umschnitt
+passt. So kann ein Beat aus Frage/Antwort-Shots vollständig erfasst werden,
+ohne Szenen willkürlich zusammenzukleben.
+Auch der lokale Content-Aligner darf einen Inpoint nur noch übernehmen, wenn
+die feste Whole-Frame-/Spatial-Validation dadurch besser wird.
+Vor dem teuren Frame-Refine wird der gesamte Kandidatenpool mit einer schnellen
+festen Inhaltsprüfung neu sortiert. Dadurch können korrekte Treffer aus
+wiederholten Einstellungen einer Szene nach oben kommen, auch wenn ein freier
+Template-Peak an anderer Stelle numerisch stärker war. Suchanker bleiben im
+Pool erhalten, dürfen aber erst nach der Inhaltsprüfung nach oben rücken. Wenn
+ein Kandidat visuell plausibel ist, aber wegen Trailerblende oder kurzem
+Source-Span die normale Coverage knapp verfehlt, wird er als provisional Match
+behalten statt als `NO MATCH` verworfen.
+Dieses Reranking berücksichtigt zusätzlich die verbleibende Szenenlänge ab dem
+Kandidaten-Inpoint. Dadurch werden zu späte ähnliche Gesprächsphasen innerhalb
+derselben Szene nicht mehr vor frühere, tragfähigere Phasen sortiert.
+Das Inhalts-Reranking nutzt bewusst nur wenige repräsentative Referenzframes und
+eine begrenzte Kandidatenzahl. So bleiben wiederholte Szenen auffindbar, ohne
+dass der Lauf durch tausende Random-Seeks minutenlang festhängt.
+Confirmed Matches werden zusätzlich durch eine feste nahezu-Whole-Frame-Prüfung
+aus Luma, Kanten, Farbhistogramm und räumlichen 4x4-Farbhistogrammen gedeckelt.
+Dadurch kann ein freier Template-Hit mit ähnlicher Fenster-/Gesichtsstruktur
+nicht mehr als sicherer Match gelten, wenn die Gesamtkomposition oder die
+Bewegungsphase sichtbar eine andere Szene ist.
+Für gewichtete Vision-Kandidaten gibt es zusätzlich eine eigene Provisional-
+Bewertung aus Content-Score, Restdauer und Seed-Stärke. Dadurch können echte,
+aber durch Trailer-Grading/Crop numerisch schwache Treffer im Report landen,
+ohne als confirmed Match durchzugehen.
+Die Cache-Normalisierung für Report/Export verwendet dieselbe niedrigere
+Content-Untergrenze für nicht bestätigte Vision-Provisional-Treffer, damit ein
+gerade gefundener automatischer Match nicht beim Report-Aufbau wieder
+weggefiltert wird.
+Sie übernimmt auch die Multi-Shot-Coverage-Regel: gecachte Treffer, die passend
+zu internen Trailer-Umschnitten über angrenzende Source-Szenen laufen, werden
+nicht mehr auf die erste Source-Szene zurückgekürzt.
+Gezielte Einzel-Beat-Matches gewichten außerdem die automatisch aus Nachbarbeats
+abgeleiteten Continuity-Seeds. Wenn ein Beat direkt an einen bereits passenden
+Vorgänger anschließt, kann ein späterer ähnlich aussehender Moment derselben
+Dialogszene den erwarteten Anschluss nicht mehr nur wegen eines höheren
+Standbildscores verdrängen.
+Diese Continuity-Seeds sind aber nur Suchanker: in derselben Szene darf ein
+späterer Inpoint gewinnen, wenn die mehrframeige Content-Prüfung die
+Bewegungsphase klar besser trifft. Dadurch bleiben Anschlussmatches stabil,
+ohne Hand-/Kopfbewegungen auf einen falschen Zeitpunkt festzunageln.
+Continuity- und Vision-Seeds allein schalten den globalen FFmpeg-Scan
+standardmäßig nicht ab. Sie sind Suchanker, keine Beweise; der volle CV-Scan
+bleibt aktiv, damit semantisch plausible, aber falsche Vision-Treffer echte
+Bildmatches nicht verdrängen.
+Lange Trailerbeats werden nicht mehr automatisch über ihre gesamte Beat-Länge
+gegen einen einzigen Source-Clip validiert. Sobald nach einem sichtbaren
+Source-Abschnitt eine anhaltende Schwarzblende oder Titel-/Credit-Insel beginnt,
+endet der matchbare Referenzbereich dort; zwei aufeinanderfolgende dunkle
+Samples reichen dafür. Spätere Text-/Creditbilder im selben Beat gehen damit
+nicht mehr in Reranking, Validation oder Span-Schätzung ein.
+Zusätzlich werden sehr dunkle, kontrastarme oder noch nicht sauber
+auf-/abgeblendete Referenzframes aus Score, Inhalts-Reranking,
+Phasen-Alignment und Motion-Templates herausgenommen. Blenden sollen bestimmen,
+wie der Clip später exportiert wird, aber nicht, ob der Bildinhalt als Match
+gilt.
+Treffer unter `provisional_content_threshold` werden gar nicht mehr gespeichert
+oder aus alten Cache-Ergebnissen übernommen. Das verhindert, dass offensichtlich
+falsche Szenen im Report als Match-Kandidat weiterleben.
+
+### Log-Level
+
+```powershell
+python cli.py run --log-level DEBUG
+```
+
+---
+
+## Projektstruktur
+
+```
+ai_trailer_2026/
+│
+├── config.toml              ← Alle Parameter (kein Hardcoding im Code)
+├── .env                     ← API-Keys (NICHT committen)
+├── cli.py                   ← Einstiegspunkt
+│
+├── src/
+│   ├── core/
+│   │   ├── config.py        load_config() → AppConfig (frozen dataclasses)
+│   │   └── models.py        Scene, TrailerBeat, VibeHit, MatchResult, EditTimeline
+│   ├── cv/
+│   │   ├── fingerprinting.py   Text-Safe Crop · HS-Histogramme · pHash
+│   │   ├── vibe_check.py       Phase 1: Histogram+pHash Filter
+│   │   ├── scene_indexer.py    PySceneDetect → Fingerprint → JSON-Cache
+│   │   ├── frame_extractor.py  VideoCapture-Wrapper
+│   │   └── deep_scan.py        Phase 2: Coarse+Refine Template-Matching
+│   ├── audio/
+│   │   └── transcriber.py   faster-whisper Transkription
+│   ├── llm/
+│   │   ├── dramaturg.py     OpenRouter → BeatType (Dialog/Dramaturgie)
+│   │   └── vision_cache.py  optionale gecachte 3-Frame Vision-Seeds
+│   ├── pipeline/
+│   │   ├── trailer_analyzer.py  Reference-Trailer → TrailerBeat[]
+│   │   └── matcher.py           Orchestrierung + EditTimeline-Builder
+│   └── export/
+│       ├── timecode.py      Sekunden ↔ FCPXML-Rational ↔ SMPTE
+│       ├── fcpxml_writer.py FCPXML 1.10
+│       └── edl_writer.py    CMX 3600 EDL
+│
+├── output/                  ← FCPXML/EDL Output (gitignored)
+├── .cache/                  ← Szenen-Index + Match-Ergebnisse (gitignored)
+└── tests/                   52 Unit-Tests (pytest)
+```
+
+---
+
+## Cache-Verhalten
+
+Damit nicht bei jedem Lauf der gesamte Quellfilm neu analysiert werden muss:
+
+| Datei | Inhalt | Neu bauen mit |
+|-------|--------|---------------|
+| `.cache/scene_index.json` | Alle Quellfilm-Szenen + Fingerprints | `--force-reindex` |
+| `.cache/trailer_beats.json` | Erkannte Trailer-Beats | `python cli.py analyze` erneut |
+| `.cache/match_results.json` | CV-Matching-Ergebnisse | `python cli.py match` erneut |
+| `.cache/vision_descriptions.json` | Optionale 3-Frame Vision-Beschreibungen für Beats/Szenen | löschen oder anderes Vision-Modell konfigurieren |
+
+---
+
+## Tests
+
+```powershell
+pytest tests/ -v
+```
+
+Alle Tests laufen ohne echte Videodateien (synthetische Frames via numpy/OpenCV).
+
+---
+
+## Konfiguration (Auszug)
+
+Alle Werte in `config.toml` — keine hardgecodeten Konstanten im Code.
+
+```toml
+[cv.vibe_check]
+top_k_candidates     = 10     # Top-K Kandidaten für Deep Scan
+phash_max_distance   = 12     # Hamming-Distanz Schwelle (0–64)
+crop_top_fraction    = 0.15   # Obere 15% ausblenden (Logos)
+crop_bottom_fraction = 0.30   # Untere 30% ausblenden (Letterbox/Subs)
+
+[cv.deep_scan]
+coarse_step_seconds  = 0.5    # Scan-Schrittgröße (Coarse Pass)
+match_threshold      = 0.65   # Mindestscore für bestätigte automatische Matches
+provisional_match_threshold = 0.45 # Niedrigere automatische Kandidaten im Report zeigen
+coarse_candidate_threshold = 0.50 # Niedrigeres Gate vor Multi-Frame-Refine
+refine_window_seconds = 0.6   # Suchfenster für framegenaue Inpoint-Feinjustage
+refine_step_seconds  = 0.04   # ~1 Frame bei 25fps (Refine Pass)
+content_align_window_seconds = 0.48 # Lokales Suchfenster um einen groben Treffer
+content_align_sample_step_s  = 0.28 # Referenzframes für direkten Bildinhalt-Offset
+content_validation_weight    = 0.35 # Gewicht der festen Whole-Frame-/Spatial-Endprüfung
+provisional_content_threshold = 0.42 # Untergrenze für Report-/Cache-Kandidaten
+start_tie_break_score_delta = 0.015 # Bei fast gleichen Scores früheren Inpoint wählen
+start_preroll_frames        = 0  # Kein pauschaler Start-Ausgleich; Offset kommt aus Bildinhalt
+sequence_candidate_count = 240 # Breiter Kandidatenpool vor Inhalts-Reranking
+max_refine_candidates = 6 # Teurer Frame-Refine läuft nur auf den besten Inhaltskandidaten
+scene_seed_top_k = 30 # Scene-Level-Kandidaten als zusätzliche Suchanker
+scene_seed_points_per_scene = 6 # Inpoint-Samples pro Scene-Level-Kandidat
+content_rerank_candidate_count = 100 # Grobe Kandidaten vor Inhalts-Reranking
+skip_coarse_scan_with_weighted_seeds = false # Vision-Seeds nur als Hinweise; Vollscan bleibt robust
+sequence_score_weight = 0.55  # Gewicht für mehrere zeitliche Vergleichsframes
+span_score_weight     = 0.15  # Gewicht für Stabilität bis zum Beat-Ende
+coarse_score_weight   = 0.10  # Gewicht des groben Midpoint-Treffers
+duration_score_weight = 0.20  # Gewicht für nutzbare Länge des Source-Treffers
+duration_tie_break_score_delta = 0.03 # Bei ähnlichem Score längeren Treffer bevorzugen
+min_duration_coverage = 0.65 # Treffer muss mindestens 65% des matchbaren Referenzanteils tragen
+continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0] # Suchanker um gematchte Nachbarbeats
+span_sample_step_s       = 0.08 # Schrittweite für End-/Drift-Erkennung
+trim_tail_frames         = 4  # Sicherheitsabstand gegen kurze Blitzer am Ende
+scene_boundary_epsilon_s = 0.12 # Szenengrenzen-Toleranz gegen 1-2 Frame Cut-Drift
+scoreable_luma_mean_min = 24.0 # Zu dunkle/Fade-Frames nicht scoren
+scoreable_luma_p90_min  = 58.0 # Helle Bildanteile müssen sichtbar genug sein
+scoreable_contrast_min  = 24.0 # Kontrastarme Blenden/Titelinseln ignorieren
+
+[vision]
+enabled = false # Kostenkontrolle: per CLI mit --vision aktivierbar
+model = "google/gemma-4-31b-it" # Muss ein visionfähiges OpenAI-kompatibles Modell sein
+scene_candidate_top_k = 8 # Nur wenige Top-Szenen pro Beat beschreiben
+max_new_descriptions_per_run = 12 # API-Kosten pro Lauf begrenzen
+max_seed_scenes = 3 # Nur beste Vision-Szenen als Suchanker weitergeben
+seed_points_per_scene = 12 # Inpoint-Samples pro Vision-Szene
+seed_score = 0.88 # Vision-Seeds bekommen mehr Priorität als normale Scene-Seeds
+max_refine_candidates = 6 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene
+local_scan_step_s = 0.12 # Dichte lokale Bildsuche in Vision-Szenen
+local_scan_max_points_per_scene = 180 # Laufzeitgrenze pro Source-Szene
+local_scan_top_candidates = 18 # Beste lokale Kandidaten gehen ins Refinement
+local_scan_tie_break_score_delta = 0.08 # Ähnliche Vision-Treffer: frühere Phase bevorzugen
+multi_shot_cut_corr_threshold = 0.20 # Interne Trailer-Umschnitte erkennen
+multi_shot_boundary_tolerance_s = 0.20 # Source-Grenze muss zum Trailer-Cut passen
+fullscan_fallback = false # Nur relevant, wenn skip_coarse_scan_with_weighted_seeds=true ist
+content_threshold = 0.22 # Lockeres Content-Gate nur für gewichtete Vision-Seeds
+similarity_threshold = 0.18 # Mindest-Textähnlichkeit für Vision-Seeds
+```
+
+---
+
+## Lizenz
+
+Internes Tool — nicht für den öffentlichen Vertrieb.
diff --git a/cli.py b/cli.py
new file mode 100644
index 0000000..5105d5f
--- /dev/null
+++ b/cli.py
@@ -0,0 +1,899 @@
+"""
+cli.py — AI Trailer Generator v2 — Command-Line Interface
+
+Usage:
+    python cli.py analyze   [--config CONFIG] [--no-audio] [--no-llm]
+    python cli.py match     [--config CONFIG] [--force-reindex]
+    python cli.py rematch   --beat N [--threshold F] [--refine]
+    python cli.py report    [--config CONFIG]
+    python cli.py run       [--config CONFIG] [--force-reindex] [--no-audio] [--no-llm]
+    python cli.py export    [--config CONFIG] [--format fcpxml|edl|both]
+
+On --no-audio / --no-llm:
+    These flags do NOT affect matching quality.
+    Whisper and the LLM only assign narrative labels (HOOK/SETUP/CLIMAX)
+    to beats in the export metadata. The CV pipeline is identical either way.
+    Use them for fast iterations: they skip large model downloads.
+
+All heavy imports are deferred so --help is instant.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import sys
+from pathlib import Path
+
+
+# ---------------------------------------------------------------------------
+# Logging setup
+# ---------------------------------------------------------------------------
+
+def _setup_logging(level: str = "INFO") -> None:
+    # Force UTF-8 for Windows console emoji printing
+    if sys.stdout.encoding != 'utf-8':
+        sys.stdout.reconfigure(encoding='utf-8')
+    logging.basicConfig(
+        format="%(asctime)s  %(levelname)-8s  %(name)s — %(message)s",
+        datefmt="%H:%M:%S",
+        level=getattr(logging, level.upper(), logging.INFO),
+        stream=sys.stdout,
+    )
+    logging.getLogger("PIL").setLevel(logging.WARNING)
+
+
+def _ensure_utf8_console() -> None:
+    """Make argparse help safe on Windows before logging is configured."""
+    if sys.stdout.encoding != "utf-8":
+        sys.stdout.reconfigure(encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# Cache helpers (match results ↔ JSON)
+# ---------------------------------------------------------------------------
+
+def _results_cache_path(cfg: "AppConfig") -> Path:  # type: ignore[name-defined]
+    return cfg.paths.cache_dir / "match_results.json"
+
+
+def _save_results(results: list, cfg: "AppConfig") -> None:  # type: ignore[name-defined]
+    from src.core.models import MatchResult
+    data = [
+        {
+            "beat_id":        r.beat_id,
+            "scene_id":       r.scene_id,
+            "source_path":    str(r.source_path),
+            "in_point_s":     r.in_point_s,
+            "out_point_s":    r.out_point_s,
+            "in_point_frame": r.in_point_frame,
+            "match_score":    r.match_score,
+            "match_location": list(r.match_location),
+            "is_confirmed":   r.is_confirmed,
+            "segments": [
+                {
+                    "trailer_offset_s": s.trailer_offset_s,
+                    "duration_s": s.duration_s,
+                    "scene_id": s.scene_id,
+                    "in_point_s": s.in_point_s,
+                    "out_point_s": s.out_point_s,
+                    "match_score": s.match_score,
+                    "is_confirmed": s.is_confirmed,
+                }
+                for s in getattr(r, "segments", ())
+            ],
+        }
+        for r in results
+    ]
+    p = _results_cache_path(cfg)
+    p.parent.mkdir(parents=True, exist_ok=True)
+    p.write_text(json.dumps(data, indent=2), encoding="utf-8")
+    logging.getLogger(__name__).info("Match results cached → %s", p)
+
+
+def _load_results(cfg: "AppConfig") -> list:  # type: ignore[name-defined]
+    from src.core.models import MatchResult, MatchSegment
+    p = _results_cache_path(cfg)
+    if not p.exists():
+        raise FileNotFoundError(f"No cached results at {p}. Run 'match' first.")
+    raw = json.loads(p.read_text(encoding="utf-8"))
+    return [
+        MatchResult(
+            beat_id=d["beat_id"],
+            scene_id=d["scene_id"],
+            source_path=Path(d["source_path"]),
+            in_point_s=d["in_point_s"],
+            out_point_s=d["out_point_s"],
+            in_point_frame=d["in_point_frame"],
+            match_score=d["match_score"],
+            match_location=tuple(d["match_location"]),
+            is_confirmed=d.get("is_confirmed", True),
+            segments=tuple(
+                MatchSegment(
+                    trailer_offset_s=float(s["trailer_offset_s"]),
+                    duration_s=float(s["duration_s"]),
+                    scene_id=int(s["scene_id"]),
+                    in_point_s=float(s["in_point_s"]),
+                    out_point_s=float(s["out_point_s"]),
+                    match_score=float(s["match_score"]),
+                    is_confirmed=bool(s.get("is_confirmed", True)),
+                )
+                for s in d.get("segments", ())
+            ),
+        )
+        for d in raw
+    ]
+
+
+def _load_scene_cache_light(cfg) -> list[dict]:
+    p = cfg.paths.cache_dir / "scene_index.json"
+    if not p.exists():
+        return []
+    return json.loads(p.read_text(encoding="utf-8"))
+
+
+def _scene_fps_light(scene: dict, cfg) -> float:
+    duration_s = max(0.0, float(scene["end_s"]) - float(scene["start_s"]))
+    frame_count = max(0, int(scene["end_frame"]) - int(scene["start_frame"]))
+    return frame_count / duration_s if duration_s > 0 and frame_count > 0 else cfg.export.edl_frame_rate
+
+
+def _scene_for_time_light(scenes: list[dict], t_sec: float, cfg) -> dict | None:
+    for idx, scene in enumerate(scenes):
+        if float(scene["start_s"]) <= t_sec < float(scene["end_s"]):
+            if (
+                float(scene["end_s"]) - t_sec <= cfg.cv.deep_scan.scene_boundary_epsilon_s
+                and idx + 1 < len(scenes)
+            ):
+                return scenes[idx + 1]
+            return scene
+    return None
+
+
+def _scene_by_id_light(scenes: list[dict], scene_id: int) -> dict | None:
+    return next((s for s in scenes if int(s["scene_id"]) == scene_id), None)
+
+
+def _contiguous_duration_light(beat, in_point_s: float, scenes: list[dict], cfg, matchable_duration_s: float) -> float:
+    """Estimate how much source time a match starting at *in_point_s* can cover.
+
+    Walks the cached scene list forward from the scene containing
+    *in_point_s*. A scene boundary may only be crossed when its offset from
+    the in-point lies within ``cfg.vision.multi_shot_boundary_tolerance_s``
+    of one of the beat's internal cut offsets.
+
+    Args:
+        beat: Trailer beat; only passed on to the cut-offset helper.
+        in_point_s: Source in-point in seconds.
+        scenes: Scene records with ``start_s``/``end_s`` (and the frame
+            fields read by ``_scene_fps_light``).
+        cfg: App config.
+        matchable_duration_s: Duration the match should ideally cover.
+
+    Returns:
+        ``matchable_duration_s`` when the whole span fits; a shorter duration
+        (never negative) when a boundary blocks it or the scene list ends;
+        ``0.0`` when *matchable_duration_s* is not positive or no scene
+        contains *in_point_s*.
+    """
+    if matchable_duration_s <= 0:
+        return 0.0
+    # Best effort: if the CV helper cannot be imported or fails, continue
+    # without cut offsets, so no scene boundary can be crossed.
+    try:
+        from src.cv.global_scan import _reference_internal_cut_offsets
+        cut_offsets = _reference_internal_cut_offsets(beat, cfg)
+    except Exception:
+        cut_offsets = []
+
+    # Locate the scene that contains the in-point (half-open interval).
+    start_idx = None
+    for idx, scene in enumerate(scenes):
+        if float(scene["start_s"]) <= in_point_s < float(scene["end_s"]):
+            start_idx = idx
+            break
+    if start_idx is None:
+        return 0.0
+
+    target_end = in_point_s + matchable_duration_s
+    current_end = in_point_s
+    for scene in scenes[start_idx:]:
+        scene_end = float(scene["end_s"])
+        # The requested span ends inside this scene: fully covered.
+        if target_end <= scene_end:
+            return matchable_duration_s
+
+        # The span runs past this scene's end. Crossing is allowed only if
+        # the boundary lines up with an internal cut of the beat.
+        boundary_offset = scene_end - in_point_s
+        if not any(
+            abs(boundary_offset - cut_offset) <= cfg.vision.multi_shot_boundary_tolerance_s
+            for cut_offset in cut_offsets
+        ):
+            # Stop at this boundary, minus the configured tail-trim frames.
+            tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / _scene_fps_light(scene, cfg))
+            return max(0.0, scene_end - in_point_s - tail_s)
+        current_end = scene_end
+
+    # Ran out of scenes before reaching target_end: report what was covered.
+    return max(0.0, current_end - in_point_s)
+
+
+def _normalize_cached_results(beats: list, results: list, cfg) -> list:
+    """
+    Re-apply current generic timing rules to cached results.
+
+    This keeps old automatic cache entries from preserving obsolete scene-boundary
+    or tail-trim behavior without introducing manual per-beat truth.
+
+    Args:
+        beats: Trailer beats, looked up by ``beat_id``; ``duration_s`` is read.
+        results: Cached match results to re-validate.
+        cfg: App config; reads ``cfg.cv.deep_scan.*`` and
+            ``cfg.vision.content_threshold`` here.
+
+    Returns:
+        ``results`` itself when no scene index cache is available. Otherwise a
+        new list in which every kept entry is either the original result or a
+        ``dataclasses.replace`` copy with adjusted scene/in/out/score fields.
+        Entries are dropped when their score is below
+        ``provisional_match_threshold``, when their duration coverage is below
+        ``min_duration_coverage``, or when the content check scores below its
+        gate. Results whose beat or scene cannot be resolved are kept unchanged.
+    """
+    from dataclasses import replace
+
+    scenes = _load_scene_cache_light(cfg)
+    if not scenes:
+        return results
+
+    beats_by_id = {b.beat_id: b for b in beats}
+    normalized = []
+    for result in results:
+        beat = beats_by_id.get(result.beat_id)
+        # Score gate comes first, so it also applies to results whose beat is unknown.
+        if result.match_score < cfg.cv.deep_scan.provisional_match_threshold:
+            continue
+
+        scene = _scene_for_time_light(scenes, result.in_point_s, cfg)
+        declared_scene = _scene_by_id_light(scenes, result.scene_id)
+
+        # If the automatic matcher selected a scene but its in-point sits just
+        # before that scene's detected start, treat this as scene-boundary drift
+        # and clamp to the declared scene. This is generic: no beat IDs, no
+        # manual timestamps, just consistent scene/time reconciliation.
+        if declared_scene is not None:
+            declared_start = float(declared_scene["start_s"])
+            declared_end = float(declared_scene["end_s"])
+            declared_fps = _scene_fps_light(declared_scene, cfg)
+            boundary_tolerance_s = (
+                cfg.cv.deep_scan.scene_boundary_epsilon_s
+                + cfg.cv.deep_scan.start_preroll_frames / declared_fps
+            )
+            if declared_start - boundary_tolerance_s <= result.in_point_s < declared_end:
+                scene = declared_scene
+
+        # Nothing to reconcile against: keep the cached entry as it is.
+        if beat is None or scene is None:
+            normalized.append(result)
+            continue
+
+        fps = _scene_fps_light(scene, cfg)
+        adjusted_in_s = result.in_point_s
+        scene_changed = int(scene["scene_id"]) != result.scene_id
+        starts_before_scene = result.in_point_s < float(scene["start_s"])
+        # 0.12 s is a hard-coded minimum duration; shorter cached matches are
+        # re-derived the same way as scene mismatches.
+        # NOTE(review): result.duration_s is presumably out_point_s - in_point_s — confirm in models.py.
+        if scene_changed or starts_before_scene or result.duration_s <= 0.12:
+            # Apply the configured preroll, then clamp into the resolved scene
+            # and look up scene and fps again for the adjusted in-point.
+            adjusted_in_s = max(0.0, result.in_point_s - (cfg.cv.deep_scan.start_preroll_frames / fps))
+            adjusted_in_s = max(float(scene["start_s"]), adjusted_in_s)
+            scene = _scene_for_time_light(scenes, adjusted_in_s, cfg) or scene
+            fps = _scene_fps_light(scene, cfg)
+
+        # Best effort: fall back to the full beat duration when the CV helper
+        # cannot be imported or fails.
+        matchable_duration_s = beat.duration_s
+        try:
+            from src.cv.global_scan import estimate_matchable_reference_duration
+            matchable_duration_s = estimate_matchable_reference_duration(beat, cfg)
+        except Exception:
+            pass
+
+        # Longest allowed duration: whichever is larger of the single-scene
+        # span (minus tail trim) and the multi-scene contiguous span, the
+        # latter capped at the beat length.
+        tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / fps)
+        single_scene_duration_s = max(0.0, min(beat.duration_s, float(scene["end_s"]) - adjusted_in_s) - tail_s)
+        contiguous_duration_s = _contiguous_duration_light(
+            beat,
+            adjusted_in_s,
+            scenes,
+            cfg,
+            matchable_duration_s,
+        )
+        max_duration_s = max(single_scene_duration_s, min(beat.duration_s, contiguous_duration_s))
+
+        # Rewrite the entry only when something is off; an out-point may
+        # overshoot the allowed end by up to one frame without being touched.
+        normalized_result = result
+        if (
+            scene_changed
+            or starts_before_scene
+            or result.duration_s <= 0.12
+            or result.out_point_s > adjusted_in_s + max_duration_s + (1.0 / fps)
+        ):
+            normalized_result = replace(
+                result,
+                scene_id=int(scene["scene_id"]),
+                in_point_s=adjusted_in_s,
+                out_point_s=adjusted_in_s + max_duration_s,
+                in_point_frame=int(adjusted_in_s * fps),
+            )
+
+        # Drop entries that cover too little of the matchable reference span.
+        coverage = (
+            max(0.0, normalized_result.duration_s) / matchable_duration_s
+            if matchable_duration_s > 0 else 0.0
+        )
+        if coverage < cfg.cv.deep_scan.min_duration_coverage:
+            continue
+
+        # Content re-check. Any exception from the import or the aligner
+        # leaves the entry as it stands after timing normalization.
+        try:
+            from src.cv.content_align import align_cached_match_by_content
+            # Only the second element of the returned pair is used here.
+            _, content_score = align_cached_match_by_content(
+                beat,
+                normalized_result.in_point_s,
+                cfg,
+                search_window_s=min(0.8, cfg.cv.deep_scan.content_align_window_seconds),
+                fps=12.5,
+            )
+            # Unconfirmed entries use the lower of the provisional and the
+            # vision content thresholds as their gate.
+            content_gate = (
+                cfg.cv.deep_scan.provisional_content_threshold
+                if normalized_result.is_confirmed
+                else min(cfg.cv.deep_scan.provisional_content_threshold, cfg.vision.content_threshold)
+            )
+            if content_score < content_gate:
+                continue
+            # A confirmed entry that no longer reaches match_threshold is
+            # demoted to unconfirmed and its score capped at the content score.
+            if content_score < cfg.cv.deep_scan.match_threshold and normalized_result.is_confirmed:
+                normalized_result = replace(
+                    normalized_result,
+                    match_score=min(normalized_result.match_score, content_score),
+                    is_confirmed=False,
+                )
+        except Exception:
+            pass
+
+        normalized.append(normalized_result)
+
+    return normalized
+
+
+# ---------------------------------------------------------------------------
+# Command handlers
+# ---------------------------------------------------------------------------
+
+def _build_transcribe_callback(cfg):
+    """Return a closure that calls ``transcribe_video`` with ``cfg`` bound; never returns None."""
+    from src.audio.transcriber import transcribe_video  # imported only when a callback is built
+
+    def _cb(path, start_s, end_s, offset_s):  # offset_s is forwarded as time_offset_s
+        return transcribe_video(path, cfg, start_s=start_s, end_s=end_s, time_offset_s=offset_s)
+
+    return _cb
+
+
+def _build_classify_callback(cfg):
+    """Return a closure that calls ``classify_beats(beats, cfg)`` with ``cfg`` bound."""
+    from src.llm.dramaturg import classify_beats  # imported only when a callback is built
+
+    def _cb(beats):
+        return classify_beats(beats, cfg)
+
+    return _cb
+
+
+def cmd_analyze(args: argparse.Namespace, cfg) -> list:  # 'analyze' handler; returns the beats
+    from src.pipeline.trailer_analyzer import analyze_reference_trailer
+
+    transcribe_cb = _build_transcribe_callback(cfg) if not args.no_audio else None
+    classify_cb   = _build_classify_callback(cfg)   if not args.no_llm   else None
+    # --no-audio / --no-llm replace the corresponding callback with None for the analyzer.
+    beats = analyze_reference_trailer(
+        cfg,
+        transcribe_callback=transcribe_cb,
+        classify_callback=classify_cb,
+    )
+
+    # Persist beats for downstream commands (including histogram bytes as hex)
+    beats_cache = cfg.paths.cache_dir / "trailer_beats.json"
+    beats_cache.parent.mkdir(parents=True, exist_ok=True)
+    beats_data = [
+        {
+            "beat_id":     b.beat_id,
+            "start_s":     b.start_s,
+            "end_s":       b.end_s,
+            "start_frame": b.start_frame,
+            "end_frame":   b.end_frame,
+            "beat_type":   b.beat_type.name,  # stored by name; _load_beats looks it up via BeatType[...]
+            "dialogue":    [{"start_s": d.start_s, "end_s": d.end_s, "text": d.text} for d in b.dialogue],
+            "phash":       b.phash,
+            "luma_hist":   b.luma_hist.hex() if b.luma_hist else None,  # falsy histogram -> JSON null
+            "sat_hist":    b.sat_hist.hex()  if b.sat_hist  else None,
+        }
+        for b in beats
+    ]
+    beats_cache.write_text(json.dumps(beats_data, indent=2, ensure_ascii=False), encoding="utf-8")
+    print(f"\n\u2705  {len(beats)} beats analyzed \u2192 {beats_cache}")
+    return beats
+
+
+def _load_beats(cfg) -> list:  # read trailer_beats.json back into TrailerBeat objects
+    from src.core.models import BeatType, DialogueLine, TrailerBeat
+
+    p = cfg.paths.cache_dir / "trailer_beats.json"
+    if not p.exists():  # fail with a pointer to the command that creates the cache
+        raise FileNotFoundError(f"No cached beats at {p}. Run 'analyze' first.")
+
+    raw = json.loads(p.read_text(encoding="utf-8"))
+    beats = []
+    for d in raw:
+        dialogue = tuple(  # a missing "dialogue" key yields an empty tuple
+            DialogueLine(start_s=x["start_s"], end_s=x["end_s"], text=x["text"])
+            for x in d.get("dialogue", [])
+        )
+        beats.append(TrailerBeat(
+            beat_id=d["beat_id"],
+            trailer_path=cfg.paths.reference_trailer,  # taken from the current config, not from the cache
+            start_s=d["start_s"],
+            end_s=d["end_s"],
+            start_frame=d["start_frame"],
+            end_frame=d["end_frame"],
+            beat_type=BeatType[d.get("beat_type", "UNKNOWN")],  # a missing key falls back to UNKNOWN
+            dialogue=dialogue,
+            phash=d.get("phash"),
+            luma_hist=bytes.fromhex(d["luma_hist"]) if d.get("luma_hist") else None,  # hex -> bytes; null -> None
+            sat_hist= bytes.fromhex(d["sat_hist"])  if d.get("sat_hist")  else None,
+        ))
+    return beats
+
+
+def _select_beats(beats: list, beat_id: int | None) -> list:
+    """Pass every beat through, or narrow the list to the single requested beat ID."""
+    if beat_id is not None:
+        matching = [beat for beat in beats if beat.beat_id == beat_id]
+        if matching:
+            return matching
+        raise ValueError(f"Beat {beat_id} not found. Run 'analyze' first.")
+    return beats
+
+
+def _select_results(results: list, beat_ids: set[int] | None) -> list:
+    """Keep only results whose beat is in ``beat_ids``; ``None`` means keep everything."""
+    if beat_ids is not None:
+        return [res for res in results if res.beat_id in beat_ids]
+    return results
+
+
+def _find_scene_for_in_point(cfg, in_point_s: float):
+    """Return the scene whose [start_s, end_s) span holds in_point_s (the next one near its end), else None."""
+    from src.cv.scene_indexer import build_scene_index
+
+    scenes = build_scene_index(cfg)
+    last_idx = len(scenes) - 1
+    for idx, scene in enumerate(scenes):
+        if not (scene.start_s <= in_point_s < scene.end_s):
+            continue
+        # Within epsilon of this scene's end the following scene is preferred, when there is one.
+        near_end = scene.end_s - in_point_s <= cfg.cv.deep_scan.scene_boundary_epsilon_s
+        return scenes[idx + 1] if near_end and idx < last_idx else scene
+    return None
+
+
+def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]:
+    """Find scoreable islands in a trailer beat as (start, end) offsets in seconds from the beat start."""
+    from src.cv.frame_extractor import grab_frame_at_path
+    from src.cv.global_scan import _is_scoreable_reference_frame
+
+    step_s = max(0.08, cfg.cv.deep_scan.span_sample_step_s)  # sampling interval, never below 0.08 s
+    min_segment_s = max(0.32, step_s * 3.0)  # shorter runs are discarded
+    bridge_gap_s = max(0.18, step_s * 2.0)  # unscoreable gaps up to this long do not end a run
+    raw: list[tuple[float, float]] = []
+    start: float | None = None  # offset where the current run began
+    last_seen: float | None = None  # offset of the most recent scoreable sample
+    t = 0.0
+    while t <= beat.duration_s:
+        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
+        scoreable = frame is not None and _is_scoreable_reference_frame(frame, cfg)  # no frame -> unscoreable
+        if scoreable:
+            if start is None:
+                start = t
+            last_seen = t
+        elif start is not None and last_seen is not None and t - last_seen > bridge_gap_s:
+            end = min(beat.duration_s, last_seen + step_s)  # one step past the last hit, capped at beat length
+            if end - start >= min_segment_s:
+                raw.append((start, end))
+            start = None
+            last_seen = None
+        t = round(t + step_s, 6)  # presumably rounded to keep float error from accumulating
+    # Close a run that was still open when sampling reached the end of the beat.
+    if start is not None and last_seen is not None:
+        end = min(beat.duration_s, last_seen + step_s)
+        if end - start >= min_segment_s:
+            raw.append((start, end))
+
+    return raw
+
+
+def _attach_visual_segments(results: list, beats: list, cfg) -> list:
+    """Attach sub-shot segments to each result; multi-island beats get one global scan per extra island."""
+    from dataclasses import replace
+    from src.core.models import MatchResult, MatchSegment
+    from src.cv.global_scan import run_global_scan
+
+    by_id = {b.beat_id: b for b in beats}
+    expanded: list[MatchResult] = []
+    for result in results:
+        beat = by_id.get(result.beat_id)
+        if beat is None:  # no beat to inspect: pass the result through untouched
+            expanded.append(result)
+            continue
+
+        islands = _reference_scoreable_segments(beat, cfg)
+        if len(islands) <= 1:  # zero or one island: the whole match becomes the only segment
+            primary = MatchSegment(
+                trailer_offset_s=0.0,
+                duration_s=max(0.0, result.duration_s),
+                scene_id=result.scene_id,
+                in_point_s=result.in_point_s,
+                out_point_s=result.out_point_s,
+                match_score=result.match_score,
+                is_confirmed=result.is_confirmed,
+            )
+            expanded.append(replace(result, segments=(primary,)))
+            continue
+        # Multi-island beat: the first island reuses the existing match, clipped to the island length.
+        segments: list[MatchSegment] = []
+        first_start, first_end = islands[0]
+        first_duration = min(max(0.0, result.duration_s), max(0.0, first_end - first_start))
+        segments.append(
+            MatchSegment(
+                trailer_offset_s=first_start,
+                duration_s=first_duration,
+                scene_id=result.scene_id,
+                in_point_s=result.in_point_s,
+                out_point_s=result.in_point_s + first_duration,
+                match_score=result.match_score,
+                is_confirmed=result.is_confirmed,
+            )
+        )
+        # Every later island is matched on its own with an unseeded global scan.
+        for start_s, end_s in islands[1:]:
+            segment_beat = replace(  # NOTE(review): only start_s/end_s change; other beat fields are copied as-is
+                beat,
+                start_s=beat.start_s + start_s,
+                end_s=beat.start_s + end_s,
+            )
+            segment_matches = run_global_scan([segment_beat], cfg, seed_in_points=None)
+            if not segment_matches:  # an island without a match contributes no segment
+                continue
+            seg = segment_matches[0]
+            seg_dur = min(max(0.0, end_s - start_s), max(0.0, seg.duration_s))
+            segments.append(
+                MatchSegment(
+                    trailer_offset_s=start_s,
+                    duration_s=seg_dur,
+                    scene_id=seg.scene_id,
+                    in_point_s=seg.in_point_s,
+                    out_point_s=seg.in_point_s + seg_dur,
+                    match_score=seg.match_score,
+                    is_confirmed=seg.is_confirmed,
+                )
+            )
+
+        expanded.append(replace(result, segments=tuple(segments)))
+    return expanded
+
+
+def cmd_match(args: argparse.Namespace, cfg) -> list:  # 'match' handler; returns this run's results
+    from src.pipeline.matcher import run_matching
+    from dataclasses import replace
+    # --no-vision is applied last, so it wins when both vision flags are given.
+    if getattr(args, "vision", False):
+        cfg = replace(cfg, vision=replace(cfg.vision, enabled=True))
+    if getattr(args, "no_vision", False):
+        cfg = replace(cfg, vision=replace(cfg.vision, enabled=False))
+
+    all_beats = _load_beats(cfg)
+    beats = _select_beats(all_beats, getattr(args, "beat", None))
+    cached = _normalize_cached_results(all_beats, _load_results(cfg), cfg) if _results_cache_path(cfg).exists() else []
+    seed_in_points = (  # continuity seeds are only built for a single-beat run
+        _continuity_seed_in_points(args.beat, all_beats, cached, cfg)
+        if getattr(args, "beat", None) is not None
+        else None
+    )
+    results = run_matching(
+        cfg,
+        beats,
+        force_reindex=args.force_reindex,
+        seed_in_points=seed_in_points,
+    )
+    results = _attach_visual_segments(results, beats, cfg)
+
+    # A targeted one-beat match should improve the cache without deleting
+    # automatic matches for other beats.
+    if getattr(args, "beat", None) is not None and _results_cache_path(cfg).exists():
+        cached = [r for r in cached if r.beat_id != args.beat]  # old entry is dropped even if no new match exists
+        for result in results:
+            cached = _update_result(result, cached)
+        results_to_save = cached
+    else:
+        results_to_save = results  # full run, or no cache yet: only this run's results are saved
+
+    _save_results(results_to_save, cfg)
+
+    print(f"\n✅  {len(results)} / {len(beats)} beats matched.")
+    for r in results:
+        print(f"   Beat {r.beat_id:03d} → scene {r.scene_id:04d}  "
+              f"in={r.in_point_s:>8.3f}s  score={r.match_score:.3f}")
+    return results
+
+
+def _update_result(new_result, results: list) -> list:
+    """Return a beat_id-sorted copy of results in which new_result replaces any same-beat entry."""
+    kept = [entry for entry in results if entry.beat_id != new_result.beat_id]
+    merged = [*kept, new_result]
+    return sorted(merged, key=lambda entry: entry.beat_id)
+
+
+def _continuity_seed_in_points(beat_id: int, beats: list, results: list, cfg) -> dict[int, list[float | tuple[float, float]]]:
+    beats_by_id = {b.beat_id: b for b in beats}
+    results_by_id = {r.beat_id: r for r in results}
+    target = beats_by_id.get(beat_id)
+    if target is None:
+        return {}
+    # Seeds are (source_time_s, score) guesses extrapolated from the neighbouring matched beats.
+    seeds: list[tuple[float, float]] = []
+    base_score = max(cfg.cv.deep_scan.coarse_candidate_threshold + 0.08, 0.92)
+    prev_matches = [
+        (b, results_by_id[b.beat_id])
+        for b in beats
+        if b.beat_id < beat_id and b.beat_id in results_by_id
+    ]
+    if prev_matches:
+        prev_beat, prev_result = max(prev_matches, key=lambda item: item[0].beat_id)  # nearest earlier matched beat
+        trailer_gap_s = max(0.0, target.start_s - prev_beat.end_s)
+        expected = prev_result.out_point_s + trailer_gap_s  # previous out-point plus the gap seen in the trailer
+        for offset in cfg.cv.deep_scan.continuity_seed_offsets_s:
+            offset_score = max(  # 0.06 penalty per unit of |offset|, floored at the coarse threshold
+                cfg.cv.deep_scan.coarse_candidate_threshold,
+                base_score - abs(offset) * 0.06,
+            )
+            seeds.append((expected + offset, offset_score))
+    # Same idea backwards from the nearest later matched beat; offsets are mirrored (subtracted).
+    next_matches = [
+        (b, results_by_id[b.beat_id])
+        for b in beats
+        if b.beat_id > beat_id and b.beat_id in results_by_id
+    ]
+    if next_matches:
+        next_beat, next_result = min(next_matches, key=lambda item: item[0].beat_id)
+        trailer_gap_s = max(0.0, next_beat.start_s - target.end_s)
+        expected = next_result.in_point_s - trailer_gap_s - target.duration_s
+        for offset in cfg.cv.deep_scan.continuity_seed_offsets_s:
+            offset_score = max(
+                cfg.cv.deep_scan.coarse_candidate_threshold,
+                base_score - abs(offset) * 0.06,
+            )
+            seeds.append((expected - offset, offset_score))
+    # De-duplicate at 3-decimal resolution (negative times clamp to 0), keeping the best score.
+    unique: dict[float, float] = {}
+    for seed_t, seed_score in seeds:
+        rounded = round(max(0.0, seed_t), 3)
+        unique[rounded] = max(unique.get(rounded, 0.0), seed_score)
+    points = [(seed_t, score) for seed_t, score in sorted(unique.items())]
+    return {beat_id: points} if points else {}
+
+
+def cmd_rematch(args: argparse.Namespace, cfg) -> None:
+    """
+    Re-run automatic matching for ONE beat, or refine its cached match with --refine.
+
+    python cli.py rematch --beat 5               # re-scan CV for beat 5
+    python cli.py rematch --beat 5 --threshold 0.40  # relax threshold
+    """
+
+    beat_id = args.beat
+    beats   = _load_beats(cfg)
+    results = _load_results(cfg) if _results_cache_path(cfg).exists() else []  # no cache yet -> start empty
+
+    beat = next((b for b in beats if b.beat_id == beat_id), None)
+    if beat is None:
+        print(f"\u274c  Beat {beat_id} not found. Run 'analyze' first.")
+        return
+
+    # ---- Refine an already acceptable cached match -------------------------
+    if args.refine:
+        current = next((r for r in results if r.beat_id == beat_id), None)
+        if current is None:
+            print(f"❌  Beat {beat_id} has no cached match to refine. Run 'match --beat {beat_id}' first.")
+            return
+
+        from src.cv.content_align import align_cached_match_by_content
+        refined_in_s, sequence_score = align_cached_match_by_content(
+            beat,
+            current.in_point_s,
+            cfg,
+            search_window_s=args.refine_window,  # None unless --refine-window was given
+        )
+        usable_duration_s = max(0.0, current.out_point_s - current.in_point_s)  # keep the cached match length
+        span_score = sequence_score  # NOTE(review): plain alias, so the max() calls below are no-ops
+        scene_data = _scene_for_time_light(_load_scene_cache_light(cfg), refined_in_s, cfg)
+        out_point_s = refined_in_s + usable_duration_s
+        if scene_data is not None:
+            out_point_s = min(out_point_s, float(scene_data["end_s"]))  # do not run past the scene end
+        matchable_duration_s = beat.duration_s
+        duration_coverage = (
+            max(0.0, out_point_s - refined_in_s) / matchable_duration_s
+            if matchable_duration_s > 0 else 0.0
+        )
+        if duration_coverage < cfg.cv.deep_scan.min_duration_coverage:
+            print(
+                f"❌  Beat {beat_id} refined candidate rejected: "
+                f"duration coverage {duration_coverage:.0%} < "
+                f"{cfg.cv.deep_scan.min_duration_coverage:.0%}"
+            )
+            return
+
+        try:
+            from src.cv.frame_extractor import get_video_info
+            fps = float(get_video_info(cfg.paths.source_movie)["fps"]) or cfg.export.edl_frame_rate  # 0 fps -> EDL rate
+        except Exception:
+            fps = cfg.export.edl_frame_rate  # best-effort: any probe failure falls back to the EDL rate
+
+        from src.core.models import MatchResult
+        refined = MatchResult(
+            beat_id=beat_id,
+            scene_id=int(scene_data["scene_id"]) if scene_data is not None else current.scene_id,
+            source_path=current.source_path,
+            in_point_s=max(0.0, refined_in_s),  # NOTE(review): clamped here, but out_point_s used the unclamped value
+            out_point_s=out_point_s,
+            in_point_frame=int(max(0.0, refined_in_s) * fps),
+            match_score=max(sequence_score, span_score),
+            match_location=current.match_location,
+            is_confirmed=max(sequence_score, span_score) >= cfg.cv.deep_scan.match_threshold,
+        )
+        results = _update_result(refined, results)
+        _save_results(results, cfg)
+        print(
+            f"✅  Beat {beat_id} refined → "
+            f"in={refined.in_point_s:.3f}s, out={refined.out_point_s:.3f}s, "
+            f"sequence_score={refined.match_score:.3f}"
+        )
+        return
+
+    # ---- Re-run CV with optional threshold override ------------------------
+    from dataclasses import replace as dc_replace
+    run_cfg = cfg
+    if args.threshold is not None:
+        run_cfg = dc_replace(  # copy of cfg in which only cv.deep_scan.match_threshold differs
+            cfg,
+            cv=dc_replace(
+                cfg.cv,
+                deep_scan=dc_replace(cfg.cv.deep_scan, match_threshold=args.threshold),
+            ),
+        )
+        print(f"ℹ️   threshold overridden to {args.threshold} for beat {beat_id}")
+
+    from src.cv.global_scan import run_global_scan
+    seed_in_points = _continuity_seed_in_points(beat_id, beats, results, run_cfg)
+    matches = run_global_scan([beat], run_cfg, seed_in_points=seed_in_points)
+    
+    if not matches:
+        print(f"❌  Beat {beat_id}: no match. Try --threshold 0.40.")
+        return
+        
+    match = matches[0]  # NOTE(review): saved without _attach_visual_segments, unlike cmd_match; confirm intended
+    results = _update_result(match, results)
+    _save_results(results, cfg)
+    print(f"✅  Beat {beat_id} rematched → (in={match.in_point_s:.3f}s, score={match.match_score:.3f})")
+
+
+def cmd_report(args: argparse.Namespace, cfg) -> None:
+    """Generate the report for all cached beats, or only ``--beat``, and print its path."""
+    from src.pipeline.reporter import generate_report
+    beat_id = getattr(args, "beat", None)
+    all_beats = _load_beats(cfg)  # parsed once; feeds both beat selection and normalization
+    beats = _select_beats(all_beats, beat_id)
+    beat_ids = {b.beat_id for b in beats} if beat_id is not None else None
+    results = _select_results(_normalize_cached_results(all_beats, _load_results(cfg), cfg), beat_ids)
+    out = generate_report(beats, results, cfg)
+    if beat_id is not None and not results:  # the report is still written; this only warns
+        print(f"\n⚠️   Beat {beat_id} has no cached match yet. Run: python cli.py match --beat {beat_id}")
+    print(f"\n\u2705  Report \u2192 {out}")
+
+
+def cmd_export(args: argparse.Namespace, cfg) -> None:
+    """Write the cached timeline (all beats, or only ``--beat``) as FCPXML and/or EDL."""
+    from src.export.edl_writer   import write_edl
+    from src.export.fcpxml_writer import write_fcpxml
+    from src.pipeline.matcher    import build_timeline
+
+    beat_id = getattr(args, "beat", None)
+    all_beats = _load_beats(cfg)  # parsed once; feeds both beat selection and normalization
+    beats = _select_beats(all_beats, beat_id)
+    beat_ids = {b.beat_id for b in beats} if beat_id is not None else None
+    results = _select_results(_normalize_cached_results(all_beats, _load_results(cfg), cfg), beat_ids)
+    if beat_id is not None and not results:
+        print(f"❌  Beat {beat_id} has no cached match. Run 'match --beat {beat_id}' first.")
+        return
+    timeline = build_timeline(beats, results, cfg)
+
+    fmt = args.format or cfg.export.output_format  # --format overrides the configured format
+    # Single-beat exports are named <trailer stem>_beat_NNN; full exports use the timeline title.
+    out_stem = (f"{cfg.paths.reference_trailer.stem}_beat_{beat_id:03d}"
+                if beat_id is not None else timeline.title)
+
+    if fmt in ("fcpxml", "both"):
+        out = write_fcpxml(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.fcpxml")
+        print(f"✅  FCPXML → {out}")
+
+    if fmt in ("edl", "both"):
+        out = write_edl(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.edl")
+        print(f"✅  EDL    → {out}")
+
+
+def cmd_run(args: argparse.Namespace, cfg) -> None:
+    """Run every pipeline stage in order: analyze, match, report, then export."""
+    # Stage return values are discarded here; each stage receives the same
+    # parsed arguments and configuration.
+    for stage in (cmd_analyze, cmd_match, cmd_report, cmd_export):
+        stage(args, cfg)
+
+
+# ---------------------------------------------------------------------------
+# Argument parser
+# ---------------------------------------------------------------------------
+
+def _build_parser() -> argparse.ArgumentParser:
+    """Build the ``ai-trailer`` parser: global options plus one required sub-command."""
+    parser = argparse.ArgumentParser(
+        prog="ai-trailer",
+        description="AI Trailer Generator v2 — Pure CV scene matching",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    # global options
+    parser.add_argument("--config", type=Path, default=Path("config.toml"), metavar="CONFIG",
+                        help="Path to config.toml (default: ./config.toml)")
+    parser.add_argument(
+        "--log-level", default="INFO",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
+        help="Logging verbosity (default: INFO)",
+    )
+
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    # analyze
+    p_analyze = sub.add_parser("analyze", help="Detect trailer beats + fingerprint")
+    p_analyze.add_argument("--no-audio", action="store_true",
+                           help="Skip Whisper (only affects beat labels, not matching)")
+    p_analyze.add_argument("--no-llm", action="store_true",
+                           help="Skip LLM classification (only affects beat labels)")
+
+    # match
+    p_match = sub.add_parser("match", help="Run 2-phase CV matching")
+    p_match.add_argument("--force-reindex", action="store_true",
+                         help="Ignore scene cache and re-run PySceneDetect")
+    p_match.add_argument("--beat", type=int,
+                         help="Match only one beat and merge it into the cached results")
+    p_match.add_argument("--vision", action="store_true",
+                         help="Enable cached vision descriptions for extra automatic search seeds")
+    p_match.add_argument("--no-vision", action="store_true",
+                         help="Disable vision seeding even if [vision].enabled is true")
+
+    # rematch
+    p_rematch = sub.add_parser("rematch", help="Re-run or override matching for one beat")
+    p_rematch.add_argument("--beat",      type=int,   required=True,  help="Beat ID to rematch")
+    p_rematch.add_argument("--threshold", type=float, default=None,   help="Override match_threshold")
+    p_rematch.add_argument("--refine",    action="store_true",
+            help="Refine the cached match by measuring a local image-content offset")
+    p_rematch.add_argument("--refine-window", type=float, default=None,
+                           help="Seconds to search around the cached in-point when using --refine")
+
+    # report
+    p_report = sub.add_parser("report", help="Generate HTML visual comparison report")
+    p_report.add_argument("--beat", type=int, help="Report only one beat")
+
+    # export
+    p_export = sub.add_parser("export", help="Export timeline from cached results")
+    p_export.add_argument("--format", choices=["fcpxml", "edl", "both"],
+                          help="Override [export] output_format from config")
+    p_export.add_argument("--beat", type=int, help="Export only one beat")
+
+    # run (accepts the union of the analyze / match / export flags, since it calls all of them)
+    p_run = sub.add_parser("run", help="Full pipeline: analyze → match → report → export")
+    p_run.add_argument("--no-audio",      action="store_true")
+    p_run.add_argument("--no-llm",        action="store_true")
+    p_run.add_argument("--force-reindex", action="store_true")
+    p_run.add_argument("--vision",        action="store_true")
+    p_run.add_argument("--no-vision",     action="store_true")
+    p_run.add_argument("--format", choices=["fcpxml", "edl", "both"])
+    p_run.add_argument("--beat",          type=int,
+                       help="Run match/report/export for only one cached beat")
+
+    return parser
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """Parse the command line, set up logging, load the config, and run the chosen command."""
+    _ensure_utf8_console()
+    args = _build_parser().parse_args()
+
+    _setup_logging(args.log_level)
+
+    # Imported here, after logging has been configured.
+    from src.core.config import load_config
+    cfg = load_config(args.config)
+
+    handlers = {
+        "analyze": cmd_analyze,
+        "match":   cmd_match,
+        "rematch": cmd_rematch,
+        "report":  cmd_report,
+        "export":  cmd_export,
+        "run":     cmd_run,
+    }
+
+    handlers[args.command](args, cfg)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/config.toml b/config.toml
new file mode 100644
index 0000000..d3d159c
--- /dev/null
+++ b/config.toml
@@ -0,0 +1,198 @@
+# =============================================================================
+# AI Trailer Generator v2 — Central Configuration
+# =============================================================================
+# All tunable parameters, thresholds, and file paths are defined here.
+# NO hardcoded values are allowed in the Python source code.
+# =============================================================================
+
+[project]
+name        = "AI Trailer Generator v2"
+version     = "2.0.0"
+log_level   = "INFO"   # DEBUG | INFO | WARNING | ERROR
+
+# -----------------------------------------------------------------------------
+# [paths] — External video sources (read-only access)
+# -----------------------------------------------------------------------------
+[paths]
+source_movie       = "B:/Proxy/BehindTheRedDoor_FTR_1080P_2398_Fixed.mp4"
+reference_trailer  = "F:/Encodings/BehindTheRedDoor_Trailer_REFERENCE.mp4"
+
+# Output destinations (inside project sandbox)
+output_dir         = "output"
+cache_dir          = ".cache"
+proxy_dir          = "proxy"
+
+# -----------------------------------------------------------------------------
+# [video] — Decode / proxy settings
+# -----------------------------------------------------------------------------
+[video]
+# Target FPS for internal frame extraction (0 = use source FPS)
+extract_fps         = 1.0
+# Proxy resolution for template matching (width x height)
+proxy_width         = 640
+proxy_height        = 360
+
+# -----------------------------------------------------------------------------
+# [cv] — Computer Vision engine parameters
+# Phase 1 — "Vibe Check" (histogram / perceptual hash scene-level filter)
+# Phase 2 — "Deep Scan"  (template matching frame-level precision)
+# -----------------------------------------------------------------------------
+[cv]
+
+[cv.vibe_check]
+# Number of top candidate scenes to forward to Deep Scan
+top_k_candidates      = 100
+
+# Histogram comparison method:
+# CORREL=0 | CHISQR=1 | INTERSECT=2 | BHATTACHARYYA=3
+hist_compare_method   = 0
+
+# Histogram bins per channel (hue, saturation)
+hist_bins_hue         = 50
+hist_bins_saturation  = 60
+
+# Maximum pHash distance accepted as a match (lower = stricter; 0–64 range)
+# NOTE: 12 is for near-duplicate detection. Cross-video matching
+#       (trailer vs source movie with different grading/compression)
+#       needs 25–35. Start at 32 and tighten if you get false positives.
+phash_max_distance    = 32
+
+# ---- Text-Safe Crop -------------------------------------------------------
+# Fraction of frame height to EXCLUDE from the top (e.g. logos, title cards)
+crop_top_fraction    = 0.15
+# Fraction of frame height to EXCLUDE from the bottom (e.g. letterbox, subs)
+crop_bottom_fraction = 0.30
+
+[cv.deep_scan]
+# Step size in SECONDS between sampled frames during the coarse scan pass
+coarse_step_seconds   = 0.5
+
+# Minimum template match score (0.0–1.0) to accept a candidate as a hit
+match_threshold       = 0.65
+
+# Store/report lower-confidence automatic candidates for visual review instead
+# of dropping them as "NO MATCH". Confirmed exports can still use match_threshold.
+provisional_match_threshold = 0.45
+
+# Lower gate for entering temporal multi-frame refinement. The final decision
+# still uses sequence/span scoring; this only avoids rejecting real matches
+# because one midpoint frame is weak.
+coarse_candidate_threshold = 0.50
+
+# Candidate ranking weights. Duration coverage matters when the same visual
+# shot appears multiple times: prefer the occurrence that can cover the beat.
+sequence_score_weight = 0.55
+span_score_weight     = 0.15
+coarse_score_weight   = 0.10
+duration_score_weight = 0.20
+duration_tie_break_score_delta = 0.03
+min_duration_coverage = 0.65
+continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0]
+scene_seed_top_k = 30
+scene_seed_points_per_scene = 6
+content_rerank_candidate_count = 100
+skip_coarse_scan_with_weighted_seeds = false
+
+# cv2.matchTemplate method:
+# TM_CCOEFF_NORMED=5 (recommended), TM_CCORR_NORMED=3
+match_method          = 5
+
+# If a coarse hit is found, refine by scanning ± this many seconds
+refine_window_seconds = 0.6
+refine_step_seconds   = 0.04  # ≈ 1 frame at 25 fps
+content_align_window_seconds = 0.48
+content_align_sample_step_s  = 0.28
+content_validation_weight    = 0.35
+provisional_content_threshold = 0.42
+
+# When several adjacent frame offsets score almost the same, prefer the earlier
+# one. This avoids matches that are visually correct but start a few frames late.
+start_tie_break_score_delta = 0.015
+start_preroll_frames        = 0
+
+# Automatic temporal verification after a coarse image hit.
+# More candidates reduce false positives from visually similar shots.
+sequence_candidate_count = 240
+sequence_min_distance_s  = 1.0
+max_refine_candidates    = 6
+
+# Match-span detection: trim when the source starts drifting into a different shot.
+span_sample_step_s       = 0.08
+trim_tail_frames         = 4
+
+# If a refined in-point lands this close to a detected scene end, treat it as
+# the next scene. Scene detectors often place cuts a frame or two around the
+# visible boundary.
+scene_boundary_epsilon_s = 0.12
+scoreable_luma_mean_min = 24.0
+scoreable_luma_p90_min  = 58.0
+scoreable_contrast_min  = 24.0
+
+# -----------------------------------------------------------------------------
+# [scene_detection] — PySceneDetect parameters (used to segment source movie)
+# -----------------------------------------------------------------------------
+[scene_detection]
+# Threshold for ContentDetector (lower = more sensitive)
+content_threshold     = 27.0
+# Minimum scene duration in seconds
+min_scene_duration_s  = 1.5
+
+# -----------------------------------------------------------------------------
+# [whisper] — Dialogue / audio analysis
+# -----------------------------------------------------------------------------
+[whisper]
+model              = "large-v3"
+language           = "ar"
+device             = "cuda"        # cuda | cpu
+compute_type       = "float16"     # float16 | int8 | float32
+
+# -----------------------------------------------------------------------------
+# [llm] — Used ONLY for thematic segmentation / dramaturgy
+# -----------------------------------------------------------------------------
+[llm]
+provider           = "openrouter"
+base_url           = "https://openrouter.ai/api/v1"
+model              = "google/gemma-4-31b-it"
+timeout_seconds    = 120
+temperature        = 0.3
+max_tokens         = 4096
+
+# -----------------------------------------------------------------------------
+# [vision] — Optional cached visual descriptions for ambiguous matching
+# -----------------------------------------------------------------------------
+[vision]
+# Disabled by default to avoid surprise API cost. Enable when you want the
+# matcher to ask a vision-capable model for cached 3-frame scene descriptions.
+enabled            = false
+provider           = "openrouter"
+base_url           = "https://openrouter.ai/api/v1"
+model              = "google/gemma-4-31b-it"
+timeout_seconds    = 90
+temperature        = 0.0
+max_tokens         = 350
+
+# Cost controls: per beat, only the top scene-level candidates are described,
+# and cached descriptions in .cache/vision_descriptions.json are reused.
+scene_candidate_top_k       = 8
+max_new_descriptions_per_run = 12
+max_seed_scenes             = 3
+seed_points_per_scene       = 12
+seed_score                  = 0.88
+max_refine_candidates       = 6
+local_scan_step_s           = 0.12
+local_scan_max_points_per_scene = 180
+local_scan_top_candidates   = 18
+local_scan_tie_break_score_delta = 0.08
+multi_shot_cut_corr_threshold = 0.20
+multi_shot_boundary_tolerance_s = 0.20
+fullscan_fallback           = false
+content_threshold           = 0.22
+similarity_threshold        = 0.18
+
+# -----------------------------------------------------------------------------
+# [export] — FCPXML / EDL export settings
+# -----------------------------------------------------------------------------
+[export]
+fcpxml_version     = "1.10"
+edl_frame_rate     = 23.976        # fps used in EDL timecode generation
+output_format      = "fcpxml"      # fcpxml | edl | both
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..5e831ff
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,68 @@
+[build-system]
+requires      = ["setuptools>=69", "wheel"]
+build-backend = "setuptools.build_meta"  # PEP 517 backend shipped with setuptools
+
+[project]
+name        = "ai-trailer-2026"
+version     = "2.0.0"
+description = "Frame-accurate trailer reconstruction via pure Computer Vision"
+requires-python = ">=3.11"
+
+dependencies = [
+    # Computer Vision
+    "opencv-python>=4.9",
+    "imagehash>=4.3",
+    "numpy>=1.26",
+    "Pillow>=10.0",
+
+    # Scene detection
+    "scenedetect[opencv]>=0.6",
+
+    # Audio / transcription
+    "faster-whisper>=1.0",
+
+    # Config / secrets
+    # tomllib — built-in stdlib (Python 3.11+), no install needed
+    "python-dotenv>=1.0",  # loads .env into os.environ
+
+    # Export
+    "lxml>=5.0",   # FCPXML generation
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0",
+    "pytest-cov",
+    "mypy>=1.9",
+    "ruff>=0.4",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["src*"]
+
+# ---------------------------------------------------------------------------
+# Ruff (linter + formatter)
+# ---------------------------------------------------------------------------
+[tool.ruff]
+line-length    = 100
+target-version = "py311"
+
+[tool.ruff.lint]
+select  = ["E", "F", "I", "UP", "B", "C4", "ANN"]
+ignore  = ["ANN101", "ANN102"]
+
+# ---------------------------------------------------------------------------
+# Mypy
+# ---------------------------------------------------------------------------
+[tool.mypy]
+python_version         = "3.11"
+strict                 = true
+ignore_missing_imports = true
+
+# ---------------------------------------------------------------------------
+# Pytest
+# ---------------------------------------------------------------------------
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts   = "-v --tb=short"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8c67a95
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,37 @@
+# AI Trailer Generator v2 — Python Dependencies
+# Minimum-version specifiers (not pinned `pip freeze` output); Python 3.11, Windows
+# Install with: pip install -r requirements.txt
+#
+# NOTE: faster-whisper and scenedetect may pull in torch/cuda extras
+#       depending on your platform. See README for CUDA setup.
+
+# Computer Vision
+opencv-python>=4.9
+numpy>=1.26
+Pillow>=10.0
+ImageHash>=4.3
+PyWavelets>=1.6        # required by ImageHash
+
+# Video scene detection
+scenedetect[opencv]>=0.6
+
+# Audio transcription
+# faster-whisper>=1.0   ← uncomment when ready to use Whisper
+#                          (pulls in torch; large download)
+
+# Config & secrets
+python-dotenv>=1.0     # loads .env into os.environ
+# tomllib — stdlib in Python 3.11+, no install needed
+
+# XML export
+# lxml>=5.0             ← optional: only needed for advanced FCPXML features
+#                          stdlib xml.etree.ElementTree is used by default
+
+# HTTP (LLM calls via urllib.request — no extra dep needed)
+# requests              ← not used; stdlib urllib is sufficient
+
+# Dev / testing
+pytest>=8.0
+pytest-cov
+# mypy>=1.9
+# ruff>=0.4
diff --git a/setup_venv.ps1 b/setup_venv.ps1
new file mode 100644
index 0000000..dac843d
--- /dev/null
+++ b/setup_venv.ps1
@@ -0,0 +1,89 @@
+# setup_venv.ps1 — AI Trailer Generator v2 — Virtual Environment Setup
+# Run once: .\setup_venv.ps1
+# -----------------------------------------------------------------------
+# If blocked by ExecutionPolicy:
+#   Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
+
+$ErrorActionPreference = "Stop"
+$VENV_DIR = ".venv"
+
+function Resolve-ProjectPython {  # Returns the path of a python executable; throws when none is found.
+    $cmd = Get-Command python -ErrorAction SilentlyContinue  # PATH lookup; a miss is silenced rather than raised
+    if ($cmd) {
+        return $cmd.Source
+    }
+
+    $candidates = @(  # fallback locations, probed in this order
+        "$env:LOCALAPPDATA\Programs\Python\Python311\python.exe",
+        "$env:LOCALAPPDATA\Microsoft\WindowsApps\python.exe"
+    )
+
+    foreach ($candidate in $candidates) {
+        if ($candidate -and (Test-Path $candidate)) {
+            return $candidate
+        }
+    }
+
+    throw "Python 3.11+ not found. Install Python 3.11+ or add it to PATH."  # only existence is tested in this function
+}
+
+Write-Host ""
+Write-Host "==================================================" -ForegroundColor Cyan
+Write-Host "  AI Trailer Generator v2 — venv Setup" -ForegroundColor Cyan
+Write-Host "==================================================" -ForegroundColor Cyan
+Write-Host ""
+
+# ---- 1. Check Python version ------------------------------------------------
+$PROJECT_PYTHON = Resolve-ProjectPython
+$pythonVersion = & $PROJECT_PYTHON --version 2>&1
+Write-Host "Python: $pythonVersion"
+if ($pythonVersion -notmatch "3\.(1[1-9]|[2-9]\d)") {
+    Write-Error "Python 3.11+ required. Found: $pythonVersion"
+    exit 1
+}
+
+# ---- 2. Create venv ---------------------------------------------------------
+if (Test-Path $VENV_DIR) {
+    Write-Host "Virtual environment already exists at '$VENV_DIR'. Skipping creation." -ForegroundColor Yellow
+} else {
+    Write-Host "Creating virtual environment in '$VENV_DIR' ..." -ForegroundColor Green
+    & $PROJECT_PYTHON -m venv $VENV_DIR
+    Write-Host "Done." -ForegroundColor Green
+}
+
+# ---- 3. Activate venv -------------------------------------------------------
+$activate = Join-Path $VENV_DIR "Scripts\Activate.ps1"
+Write-Host "Activating virtual environment ..."
+. $activate
+$VENV_PYTHON = Join-Path $VENV_DIR "Scripts\python.exe"
+
+# ---- 4. Upgrade pip ---------------------------------------------------------
+Write-Host "Upgrading pip ..." -ForegroundColor Green
+& $VENV_PYTHON -m pip install --upgrade pip --quiet
+
+# ---- 5. Install dependencies (abort on failure; "Stop" ignores native exit codes)
+Write-Host "Installing dependencies from requirements.txt ..." -ForegroundColor Green
+& $VENV_PYTHON -m pip install -r requirements.txt; if ($LASTEXITCODE -ne 0) { throw "pip install failed with exit code $LASTEXITCODE." }
+
+# ---- 6. Copy .env if missing ------------------------------------------------
+if (-not (Test-Path ".env")) {
+    if (Test-Path ".env.example") {
+        Copy-Item ".env.example" ".env"
+        Write-Host ""
+        Write-Host "  .env created from .env.example." -ForegroundColor Yellow
+        Write-Host "  >>> Open .env and fill in your OPENROUTER_API_KEY! <<<" -ForegroundColor Red
+    }
+}
+
+# ---- 7. Done ----------------------------------------------------------------
+Write-Host ""
+Write-Host "==================================================" -ForegroundColor Cyan
+Write-Host "  Setup complete!" -ForegroundColor Green
+Write-Host ""
+Write-Host "  Activate the venv with:"
+Write-Host "    .\.venv\Scripts\Activate.ps1" -ForegroundColor White
+Write-Host ""
+Write-Host "  Then run the pipeline:"
+Write-Host "    python cli.py run --no-audio --no-llm" -ForegroundColor White
+Write-Host "==================================================" -ForegroundColor Cyan
+Write-Host ""
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..521670b
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+# src package
diff --git a/src/audio/__init__.py b/src/audio/__init__.py
new file mode 100644
index 0000000..20dc2fc
--- /dev/null
+++ b/src/audio/__init__.py
@@ -0,0 +1 @@
+# src.audio package — Whisper / dialogue analysis
diff --git a/src/audio/transcriber.py b/src/audio/transcriber.py
new file mode 100644
index 0000000..95be4d0
--- /dev/null
+++ b/src/audio/transcriber.py
@@ -0,0 +1,182 @@
+"""
+src/audio/transcriber.py — Whisper transcription via faster-whisper
+
+Responsibility:
+  - Transcribe audio from a video file into a list of DialogueLine objects
+  - Optionally restrict to a time window [start_s, end_s] (for single beats)
+  - All model config (model name, device, compute_type) comes from AppConfig
+
+The LLM is NOT used here. This is pure audio-to-text.
+"""
+
+from __future__ import annotations
+
+import logging
+import tempfile
+from pathlib import Path
+from typing import Sequence
+
+from src.core.config import AppConfig
+from src.core.models import DialogueLine
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Audio extraction helper (video → wav via ffmpeg)
+# ---------------------------------------------------------------------------
+
+def _extract_audio_segment(
+    video_path: Path,
+    start_s: float | None,
+    end_s: float | None,
+    out_wav: Path,
+) -> None:
+    """
+    Use ffmpeg (subprocess) to extract a mono 16kHz WAV from *video_path*.
+
+    Args:
+        video_path: Source video.
+        start_s:    Start time in seconds (None = beginning of file).
+        end_s:      End time in seconds (None = end of file).
+        out_wav:    Destination WAV path.
+
+    Raises:
+        RuntimeError: If ffmpeg exits with a non-zero code.
+    """
+    import subprocess
+
+    cmd = ["ffmpeg", "-y", "-loglevel", "error"]
+
+    if start_s is not None:
+        cmd += ["-ss", str(start_s)]
+    if end_s is not None and start_s is not None:
+        cmd += ["-t", str(end_s - start_s)]
+    elif end_s is not None:
+        cmd += ["-to", str(end_s)]
+
+    cmd += [
+        "-i", str(video_path),
+        "-vn",                        # no video
+        "-ac", "1",                   # mono
+        "-ar", "16000",               # 16 kHz — Whisper native rate
+        "-f", "wav",
+        str(out_wav),
+    ]
+
+    result = subprocess.run(cmd, capture_output=True)
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"ffmpeg failed (code {result.returncode}):\n"
+            f"{result.stderr.decode(errors='replace')}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Core transcription
+# ---------------------------------------------------------------------------
+
+def transcribe_video(
+    video_path: Path,
+    cfg: AppConfig,
+    start_s: float | None = None,
+    end_s: float | None = None,
+    time_offset_s: float = 0.0,
+) -> list[DialogueLine]:
+    """
+    Transcribe dialogue from *video_path* using faster-whisper.
+
+    Args:
+        video_path:    Path to source or trailer video.
+        cfg:           Application configuration (whisper section).
+        start_s:       Clip start in video-file seconds (None = beginning).
+        end_s:         Clip end   in video-file seconds (None = end of file).
+        time_offset_s: Added to every transcript timestamp so that beat-level
+                       transcripts align with absolute movie time.
+
+    Returns:
+        List of DialogueLine ordered by start time.
+    """
+    try:
+        from faster_whisper import WhisperModel
+    except ImportError:
+        raise ImportError("faster-whisper not installed. Run: pip install faster-whisper")
+
+    w = cfg.whisper
+
+    logger.info(
+        "Transcribing %s [%.1f–%s] with %s on %s …",
+        video_path.name,
+        start_s or 0.0,
+        f"{end_s:.1f}s" if end_s else "end",
+        w.model,
+        w.device,
+    )
+
+    with tempfile.TemporaryDirectory() as tmp:
+        wav = Path(tmp) / "audio.wav"
+        _extract_audio_segment(video_path, start_s, end_s, wav)
+
+        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
+        segments, _ = model.transcribe(
+            str(wav),
+            language=w.language if w.language else None,
+            beam_size=5,
+        )
+
+        lines: list[DialogueLine] = []
+        for seg in segments:
+            lines.append(DialogueLine(
+                start_s=seg.start + time_offset_s,
+                end_s=seg.end   + time_offset_s,
+                text=seg.text.strip(),
+            ))
+
+    logger.info("Transcription done: %d segments.", len(lines))
+    return lines
+
+
+# ---------------------------------------------------------------------------
+# Convenience: transcribe the whole source movie, then distribute lines to scenes
+# ---------------------------------------------------------------------------
+
+def transcribe_full_movie(
+    cfg: AppConfig,
+) -> list[DialogueLine]:
+    """
+    Transcribe the entire source movie. Use this result to enrich Scenes
+    via a dialogue_callback passed to build_scene_index().
+    """
+    return transcribe_video(cfg.paths.source_movie, cfg)
+
+
+def assign_dialogue_to_scenes(
+    all_dialogue: Sequence[DialogueLine],
+    scenes: list["src.core.models.Scene"],  # type: ignore[name-defined]
+) -> list["src.core.models.Scene"]:  # type: ignore[name-defined]
+    """
+    Distribute pre-transcribed DialogueLines into their respective Scenes.
+
+    A line is assigned to the scene whose window contains its midpoint.
+
+    Args:
+        all_dialogue: Full-movie transcript as flat list.
+        scenes:       Scene list (will be replaced with enriched copies).
+
+    Returns:
+        New list of Scene objects with dialogue tuples populated.
+    """
+    from dataclasses import replace
+    from src.core.models import Scene
+
+    enriched: list[Scene] = []
+    for scene in scenes:
+        matched = tuple(
+            line for line in all_dialogue
+            if scene.start_s <= (line.start_s + line.end_s) / 2.0 < scene.end_s
+        )
+        enriched.append(replace(scene, dialogue=matched))
+
+    total_assigned = sum(len(s.dialogue) for s in enriched)
+    logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched))
+    return enriched
diff --git a/src/core/__init__.py b/src/core/__init__.py
new file mode 100644
index 0000000..61e4b74
--- /dev/null
+++ b/src/core/__init__.py
@@ -0,0 +1 @@
+# src.core package
diff --git a/src/core/config.py b/src/core/config.py
new file mode 100644
index 0000000..3e3f798
--- /dev/null
+++ b/src/core/config.py
@@ -0,0 +1,387 @@
+"""
+src/core/config.py — Configuration loader for AI Trailer Generator v2
+
+Loads config.toml and exposes typed, nested dataclasses.
+All CV thresholds, paths, and model settings are sourced exclusively here.
+API keys are NEVER stored in config.toml; they are loaded from .env.
+"""
+
+from __future__ import annotations
+
+import os
+import tomllib
+
+try:
+    from dotenv import load_dotenv as _load_dotenv
+    _HAS_DOTENV = True
+except ImportError:  # dotenv optional — falls back to existing env vars
+    _HAS_DOTENV = False
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+
+# ---------------------------------------------------------------------------
+# Leaf sections
+# ---------------------------------------------------------------------------
+
+@dataclass(frozen=True)
+class PathsConfig:  # [paths] table; load_config() anchors relative entries at the config file's directory
+    source_movie: Path
+    reference_trailer: Path
+    output_dir: Path
+    cache_dir: Path
+    proxy_dir: Path
+
+
+@dataclass(frozen=True)
+class VideoConfig:  # [video] table; load_config() coerces the values to float/int
+    extract_fps: float
+    proxy_width: int
+    proxy_height: int
+
+
+@dataclass(frozen=True)
+class VibeCheckConfig:  # [cv.vibe_check] table; every key is required (load_config() has no defaults for it)
+    top_k_candidates: int
+    hist_compare_method: int  # stored as a raw int — presumably a cv2.HISTCMP_* value; verify against the consumer
+    hist_bins_hue: int
+    hist_bins_saturation: int
+    phash_max_distance: int
+    crop_top_fraction: float
+    crop_bottom_fraction: float
+
+
+@dataclass(frozen=True)
+class DeepScanConfig:  # [cv.deep_scan] table; keys not marked "required" get a default in load_config()
+    coarse_step_seconds: float  # required key
+    match_threshold: float  # required key
+    provisional_match_threshold: float
+    coarse_candidate_threshold: float  # falls back to match_threshold when the key is absent
+    sequence_score_weight: float
+    span_score_weight: float
+    coarse_score_weight: float
+    duration_score_weight: float
+    duration_tie_break_score_delta: float
+    min_duration_coverage: float
+    continuity_seed_offsets_s: tuple[float, ...]  # TOML array converted to a tuple of floats
+    scene_seed_top_k: int
+    scene_seed_points_per_scene: int
+    content_rerank_candidate_count: int
+    skip_coarse_scan_with_weighted_seeds: bool
+    max_refine_candidates: int
+    match_method: int  # required key; stored as a raw int
+    refine_window_seconds: float
+    refine_step_seconds: float  # required key
+    content_align_window_seconds: float
+    content_align_sample_step_s: float
+    content_validation_weight: float
+    provisional_content_threshold: float
+    start_tie_break_score_delta: float
+    start_preroll_frames: int
+    sequence_candidate_count: int
+    sequence_min_distance_s: float
+    span_sample_step_s: float
+    trim_tail_frames: int
+    scene_boundary_epsilon_s: float
+    scoreable_luma_mean_min: float
+    scoreable_luma_p90_min: float
+    scoreable_contrast_min: float
+
+
+@dataclass(frozen=True)
+class CVConfig:  # groups the two sub-tables of [cv]
+    vibe_check: VibeCheckConfig
+    deep_scan: DeepScanConfig
+
+
+@dataclass(frozen=True)
+class SceneDetectionConfig:  # [scene_detection] table; both keys are required
+    content_threshold: float
+    min_scene_duration_s: float
+
+
+@dataclass(frozen=True)
+class WhisperConfig:  # [whisper] table; load_config() copies the values without validating them
+    model: str
+    language: str  # an empty string makes transcribe_video() pass language=None
+    device: Literal["cuda", "cpu"]  # type hint only — not enforced at load time
+    compute_type: Literal["float16", "int8", "float32"]  # type hint only — not enforced at load time
+
+
+@dataclass(frozen=True)
+class LLMConfig:  # [llm] table plus the API key that load_config() reads from the environment
+    provider: Literal["ollama", "openai", "openrouter"]  # selects which provider-specific key variable is tried
+    base_url: str
+    model: str
+    timeout_seconds: int
+    temperature: float
+    max_tokens: int
+    # Loaded from .env — NEVER committed to version control
+    api_key: str = ""  # stays empty when no matching environment variable is set
+
+
+@dataclass(frozen=True)
+class VisionConfig:  # optional [vision] table; load_config() supplies a default for every missing key
+    enabled: bool  # defaults to False
+    provider: Literal["openai", "openrouter"]  # type hint only — a TOML value is accepted unvalidated
+    base_url: str  # defaults to the [llm] base_url
+    model: str  # defaults to the [llm] model
+    timeout_seconds: int  # defaults to the [llm] timeout_seconds
+    temperature: float
+    max_tokens: int
+    scene_candidate_top_k: int
+    max_new_descriptions_per_run: int
+    max_seed_scenes: int
+    seed_points_per_scene: int
+    seed_score: float
+    max_refine_candidates: int
+    local_scan_step_s: float
+    local_scan_max_points_per_scene: int
+    local_scan_top_candidates: int
+    local_scan_tie_break_score_delta: float
+    multi_shot_cut_corr_threshold: float
+    multi_shot_boundary_tolerance_s: float
+    fullscan_fallback: bool
+    content_threshold: float
+    similarity_threshold: float
+    api_key: str = ""  # read from environment variables by load_config(), never from TOML
+
+
+@dataclass(frozen=True)
+class ExportConfig:  # [export] table; all three keys are required
+    fcpxml_version: str  # coerced with str() by load_config()
+    edl_frame_rate: float
+    output_format: Literal["fcpxml", "edl", "both"]  # type hint only — not enforced at load time
+
+
+# ---------------------------------------------------------------------------
+# Root config — single object passed through the entire application
+# ---------------------------------------------------------------------------
+
+@dataclass(frozen=True)
+class AppConfig:  # root object returned by load_config(); one attribute per config section
+    project_name: str  # [project].name
+    version: str  # [project].version
+    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"]  # [project].log_level, not validated at load time
+
+    paths: PathsConfig
+    video: VideoConfig
+    cv: CVConfig
+    scene_detection: SceneDetectionConfig
+    whisper: WhisperConfig
+    llm: LLMConfig
+    vision: VisionConfig
+    export: ExportConfig
+
+
+# ---------------------------------------------------------------------------
+# Loader
+# ---------------------------------------------------------------------------
+
+_DEFAULT_CONFIG_PATH = Path(__file__).parents[2] / "config.toml"  # parents[2] of src/core/config.py = project root
+_DEFAULT_ENV_PATH    = Path(__file__).parents[2] / ".env"  # same root directory as config.toml
+
+
+def load_config(
+    config_path: Path = _DEFAULT_CONFIG_PATH,
+    env_path: Path = _DEFAULT_ENV_PATH,
+) -> AppConfig:
+    """
+    Parse config.toml and return a fully-typed, immutable AppConfig.
+
+    API keys are read from the .env file (or existing environment variables);
+    they are never stored in config.toml.
+
+    Args:
+        config_path: Absolute or relative path to the TOML file.
+                     Defaults to <project_root>/config.toml.
+        env_path:    Path to the .env file.
+                     Defaults to <project_root>/.env.
+
+    Raises:
+        FileNotFoundError: If the TOML file does not exist.
+        KeyError / TypeError: If a required key is missing or has the wrong type.
+    """
+    # Load .env first so os.environ is populated before we read it below.
+    if _HAS_DOTENV:
+        _load_dotenv(dotenv_path=env_path, override=False)
+
+    if not config_path.exists():
+        raise FileNotFoundError(
+            f"Config file not found: {config_path}\n"
+            "Copy config.toml.example to config.toml and adjust your paths."
+        )
+
+    with config_path.open("rb") as fh:
+        raw: dict = tomllib.load(fh)
+
+    project = raw["project"]
+    paths_raw = raw["paths"]
+    video_raw = raw["video"]
+    cv_raw = raw["cv"]
+    sd_raw = raw["scene_detection"]
+    whisper_raw = raw["whisper"]
+    llm_raw = raw["llm"]
+    vision_raw = raw.get("vision", {})
+    export_raw = raw["export"]
+
+    # Resolve paths relative to the config file's parent directory so the
+    # project is relocatable, but keep absolute paths as-is.
+    def _resolve(p: str) -> Path:
+        path = Path(p)
+        return path if path.is_absolute() else (config_path.parent / path).resolve()
+
+    paths = PathsConfig(
+        source_movie=_resolve(paths_raw["source_movie"]),
+        reference_trailer=_resolve(paths_raw["reference_trailer"]),
+        output_dir=_resolve(paths_raw["output_dir"]),
+        cache_dir=_resolve(paths_raw["cache_dir"]),
+        proxy_dir=_resolve(paths_raw["proxy_dir"]),
+    )
+
+    video = VideoConfig(
+        extract_fps=float(video_raw["extract_fps"]),
+        proxy_width=int(video_raw["proxy_width"]),
+        proxy_height=int(video_raw["proxy_height"]),
+    )
+
+    vibe_check = VibeCheckConfig(
+        top_k_candidates=int(cv_raw["vibe_check"]["top_k_candidates"]),
+        hist_compare_method=int(cv_raw["vibe_check"]["hist_compare_method"]),
+        hist_bins_hue=int(cv_raw["vibe_check"]["hist_bins_hue"]),
+        hist_bins_saturation=int(cv_raw["vibe_check"]["hist_bins_saturation"]),
+        phash_max_distance=int(cv_raw["vibe_check"]["phash_max_distance"]),
+        crop_top_fraction=float(cv_raw["vibe_check"]["crop_top_fraction"]),
+        crop_bottom_fraction=float(cv_raw["vibe_check"]["crop_bottom_fraction"]),
+    )
+
+    deep_scan = DeepScanConfig(
+        coarse_step_seconds=float(cv_raw["deep_scan"]["coarse_step_seconds"]),
+        match_threshold=float(cv_raw["deep_scan"]["match_threshold"]),
+        provisional_match_threshold=float(cv_raw["deep_scan"].get("provisional_match_threshold", 0.45)),
+        coarse_candidate_threshold=float(cv_raw["deep_scan"].get("coarse_candidate_threshold", cv_raw["deep_scan"]["match_threshold"])),
+        sequence_score_weight=float(cv_raw["deep_scan"].get("sequence_score_weight", 0.55)),
+        span_score_weight=float(cv_raw["deep_scan"].get("span_score_weight", 0.15)),
+        coarse_score_weight=float(cv_raw["deep_scan"].get("coarse_score_weight", 0.10)),
+        duration_score_weight=float(cv_raw["deep_scan"].get("duration_score_weight", 0.20)),
+        duration_tie_break_score_delta=float(cv_raw["deep_scan"].get("duration_tie_break_score_delta", 0.03)),
+        min_duration_coverage=float(cv_raw["deep_scan"].get("min_duration_coverage", 0.65)),
+        continuity_seed_offsets_s=tuple(
+            float(v) for v in cv_raw["deep_scan"].get(
+                "continuity_seed_offsets_s",
+                [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0],
+            )
+        ),
+        scene_seed_top_k=int(cv_raw["deep_scan"].get("scene_seed_top_k", 30)),
+        scene_seed_points_per_scene=int(cv_raw["deep_scan"].get("scene_seed_points_per_scene", 6)),
+        content_rerank_candidate_count=int(cv_raw["deep_scan"].get("content_rerank_candidate_count", 100)),
+        skip_coarse_scan_with_weighted_seeds=bool(cv_raw["deep_scan"].get("skip_coarse_scan_with_weighted_seeds", False)),
+        max_refine_candidates=int(cv_raw["deep_scan"].get("max_refine_candidates", 6)),
+        match_method=int(cv_raw["deep_scan"]["match_method"]),
+        refine_window_seconds=float(cv_raw["deep_scan"].get("refine_window_seconds", 0.6)),
+        refine_step_seconds=float(cv_raw["deep_scan"]["refine_step_seconds"]),
+        content_align_window_seconds=float(cv_raw["deep_scan"].get("content_align_window_seconds", 0.48)),
+        content_align_sample_step_s=float(cv_raw["deep_scan"].get("content_align_sample_step_s", 0.28)),
+        content_validation_weight=float(cv_raw["deep_scan"].get("content_validation_weight", 0.35)),
+        provisional_content_threshold=float(cv_raw["deep_scan"].get("provisional_content_threshold", 0.42)),
+        start_tie_break_score_delta=float(cv_raw["deep_scan"].get("start_tie_break_score_delta", 0.015)),
+        start_preroll_frames=int(cv_raw["deep_scan"].get("start_preroll_frames", 0)),
+        sequence_candidate_count=int(cv_raw["deep_scan"].get("sequence_candidate_count", 240)),
+        sequence_min_distance_s=float(cv_raw["deep_scan"].get("sequence_min_distance_s", 1.0)),
+        span_sample_step_s=float(cv_raw["deep_scan"].get("span_sample_step_s", 0.08)),
+        trim_tail_frames=int(cv_raw["deep_scan"].get("trim_tail_frames", 2)),
+        scene_boundary_epsilon_s=float(cv_raw["deep_scan"].get("scene_boundary_epsilon_s", 0.12)),
+        scoreable_luma_mean_min=float(cv_raw["deep_scan"].get("scoreable_luma_mean_min", 24.0)),
+        scoreable_luma_p90_min=float(cv_raw["deep_scan"].get("scoreable_luma_p90_min", 58.0)),
+        scoreable_contrast_min=float(cv_raw["deep_scan"].get("scoreable_contrast_min", 24.0)),
+    )
+
+    scene_detection = SceneDetectionConfig(
+        content_threshold=float(sd_raw["content_threshold"]),
+        min_scene_duration_s=float(sd_raw["min_scene_duration_s"]),
+    )
+
+    whisper = WhisperConfig(
+        model=whisper_raw["model"],
+        language=whisper_raw["language"],
+        device=whisper_raw["device"],
+        compute_type=whisper_raw["compute_type"],
+    )
+
+    # Resolve API key: env var takes precedence over config (which shouldn't have it).
+    # Supported env vars (in priority order):
+    #   OPENROUTER_API_KEY  → for provider = openrouter
+    #   OPENAI_API_KEY      → for provider = openai
+    #   LLM_API_KEY         → universal fallback
+    _provider = llm_raw["provider"]
+    _api_key = (
+        os.environ.get("OPENROUTER_API_KEY", "")
+        if _provider == "openrouter"
+        else os.environ.get("OPENAI_API_KEY", "")
+        if _provider == "openai"
+        else ""
+    ) or os.environ.get("LLM_API_KEY", "")
+
+    llm = LLMConfig(
+        provider=_provider,
+        base_url=llm_raw["base_url"],
+        model=llm_raw["model"],
+        timeout_seconds=int(llm_raw["timeout_seconds"]),
+        temperature=float(llm_raw["temperature"]),
+        max_tokens=int(llm_raw["max_tokens"]),
+        api_key=_api_key,
+    )
+
+    vision_provider = vision_raw.get("provider", _provider if _provider in ("openai", "openrouter") else "openrouter")
+    vision_api_key = (
+        os.environ.get("OPENROUTER_API_KEY", "")
+        if vision_provider == "openrouter"
+        else os.environ.get("OPENAI_API_KEY", "")
+    ) or os.environ.get("VISION_API_KEY", "") or os.environ.get("LLM_API_KEY", "")
+
+    vision = VisionConfig(
+        enabled=bool(vision_raw.get("enabled", False)),
+        provider=vision_provider,
+        base_url=str(vision_raw.get("base_url", llm.base_url)),
+        model=str(vision_raw.get("model", llm.model)),
+        timeout_seconds=int(vision_raw.get("timeout_seconds", llm.timeout_seconds)),
+        temperature=float(vision_raw.get("temperature", 0.0)),
+        max_tokens=int(vision_raw.get("max_tokens", 350)),
+        scene_candidate_top_k=int(vision_raw.get("scene_candidate_top_k", 8)),
+        max_new_descriptions_per_run=int(vision_raw.get("max_new_descriptions_per_run", 12)),
+        max_seed_scenes=int(vision_raw.get("max_seed_scenes", 3)),
+        seed_points_per_scene=int(vision_raw.get("seed_points_per_scene", 12)),
+        seed_score=float(vision_raw.get("seed_score", 0.88)),
+        max_refine_candidates=int(vision_raw.get("max_refine_candidates", 6)),
+        local_scan_step_s=float(vision_raw.get("local_scan_step_s", 0.12)),
+        local_scan_max_points_per_scene=int(vision_raw.get("local_scan_max_points_per_scene", 180)),
+        local_scan_top_candidates=int(vision_raw.get("local_scan_top_candidates", 18)),
+        local_scan_tie_break_score_delta=float(vision_raw.get("local_scan_tie_break_score_delta", 0.08)),
+        multi_shot_cut_corr_threshold=float(vision_raw.get("multi_shot_cut_corr_threshold", 0.20)),
+        multi_shot_boundary_tolerance_s=float(vision_raw.get("multi_shot_boundary_tolerance_s", 0.20)),
+        fullscan_fallback=bool(vision_raw.get("fullscan_fallback", False)),
+        content_threshold=float(vision_raw.get("content_threshold", 0.22)),
+        similarity_threshold=float(vision_raw.get("similarity_threshold", 0.18)),
+        api_key=vision_api_key,
+    )
+
+    export = ExportConfig(
+        fcpxml_version=str(export_raw["fcpxml_version"]),
+        edl_frame_rate=float(export_raw["edl_frame_rate"]),
+        output_format=export_raw["output_format"],
+    )
+
+    return AppConfig(
+        project_name=project["name"],
+        version=project["version"],
+        log_level=project["log_level"],
+        paths=paths,
+        video=video,
+        cv=CVConfig(vibe_check=vibe_check, deep_scan=deep_scan),
+        scene_detection=scene_detection,
+        whisper=whisper,
+        llm=llm,
+        vision=vision,
+        export=export,
+    )
diff --git a/src/core/models.py b/src/core/models.py
new file mode 100644
index 0000000..838609c
--- /dev/null
+++ b/src/core/models.py
@@ -0,0 +1,287 @@
+"""
+src/core/models.py — Canonical data models for AI Trailer Generator v2
+
+Rules:
+  - Every model is a frozen dataclass (immutable after creation).
+  - All fields are strictly typed; no bare dicts or untyped lists.
+  - Seconds are always float; frame numbers are always int.
+  - Confidence scores live in [0.0, 1.0].
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum, auto
+from pathlib import Path
+from typing import Optional
+
+
+# ===========================================================================
+# Enumerations
+# ===========================================================================
+
+class MatchMethod(Enum):
+    """CV template matching method (mirrors cv2.TM_* constants)."""
+    TM_SQDIFF         = 0  # squared difference; the best match is the MINIMUM
+    TM_SQDIFF_NORMED  = 1  # normalised squared difference; the best match is the MINIMUM
+    TM_CCORR          = 2  # cross-correlation; the best match is the maximum
+    TM_CCORR_NORMED   = 3  # normalised cross-correlation; the best match is the maximum
+    TM_CCOEFF         = 4  # correlation coefficient; the best match is the maximum
+    TM_CCOEFF_NORMED  = 5  # normalised correlation coefficient; the best match is the maximum
+
+
+class BeatType(Enum):
+    """Narrative role of a trailer beat (for dramaturgy / LLM use only)."""
+    HOOK        = auto()   # Opening attention grabber
+    SETUP       = auto()   # World / character introduction
+    CONFLICT    = auto()   # Inciting incident / rising tension
+    CLIMAX      = auto()   # Peak action / emotion
+    RESOLUTION  = auto()   # Cool-down / tagline
+    UNKNOWN     = auto()   # Not classified; the default value of TrailerBeat.beat_type
+
+
+class ExportFormat(Enum):  # export target selector; member values are lowercase strings
+    FCPXML = "fcpxml"  # FCPXML output
+    EDL    = "edl"     # EDL output
+    BOTH   = "both"    # presumably FCPXML and EDL together — verify against the export layer
+
+
+# ===========================================================================
+# Phase 0 — Source-movie scene index
+# ===========================================================================
+
+@dataclass(frozen=True)
+class DialogueLine:
+    """Single transcribed line from Whisper output (immutable)."""
+    start_s: float       # onset in seconds
+    end_s:   float       # offset in seconds
+    text:    str         # verbatim transcript
+    speaker: Optional[str] = None  # diarisation label if available
+
+    @property
+    def duration_s(self) -> float:
+        return self.end_s - self.start_s  # not clamped: negative if end_s < start_s
+
+
+@dataclass(frozen=True)
+class Scene:
+    """
+    One detected scene in the source movie.
+
+    Produced by PySceneDetect; enriched by Whisper dialogue and
+    (optionally) perceptual hashes during the Vibe Check phase.
+    """
+    scene_id:       int          # zero-based index in source movie
+    source_path:    Path         # absolute path to the source video file
+    start_s:        float        # scene start in seconds
+    end_s:          float        # scene end   in seconds
+    start_frame:    int          # first frame number
+    end_frame:      int          # last  frame number
+
+    # Populated after Vibe Check fingerprinting
+    luma_hist:      Optional[bytes]  = None  # serialised np.ndarray (pickle); only unpickle data this tool wrote itself
+    sat_hist:       Optional[bytes]  = None  # presumably pickled like luma_hist — verify against the writer
+    phash:          Optional[str]    = None  # 64-bit hex string
+
+    # Populated after Whisper pass
+    dialogue:       tuple[DialogueLine, ...] = field(default_factory=tuple)  # empty tuple by default
+
+    @property
+    def duration_s(self) -> float:
+        return self.end_s - self.start_s  # scene length in seconds (not clamped)
+
+    @property
+    def midpoint_s(self) -> float:
+        return self.start_s + self.duration_s / 2.0  # timestamp halfway between start_s and end_s
+
+    def __repr__(self) -> str:  # compact form: id, time span and duration only
+        return (
+            f"Scene(id={self.scene_id}, "
+            f"{self.start_s:.2f}s–{self.end_s:.2f}s, "
+            f"dur={self.duration_s:.2f}s)"
+        )
+
+
+# ===========================================================================
+# Phase 1 — Reference-trailer beat
+# ===========================================================================
+
+@dataclass(frozen=True)
+class TrailerBeat:
+    """
+    One cut / segment in the reference trailer.
+
+    The 'beat' is the atomic unit of a trailer:  it maps exactly to one
+    clip that will later be sourced from the original movie.
+    """
+    beat_id:        int          # identifier of this beat within the trailer
+    trailer_path:   Path         # path to the reference trailer video file
+    start_s:        float        # beat start in trailer seconds
+    end_s:          float        # beat end in trailer seconds
+    start_frame:    int          # first frame number (trailer)
+    end_frame:      int          # last frame number (trailer)
+
+    beat_type:      BeatType = BeatType.UNKNOWN  # set by LLM dramaturgy pass
+
+    # Visual fingerprints of the *middle* frame (populated by CV pipeline)
+    luma_hist:      Optional[bytes] = None  # presumably the same serialised form as Scene.luma_hist — verify
+    sat_hist:       Optional[bytes] = None  # presumably the same serialised form as Scene.sat_hist — verify
+    phash:          Optional[str]   = None  # presumably a hex string like Scene.phash — verify
+
+    # Dialogue extracted from this beat
+    dialogue:       tuple[DialogueLine, ...] = field(default_factory=tuple)  # empty tuple by default
+
+    @property
+    def duration_s(self) -> float:
+        return self.end_s - self.start_s  # beat length in seconds (not clamped)
+
+    @property
+    def midpoint_s(self) -> float:
+        return self.start_s + self.duration_s / 2.0  # timestamp halfway between start_s and end_s
+
+    def __repr__(self) -> str:  # compact form: id, beat type name and time span only
+        return (
+            f"TrailerBeat(id={self.beat_id}, "
+            f"{self.beat_type.name}, "
+            f"{self.start_s:.2f}s–{self.end_s:.2f}s)"
+        )
+
+
+# ===========================================================================
+# Phase 2 — CV match result
+# ===========================================================================
+
+@dataclass(frozen=True)
+class VibeHit:
+    """
+    Intermediate result from Phase 1 (Vibe Check — histogram/pHash).
+
+    Represents a *candidate* scene that passed the coarse filter.
+    Not yet a confirmed match; forwarded to Deep Scan.
+    """
+    beat_id:            int     # presumably the TrailerBeat.beat_id this candidate was found for — verify
+    scene_id:           int     # Scene.scene_id of the candidate (Deep Scan uses it as the lookup key)
+    hist_score:         float   # histogram similarity [0.0, 1.0] (CORREL method); NOTE(review): a raw correlation can be negative — confirm it is clamped upstream
+    phash_distance:     int     # Hamming distance [0, 64]; lower = more similar
+    combined_score:     float   # weighted aggregate used for ranking
+
+
+@dataclass(frozen=True)
+class MatchSegment:
+    """
+    One source-backed visual island inside a trailer beat.
+
+    Some trailer beats contain multiple shots separated by fades/title frames.
+    A single continuous source in/out cannot represent those beats accurately.
+    """
+    trailer_offset_s:   float        # presumably seconds from the start of the beat — TODO confirm with the producer
+    duration_s:         float        # length of this island in seconds
+    scene_id:           int          # presumably the Scene.scene_id backing this island — verify
+    in_point_s:         float        # presumably the source-movie in-point in seconds — verify
+    out_point_s:        float        # presumably the source-movie out-point in seconds — verify
+    match_score:        float        # match quality for this island; scale not visible here
+    is_confirmed:       bool = True  # defaults to confirmed; the code that clears it is not visible here
+
+
+@dataclass(frozen=True)
+class MatchResult:
+    """
+    Final, confirmed match from Phase 2 (Deep Scan — template matching).
+
+    One MatchResult per TrailerBeat: the best frame-accurate hit found
+    inside the source movie.
+    """
+    beat_id:            int       # which trailer beat was matched
+    scene_id:           int       # which source scene contains the match
+    source_path:        Path      # absolute path to source video
+
+    # Frame-accurate in-point / out-point in the SOURCE movie
+    in_point_s:         float     # matched frame onset in source seconds
+    out_point_s:        float     # computed out-point (in_point + beat duration)
+    in_point_frame:     int       # matched frame number in source movie
+
+    # Match quality
+    match_score:        float     # cv2.matchTemplate peak value [0.0, 1.0]; NOTE(review): TM_CCOEFF_NORMED can go negative
+    match_location:     tuple[int, int] = field(default_factory=lambda: (0, 0))
+    # (x, y) pixel location of the best match within the source frame
+
+    # Provenance
+    vibe_hit:           Optional[VibeHit] = None  # the candidate that led here
+    is_confirmed:       bool = True  # defaults to confirmed; the code that clears it is not visible here
+    segments:           tuple[MatchSegment, ...] = field(default_factory=tuple)  # per-island matches; empty by default
+
+    @property
+    def duration_s(self) -> float:
+        return self.out_point_s - self.in_point_s  # matched source span in seconds (not clamped)
+
+    def __repr__(self) -> str:  # compact form: beat → scene, in-point and score only
+        return (
+            f"MatchResult(beat={self.beat_id} → scene={self.scene_id}, "
+            f"in={self.in_point_s:.3f}s, score={self.match_score:.3f})"
+        )
+
+
+# ===========================================================================
+# Phase 3 — Edit timeline (pre-export)
+# ===========================================================================
+
+@dataclass(frozen=True)
+class EditClip:
+    """
+    One clip on the final edit timeline, ready for FCPXML / EDL export.
+
+    Combines beat dramaturgy + the CV-confirmed source in/out points.
+    """
+    clip_index:         int        # position on the timeline (0-based)
+    beat:               TrailerBeat  # the trailer beat this clip reproduces
+    match:              MatchResult  # the source-movie match backing this clip
+
+    # Timeline position (in the OUTPUT trailer)
+    timeline_start_s:   float      # clip start on the output timeline, in seconds
+    timeline_end_s:     float      # clip end on the output timeline, in seconds
+    source_duration_s:  float | None = None  # when set, replaces the timeline duration in source_timeline_duration_s
+    trailer_tail_s:     float = 0.0  # not read in this module — TODO document at the point of use
+
+    # Optional audio override (e.g. VO or music)
+    audio_path:         Optional[Path] = None  # None means no audio override
+    audio_offset_s:     float = 0.0  # presumably an offset into audio_path, in seconds — verify
+
+    @property
+    def timeline_duration_s(self) -> float:
+        return self.timeline_end_s - self.timeline_start_s  # not clamped
+
+    @property
+    def source_timeline_duration_s(self) -> float:
+        if self.source_duration_s is not None:
+            return max(0.0, self.source_duration_s)  # an explicit source duration is clamped to >= 0
+        return self.timeline_duration_s  # fall back to the (unclamped) timeline duration
+
+    def __repr__(self) -> str:  # compact form: index, timeline span and source in-point only
+        return (
+            f"EditClip(#{self.clip_index}, "
+            f"tl={self.timeline_start_s:.2f}s–{self.timeline_end_s:.2f}s, "
+            f"src={self.match.in_point_s:.3f}s)"
+        )
+
+
+@dataclass(frozen=True)
+class EditTimeline:
+    """
+    Immutable, ordered run of EditClips that makes up the finished trailer.
+
+    This is the object handed to the export layer (FCPXML / EDL writer).
+    """
+    title:          str
+    frame_rate:     float                   # e.g. 23.976
+    clips:          tuple[EditClip, ...]    # ordered by clip_index
+
+    @property
+    def total_duration_s(self) -> float:
+        """Latest ``timeline_end_s`` over all clips; 0.0 for an empty timeline."""
+        if not self.clips:
+            return 0.0
+        return max(clip.timeline_end_s for clip in self.clips)
+
+    @property
+    def clip_count(self) -> int:
+        return len(self.clips)  # number of clips on the timeline
diff --git a/src/cv/__init__.py b/src/cv/__init__.py
new file mode 100644
index 0000000..4d40340
--- /dev/null
+++ b/src/cv/__init__.py
@@ -0,0 +1 @@
+# src.cv package — Computer Vision engine
diff --git a/src/cv/content_align.py b/src/cv/content_align.py
new file mode 100644
index 0000000..f500a95
--- /dev/null
+++ b/src/cv/content_align.py
@@ -0,0 +1,240 @@
+from __future__ import annotations
+
+import math
+import shutil
+import subprocess
+from pathlib import Path
+
+import numpy as np
+from PIL import Image, ImageFilter, ImageOps
+
+from src.core.config import AppConfig
+from src.core.models import TrailerBeat
+
+
+def _run(cmd: list[str]) -> None:  # run *cmd*; a non-zero exit raises RuntimeError carrying its stderr
+    completed = subprocess.run(cmd, capture_output=True)
+    if completed.returncode:
+        raise RuntimeError(completed.stderr.decode(errors="replace"))
+
+
+def _extract_frames(
+    video_path: Path,
+    start_s: float,
+    duration_s: float,
+    fps: float,
+    out_dir: Path,
+    prefix: str,
+) -> None:
+    """Write 640x360 PNG frames of a clip to *out_dir* as ``<prefix>_NNNN.png`` via ffmpeg."""
+    out_dir.mkdir(parents=True, exist_ok=True)
+    seek_s = str(max(0.0, start_s))        # never seek before the start of the file
+    length_s = str(max(0.04, duration_s))  # always request at least 0.04 s
+    pattern = str(out_dir / f"{prefix}_%04d.png")
+    video_filter = f"scale=640:360,fps={fps}"
+    command = ["ffmpeg", "-y", "-loglevel", "error", "-ss", seek_s, "-i", str(video_path)]
+    command += ["-t", length_s, "-vf", video_filter, pattern]
+    _run(command)
+
+
+def _cropped_image(path: Path, cfg: AppConfig) -> Image.Image:
+    # Greyscale frame with dark borders trimmed and only the middle 90 % of
+    # its height kept. Final validation should see the composition: the
+    # broader text-safe crop used for coarse search can remove bodies,
+    # furniture and lower-frame spatial cues that distinguish otherwise
+    # similar face/window shots. (*cfg* is not read here.)
+    trimmed = _trim_dark_borders(Image.open(path).convert("L"))
+    width, height = trimmed.size
+    band = (0, int(height * 0.05), width, int(height * 0.95))
+    return trimmed.crop(band)
+
+
+def _trim_dark_borders(image: Image.Image) -> Image.Image:
+    """
+    Remove encoded black matte/pillarbox borders before content scoring.
+
+    A row/column counts as content when its 90th-percentile luma exceeds 18.
+    The input is returned untouched if the crop would keep < 35 % of a side.
+    """
+    luma = np.asarray(image.convert("L"), dtype=np.float32)
+    if luma.size == 0:
+        return image
+    height, width = luma.shape[:2]
+
+    def _active_span(signal: np.ndarray, extent: int) -> tuple[int, int]:
+        # First-to-last bright index with 2 px slack; the whole axis when too few are bright.
+        bright = np.where(signal > 18.0)[0]
+        if bright.size < max(8, int(extent * 0.35)):
+            return 0, extent
+        return max(0, int(bright[0]) - 2), min(extent, int(bright[-1]) + 3)
+
+    x0, x1 = _active_span(np.percentile(luma, 90, axis=0), width)
+    y0, y1 = _active_span(np.percentile(luma, 90, axis=1), height)
+    if x1 - x0 < int(width * 0.35) or y1 - y0 < int(height * 0.35):
+        return image
+    return image.crop((x0, y0, x1, y1))
+
+
+def _feature(path: Path, cfg: AppConfig) -> np.ndarray:  # z-scored edge map of the frame centre
+    frame = _cropped_image(path, cfg)
+    width, height = frame.size
+    centre = (int(width * 0.10), int(height * 0.10), int(width * 0.90), int(height * 0.90))
+    edges = ImageOps.equalize(frame.crop(centre)).filter(ImageFilter.FIND_EDGES).resize((160, 62))
+    pixels = np.asarray(edges, dtype=np.float32)
+    return (pixels - pixels.mean()) / (pixels.std() + 1e-6)
+
+
+def _luma_feature(path: Path, cfg: AppConfig) -> np.ndarray:  # z-scored, equalised 160x80 luma map
+    equalised = ImageOps.equalize(_cropped_image(path, cfg))
+    pixels = np.asarray(equalised.resize((160, 80)), dtype=np.float32)
+    return (pixels - pixels.mean()) / (pixels.std() + 1e-6)
+
+
+def _hist_feature(path: Path, cfg: AppConfig) -> np.ndarray:
+    """Concatenated 32-bin histograms of the three RGB channels (96 values)."""
+    image = _trim_dark_borders(Image.open(path).convert("RGB"))
+    width, height = image.size
+    band = image.crop((0, int(height * 0.05), width, int(height * 0.95)))
+    pixels = np.asarray(band.resize((160, 80)), dtype=np.float32)
+    normalised = []
+    for channel in range(3):
+        counts = np.histogram(pixels[:, :, channel], bins=32, range=(0, 255))[0].astype(np.float32)
+        # Divide by the bin total so each channel histogram sums to ~1.
+        normalised.append(counts / (counts.sum() + 1e-6))
+    return np.concatenate(normalised)
+
+
+def _spatial_hist_feature(path: Path, cfg: AppConfig) -> np.ndarray:
+    """Concatenated 16-bin RGB histograms of a 4x4 grid over the frame (768 values)."""
+    image = _trim_dark_borders(Image.open(path).convert("RGB"))
+    width, height = image.size
+    band = image.crop((0, int(height * 0.05), width, int(height * 0.95)))
+    pixels = np.asarray(band.resize((160, 80)), dtype=np.float32)
+    rows, cols = 4, 4
+    cell_h, cell_w = pixels.shape[0] // rows, pixels.shape[1] // cols
+    histograms = []
+    for row in range(rows):
+        y_slice = slice(row * cell_h, (row + 1) * cell_h)
+        for col in range(cols):
+            x_slice = slice(col * cell_w, (col + 1) * cell_w)
+            for channel in range(3):
+                # Each cell/channel histogram is normalised to sum to ~1.
+                values = pixels[y_slice, x_slice, channel]
+                counts = np.histogram(values, bins=16, range=(0, 255))[0].astype(np.float32)
+                histograms.append(counts / (counts.sum() + 1e-6))
+    return np.concatenate(histograms)
+
+
+def _is_dark(path: Path, cfg: AppConfig) -> bool:
+    # True when the frame is near-black: mean luma < 28 AND 90th percentile < 58.
+    image = _trim_dark_borders(Image.open(path).convert("L"))
+    width, height = image.size
+    band = image.crop((0, int(height * 0.05), width, int(height * 0.95)))
+    luma = np.asarray(band, dtype=np.float32)
+    return float(luma.mean()) < 28.0 and float(np.percentile(luma, 90)) < 58.0
+
+
+def _corr(a: np.ndarray, b: np.ndarray) -> float:  # mean of the element-wise product
+    return float(np.mean(a * b))
+
+
+def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float:  # sum of minima over sum of maxima
+    return float(np.sum(np.minimum(a, b)) / (np.sum(np.maximum(a, b)) + 1e-6))
+
+
+def _frame_features(path: Path, cfg: AppConfig) -> tuple[np.ndarray, ...]:
+    """Return the (edge, luma, colour-hist, spatial-hist) descriptors of one frame."""
+    return (_feature(path, cfg), _luma_feature(path, cfg),
+            _hist_feature(path, cfg), _spatial_hist_feature(path, cfg))
+
+
+def _score_features(ref: tuple[np.ndarray, ...], src: tuple[np.ndarray, ...]) -> float:
+    """Blend the four descriptor similarities of two frames into one score."""
+    return (
+        _corr(ref[0], src[0]) * 0.24
+        + _corr(ref[1], src[1]) * 0.24
+        + _hist_intersection(ref[2], src[2]) * 0.14
+        + _hist_intersection(ref[3], src[3]) * 0.38
+    )
+
+
+def _paired_frame_score(ref_path: Path, src_path: Path, cfg: AppConfig) -> float:
+    """Similarity of two frame images on disk (higher is more alike)."""
+    return _score_features(_frame_features(ref_path, cfg), _frame_features(src_path, cfg))
+
+
+def align_cached_match_by_content(
+    beat: TrailerBeat,
+    estimated_in_point_s: float,
+    cfg: AppConfig,
+    search_window_s: float | None = None,
+    fps: float = 25.0,
+) -> tuple[float, float]:
+    """
+    Measure the local source offset directly from rendered frame content.
+
+    This is intentionally independent from the global OpenCV matcher: it only
+    needs FFmpeg, Pillow and numpy, and it scans a small window around an
+    already plausible candidate.
+
+    Returns (in_point_s, score) with the score clamped to >= 0.0; the estimate
+    itself is returned with score 0.0 when no frames could be extracted.
+    """
+    ds = cfg.cv.deep_scan
+    window_s = ds.content_align_window_seconds if search_window_s is None else search_window_s
+    sample_step_s = max(1.0 / fps, ds.content_align_sample_step_s)
+    source_start_s = max(0.0, estimated_in_point_s - window_s)
+    source_duration_s = beat.duration_s + (2.0 * window_s) + 0.5
+
+    tmp = cfg.paths.output_dir / "align_tmp" / f"beat_{beat.beat_id:03d}"
+    shutil.rmtree(tmp, ignore_errors=True)
+    tmp.mkdir(parents=True, exist_ok=True)
+    try:
+        ref_dir, src_dir = tmp / "ref", tmp / "src"
+        _extract_frames(beat.trailer_path, beat.start_s, beat.duration_s, fps, ref_dir, "ref")
+        _extract_frames(cfg.paths.source_movie, source_start_s, source_duration_s, fps, src_dir, "src")
+
+        ref_frames = sorted(ref_dir.glob("ref_*.png"))
+        src_frames = sorted(src_dir.glob("src_*.png"))
+        if not ref_frames or not src_frames:
+            return estimated_in_point_s, 0.0
+
+        sample_frame_step = max(1, int(round(sample_step_s * fps)))
+        min_matchable_frames = max(1, len(ref_frames) - int(round(0.24 * fps)))
+        sampled = range(0, min_matchable_frames, sample_frame_step)
+        template_offsets = [idx for idx in sampled if not _is_dark(ref_frames[idx], cfg)]
+        if len(template_offsets) < 3:
+            template_offsets = list(sampled)  # too few bright samples: use every sample, dark or not
+        # NOTE(review): beats that yield fewer than 3 samples can never be scored — confirm intent.
+        required_scores = max(3, math.ceil(len(template_offsets) * 0.65))
+
+        # Decode each extracted PNG once: every frame takes part in many comparisons.
+        cache: dict[Path, tuple[np.ndarray, ...]] = {}
+
+        def _features(path: Path) -> tuple[np.ndarray, ...]:
+            if path not in cache:
+                cache[path] = _frame_features(path, cfg)
+            return cache[path]
+
+        search_end_frame = max(0, len(src_frames) - min_matchable_frames)
+        estimated_frame = int(round((estimated_in_point_s - source_start_s) * fps))
+        best_frame = estimated_frame
+        best_score = -1.0
+        for candidate_frame in range(search_end_frame + 1):
+            scores: list[float] = []
+            for offset_frame in template_offsets:
+                src_idx = candidate_frame + offset_frame
+                if src_idx >= len(src_frames):
+                    break
+                scores.append(_score_features(_features(ref_frames[offset_frame]), _features(src_frames[src_idx])))
+            if len(scores) < required_scores:
+                continue
+            score = (sum(scores) / len(scores)) * 0.68 + min(scores) * 0.32
+            if score > best_score + 0.003:
+                best_score = score
+                best_frame = candidate_frame
+            elif score >= best_score - 0.003 and abs(candidate_frame - estimated_frame) < abs(best_frame - estimated_frame):
+                best_frame = candidate_frame  # near-tie: prefer the candidate closest to the estimate
+        return source_start_s + (best_frame / fps), max(0.0, best_score)
+    finally:
+        shutil.rmtree(tmp, ignore_errors=True)
diff --git a/src/cv/deep_scan.py b/src/cv/deep_scan.py
new file mode 100644
index 0000000..d8adcab
--- /dev/null
+++ b/src/cv/deep_scan.py
@@ -0,0 +1,253 @@
+"""
+src/cv/deep_scan.py — Phase 2: Frame-accurate template matching (Deep Scan)
+
+Responsibility:
+  Given a TrailerBeat and a ranked list of VibeHit candidates, open the
+  source video and scan each candidate scene in two passes:
+
+    1. Coarse pass:  step through at coarse_step_seconds intervals,
+                     comparing via cv2.matchTemplate.
+    2. Refine pass:  if coarse score > threshold, zoom in ± refine_window_seconds
+                     at refine_step_seconds resolution to pin the exact in-point.
+
+Returns a MatchResult if a confident hit is found, otherwise None.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Sequence
+
+import cv2
+import numpy as np
+
+from src.core.config import AppConfig
+from src.core.models import MatchResult, Scene, TrailerBeat, VibeHit
+from src.cv.fingerprinting import text_safe_crop
+from src.cv.frame_extractor import (
+    grab_frame_at,
+    grab_frame_at_path,
+    iter_frames_stepped,
+    open_video,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Template preparation
+# ---------------------------------------------------------------------------
+
+def _prepare_template(
+    trailer_beat: TrailerBeat,
+    cfg: AppConfig,
+    proxy_w: int,
+    proxy_h: int,
+) -> np.ndarray | None:
+    """
+    Extract, crop, and resize the representative frame from the trailer beat.
+
+    This frame becomes the cv2.matchTemplate "needle".
+
+    Returns:
+        The midpoint frame, text-safe cropped, resized to the proxy size and
+        trimmed by 10% per side; None if the frame cannot be decoded.
+    """
+    vc = cfg.cv.vibe_check
+
+    beat_frame = grab_frame_at_path(trailer_beat.trailer_path, trailer_beat.midpoint_s)
+    if beat_frame is None:
+        logger.warning("Beat %d: cannot decode midpoint frame.", trailer_beat.beat_id)
+        return None
+
+    cropped = text_safe_crop(beat_frame, vc.crop_top_fraction, vc.crop_bottom_fraction)
+    resized = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA)
+
+    # Crop the template by 10% on all sides to allow sliding window (translation invariance)
+    # when matching against the source movie, which might have slight pan/scan shifts.
+    margin_y = int(proxy_h * 0.10)
+    margin_x = int(proxy_w * 0.10)
+    template = resized[margin_y : proxy_h - margin_y, margin_x : proxy_w - margin_x]
+
+    return template
+
+
+# ---------------------------------------------------------------------------
+# Single-frame match
+# ---------------------------------------------------------------------------
+
+def _match_frame(
+    source_frame: np.ndarray,
+    template: np.ndarray,
+    method: int,
+    proxy_w: int,
+    proxy_h: int,
+    crop_top: float,
+    crop_bottom: float,
+) -> tuple[float, tuple[int, int]]:
+    """
+    Run cv2.matchTemplate between *source_frame* and *template*.
+
+    Returns (score, (x, y)); higher is better (TM_CCOEFF_NORMED: [-1, 1]).
+    TM_SQDIFF* peak at their minimum, so ``1 - min`` is returned for them.
+    """
+    cropped = text_safe_crop(source_frame, crop_top, crop_bottom)
+    haystack = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA)
+    result = cv2.matchTemplate(haystack, template, method)  # template is smaller than the haystack
+    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
+    if method in (cv2.TM_SQDIFF, cv2.TM_SQDIFF_NORMED):
+        return 1.0 - float(min_val), (int(min_loc[0]), int(min_loc[1]))
+    return float(max_val), (int(max_loc[0]), int(max_loc[1]))
+
+
+# ---------------------------------------------------------------------------
+# Deep Scan core
+# ---------------------------------------------------------------------------
+
+def scan_scene(
+    beat: TrailerBeat,
+    scene: Scene,
+    template: np.ndarray,
+    cfg: AppConfig,
+) -> tuple[float, float, tuple[int, int]] | None:
+    """
+    Locate *template* inside one source scene with a coarse-then-fine sweep.
+
+    The scene is first sampled every ``coarse_step_seconds``. If the best
+    coarse score reaches ``match_threshold``, the neighbourhood of that hit
+    (± ``refine_window_seconds``, clamped to the scene) is re-sampled every
+    ``refine_step_seconds``.
+
+    Args:
+        beat: Used only in the debug log line.
+
+    Returns:
+        (best_timestamp_s, best_score, best_location) or None if no hit.
+    """
+    settings = cfg.cv.deep_scan
+    crop_cfg = cfg.cv.vibe_check
+    width = cfg.video.proxy_width
+    height = cfg.video.proxy_height
+
+    def _sweep(cap, start_s, end_s, step_s, best):
+        # Score every frame iter_frames_stepped yields; keep the strictly
+        # highest-scoring (timestamp, score, location), seeded with *best*.
+        for timestamp, frame in iter_frames_stepped(cap, start_s, end_s, step_s):
+            score, location = _match_frame(
+                frame, template, settings.match_method, width, height,
+                crop_cfg.crop_top_fraction, crop_cfg.crop_bottom_fraction,
+            )
+            if score > best[1]:
+                best = (timestamp, score, location)
+        return best
+
+    with open_video(scene.source_path) as cap:
+        # ---- Coarse pass ----------------------------------------------------
+        coarse = _sweep(
+            cap, scene.start_s, scene.end_s, settings.coarse_step_seconds,
+            (scene.start_s, 0.0, (0, 0)),
+        )
+        coarse_t, coarse_score, _ = coarse
+        if coarse_score < settings.match_threshold:
+            return None  # scene doesn't contain a match worth refining
+
+        # ---- Refine pass ----------------------------------------------------
+        window = settings.refine_window_seconds
+        refined = _sweep(
+            cap,
+            max(scene.start_s, coarse_t - window),
+            min(scene.end_s, coarse_t + window),
+            settings.refine_step_seconds,
+            coarse,
+        )
+
+    refined_t, refined_score, refined_loc = refined
+    logger.debug(
+        "Beat %d → Scene %d: coarse=%.3f refined=%.3f @%.3fs",
+        beat.beat_id, scene.scene_id, coarse_score, refined_score, refined_t,
+    )
+    return refined_t, refined_score, refined_loc
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def run_deep_scan(
+    beat: TrailerBeat,
+    candidates: Sequence[VibeHit],
+    scenes_by_id: dict[int, Scene],
+    cfg: AppConfig,
+) -> MatchResult | None:
+    """
+    Phase 2 Deep Scan: iterate over Vibe Check candidates and template-match.
+
+    Args:
+        beat:          The trailer beat to source.
+        candidates:    Ranked VibeHit list from Phase 1 (best first).
+        scenes_by_id:  Lookup dict: scene_id → Scene.
+        cfg:           Application configuration.
+
+    Returns:
+        The best MatchResult above threshold, or None if no match found.
+    """
+    from src.cv.frame_extractor import get_video_info  # hoisted out of the candidate loop
+
+    early_exit_score = 0.90  # a hit at least this strong ends the candidate search
+    template = _prepare_template(beat, cfg, cfg.video.proxy_width, cfg.video.proxy_height)
+    if template is None:
+        return None
+
+    best_result: MatchResult | None = None
+    fps_by_path: dict[Path, float] = {}  # probe each source file only once
+
+    for vibe_hit in candidates:
+        scene = scenes_by_id.get(vibe_hit.scene_id)
+        if scene is None:
+            logger.warning("VibeHit references unknown scene_id=%d", vibe_hit.scene_id)
+            continue
+
+        hit = scan_scene(beat, scene, template, cfg)
+        if hit is None:
+            continue
+        in_point_s, match_score, match_loc = hit
+
+        # Frame number: approximate via FPS (refined later if needed)
+        if scene.source_path not in fps_by_path:
+            fps_by_path[scene.source_path] = float(get_video_info(scene.source_path)["fps"]) or 24.0
+        fps = fps_by_path[scene.source_path]
+        # The epsilon keeps (k / fps) * fps from truncating to k - 1 through float error.
+        in_point_frame = int(in_point_s * fps + 1e-6)
+
+        candidate_result = MatchResult(
+            beat_id=beat.beat_id,
+            scene_id=scene.scene_id,
+            source_path=scene.source_path,
+            in_point_s=in_point_s,
+            out_point_s=in_point_s + beat.duration_s,
+            in_point_frame=in_point_frame,
+            match_score=match_score,
+            match_location=match_loc,
+            vibe_hit=vibe_hit,
+        )
+        if best_result is None or match_score > best_result.match_score:
+            best_result = candidate_result
+
+        # Early exit: if score is very high, no need to check other candidates
+        if match_score >= early_exit_score:
+            logger.info(
+                "Beat %d: early-exit match (score=%.3f) in scene %d @%.3fs",
+                beat.beat_id, match_score, scene.scene_id, in_point_s,
+            )
+            break
+
+    if best_result is not None:
+        logger.info("Beat %d → MATCH scene=%d score=%.3f in=%.3fs",
+                    beat.beat_id, best_result.scene_id,
+                    best_result.match_score, best_result.in_point_s)
+    else:
+        logger.warning("Beat %d → NO MATCH found in %d candidates.",
+                       beat.beat_id, len(candidates))
+
+    return best_result
diff --git a/src/cv/fingerprinting.py b/src/cv/fingerprinting.py
new file mode 100644
index 0000000..dc0c0b0
--- /dev/null
+++ b/src/cv/fingerprinting.py
@@ -0,0 +1,228 @@
+"""
+src/cv/fingerprinting.py — Image fingerprinting for the Vibe Check phase
+
+Responsibilities (Single Responsibility Principle):
+  - Text-Safe Crop: strip top/bottom fractions to hide logos & letterbox
+  - Luma + Saturation histogram extraction (scale-invariant)
+  - Perceptual hash (pHash) via imagehash
+
+This module is PURELY functional — no file I/O, no video decoding,
+no search logic. It takes numpy arrays and returns numeric descriptors.
+"""
+
+from __future__ import annotations
+
+import pickle
+from typing import TYPE_CHECKING
+
+import cv2
+import numpy as np
+
+try:
+    import imagehash
+    from PIL import Image as PilImage
+    _HAS_IMAGEHASH = True
+except ImportError:
+    _HAS_IMAGEHASH = False
+
+if TYPE_CHECKING:
+    from src.core.config import VibeCheckConfig
+
+
+# ---------------------------------------------------------------------------
+# Text-Safe Crop
+# ---------------------------------------------------------------------------
+
+def text_safe_crop(
+    frame: np.ndarray,
+    crop_top: float,
+    crop_bottom: float,
+) -> np.ndarray:
+    """
+    Slice away horizontal bands at the top and bottom of a frame.
+
+    Intended to drop title cards / logos (top band) and letterbox bars /
+    subtitles (bottom band) so they cannot skew later colour analysis.
+
+    Args:
+        frame:       BGR or greyscale frame as (H, W[, C]) ndarray.
+        crop_top:    Fraction [0, 1) of the height dropped from the top.
+        crop_bottom: Fraction [0, 1) of the height dropped from the bottom.
+
+    Returns:
+        Row-sliced view of ``frame``; pixel data is shared, not copied.
+
+    Raises:
+        ValueError: If a fraction is outside [0, 1) or the two sum to >= 1.
+    """
+    top_valid = 0.0 <= crop_top < 1.0
+    if not top_valid:
+        raise ValueError(f"crop_top must be in [0, 1); got {crop_top}")
+    bottom_valid = 0.0 <= crop_bottom < 1.0
+    if not bottom_valid:
+        raise ValueError(f"crop_bottom must be in [0, 1); got {crop_bottom}")
+    total = crop_top + crop_bottom
+    if total >= 1.0:
+        raise ValueError(f"crop_top ({crop_top}) + crop_bottom ({crop_bottom}) must be < 1.0")
+
+    height = frame.shape[0]
+    first_row, stop_row = int(height * crop_top), int(height * (1.0 - crop_bottom))
+    return frame[first_row:stop_row]
+
+
+# ---------------------------------------------------------------------------
+# Histogram extraction
+# ---------------------------------------------------------------------------
+
+def extract_hs_histograms(
+    frame_bgr: np.ndarray,
+    bins_luma: int | None = None,
+    bins_sat: int | None = None,
+    *,
+    bins_hue: int | None = None,
+) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Build L2-normalised luma and saturation histograms for a BGR frame.
+
+    Hue is deliberately left out: it shifts too easily under the colour
+    grading differences between a trailer and its source movie.
+
+    Args:
+        frame_bgr: BGR frame (H, W, 3) uint8.
+        bins_luma: Bin count for the greyscale luma histogram over [0, 256).
+        bins_sat:  Bin count for the HSV saturation histogram over [0, 256).
+        bins_hue:  Keyword-only legacy alias for bins_luma.
+
+    Returns:
+        (luma_hist, sat_hist) as flat float32 ndarrays.
+
+    Raises:
+        ValueError: If bins_luma and bins_hue are both given but differ.
+        TypeError:  If no luma bin count or no bins_sat was supplied.
+    """
+    if bins_luma is not None:
+        if bins_hue is not None and bins_hue != bins_luma:
+            raise ValueError("bins_hue is an alias for bins_luma; pass only one value")
+    else:
+        bins_luma = bins_hue
+    if bins_luma is None or bins_sat is None:
+        raise TypeError("bins_luma/bins_hue and bins_sat are required")
+
+    hsv = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV)
+    # Perceptual greyscale instead of HSV Value: Value would rate saturated
+    # red and blue as identical and weaken the scene-level filter.
+    grey = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
+
+    full_range = [0, 256]
+    luma_hist = cv2.calcHist([grey], [0], None, [bins_luma], full_range)
+    sat_hist = cv2.calcHist([hsv], [1], None, [bins_sat], full_range)
+    luma_hist = luma_hist.flatten().astype(np.float32)
+    sat_hist = sat_hist.flatten().astype(np.float32)
+    for hist in (luma_hist, sat_hist):  # in-place L2 normalisation
+        cv2.normalize(hist, hist, alpha=1.0, norm_type=cv2.NORM_L2)
+    return luma_hist, sat_hist
+
+
+def compare_histograms(
+    hist_a: np.ndarray,
+    hist_b: np.ndarray,
+    method: int,
+) -> float:
+    """
+    Score the similarity of two histograms with cv2.compareHist.
+
+    Args:
+        hist_a, hist_b: 1-D float32 ndarrays of identical shape.
+        method:         cv2.HISTCMP_* constant selecting the metric.
+
+    Returns:
+        The metric's raw value as a float; range and direction depend on
+        ``method`` (CORREL: higher is closer, BHATTACHARYYA: lower is closer).
+    """
+    raw_score = cv2.compareHist(hist_a, hist_b, method)
+    return float(raw_score)
+
+
+# ---------------------------------------------------------------------------
+# Perceptual Hash
+# ---------------------------------------------------------------------------
+
+def compute_phash(frame_bgr: np.ndarray, hash_size: int = 8) -> str:
+    """
+    Compute a perceptual hash (pHash) of a BGR frame via imagehash.phash.
+
+    pHash is robust to rescaling (it is NOT rotation-invariant), so it still
+    matches when trailer proxy and source movie differ in resolution.
+
+    Args:
+        frame_bgr: BGR frame (H, W, 3) uint8; converted to RGB for PIL.
+        hash_size: Hash side length; the hash has hash_size**2 bits (8 → 64).
+
+    Returns:
+        Hex string of the hash (16 hex digits for the default 64-bit hash).
+
+    Raises:
+        RuntimeError: If imagehash is not installed.
+    """
+    if not _HAS_IMAGEHASH:
+        raise RuntimeError(
+            "imagehash is not installed. Run: pip install imagehash"
+        )
+    rgb   = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+    pil   = PilImage.fromarray(rgb)
+    phash = imagehash.phash(pil, hash_size=hash_size)
+    return str(phash)
+
+
+def phash_distance(hash_a: str, hash_b: str) -> int:
+    """Count the differing bits between two pHash hex strings.
+
+    Args:
+        hash_a, hash_b: Hex strings as returned by compute_phash().
+    Returns:
+        Hamming distance as an int; 0 = identical ([0, 64] for 64-bit hashes).
+    Raises:
+        RuntimeError: If imagehash is not installed.
+    """
+    if _HAS_IMAGEHASH:
+        return int(imagehash.hex_to_hash(hash_a) - imagehash.hex_to_hash(hash_b))
+    raise RuntimeError("imagehash is not installed.")
+
+
+# ---------------------------------------------------------------------------
+# Serialisation helpers (histograms ↔ bytes for caching)
+# ---------------------------------------------------------------------------
+
+def hist_to_bytes(hist: np.ndarray) -> bytes:
+    """Pickle a histogram ndarray to bytes for storage in a Scene/Beat model."""
+    return pickle.dumps(hist, pickle.HIGHEST_PROTOCOL)
+
+
+def bytes_to_hist(data: bytes) -> np.ndarray:
+    """Unpickle a histogram array; unpickling can execute code, so pass trusted bytes only."""
+    return pickle.loads(data)  # noqa: S301  (trusted internal cache only)
+
+
+# ---------------------------------------------------------------------------
+# High-level convenience: fingerprint one frame using config
+# ---------------------------------------------------------------------------
+
+def fingerprint_frame(
+    frame_bgr: np.ndarray,
+    cfg: "VibeCheckConfig",
+) -> tuple[bytes, bytes, str]:
+    """
+    Text-safe crop a frame, then fingerprint the cropped view in one call.
+
+    Args:
+        frame_bgr: Full BGR frame (H, W, 3) uint8.
+        cfg:       VibeCheckConfig; hist_bins_hue is used as the luma bin count.
+
+    Returns:
+        (luma_hist_bytes, sat_hist_bytes, phash_hex)
+    """
+    safe_area = text_safe_crop(frame_bgr, cfg.crop_top_fraction, cfg.crop_bottom_fraction)
+    histograms = extract_hs_histograms(safe_area, cfg.hist_bins_hue, cfg.hist_bins_saturation)
+    phash_hex = compute_phash(safe_area)
+    luma_bytes, sat_bytes = (hist_to_bytes(hist) for hist in histograms)
+    return luma_bytes, sat_bytes, phash_hex
diff --git a/src/cv/frame_extractor.py b/src/cv/frame_extractor.py
new file mode 100644
index 0000000..5cedd19
--- /dev/null
+++ b/src/cv/frame_extractor.py
@@ -0,0 +1,172 @@
+"""
+src/cv/frame_extractor.py — Low-level video frame access
+
+Responsibility:
+  Provide a thin, testable wrapper around cv2.VideoCapture for:
+    - seeking to an exact timestamp and returning one BGR frame
+    - iterating frames with a configurable step size
+    - extracting the "representative" middle frame of a Scene / TrailerBeat
+
+No fingerprinting, no matching — only raw frame delivery.
+"""
+
+from __future__ import annotations
+
+import logging
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator, Iterator
+
+import cv2
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Context-managed VideoCapture
+# ---------------------------------------------------------------------------
+
+@contextmanager
+def open_video(path: Path) -> Generator[cv2.VideoCapture, None, None]:
+    """
+    Yield an opened cv2.VideoCapture and release it when the block exits.
+
+    Args:
+        path: Absolute path to the video file.
+
+    Raises:
+        FileNotFoundError: If ``path`` does not exist on disk.
+        RuntimeError:      If OpenCV reports the capture as not opened.
+    """
+    if not path.exists():
+        raise FileNotFoundError(f"Video not found: {path}")
+
+    capture = cv2.VideoCapture(str(path))
+    opened = capture.isOpened()
+    if not opened:
+        raise RuntimeError(f"OpenCV could not open video: {path}")
+    try:
+        yield capture
+    finally:
+        capture.release()
+
+
+# ---------------------------------------------------------------------------
+# Video metadata
+# ---------------------------------------------------------------------------
+
+def get_video_info(path: Path) -> dict[str, float | int]:
+    """
+    Read basic stream metadata; the capture is released before returning.
+
+    Returns:
+        dict with keys: fps, frame_count, duration_s, width, height.
+        duration_s is frame_count / fps, or 0.0 when fps is not positive.
+    """
+    with open_video(path) as cap:
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_count, width, height = (
+            int(cap.get(prop))
+            for prop in (
+                cv2.CAP_PROP_FRAME_COUNT,
+                cv2.CAP_PROP_FRAME_WIDTH,
+                cv2.CAP_PROP_FRAME_HEIGHT,
+            )
+        )
+    duration_s = frame_count / fps if fps > 0 else 0.0
+    keys = ("fps", "frame_count", "duration_s", "width", "height")
+    return dict(zip(keys, (fps, frame_count, duration_s, width, height)))
+
+
+# ---------------------------------------------------------------------------
+# Single frame extraction
+# ---------------------------------------------------------------------------
+
+def grab_frame_at(cap: cv2.VideoCapture, timestamp_s: float) -> np.ndarray | None:
+    """
+    Position *cap* at *timestamp_s* and decode the next frame.
+
+    The seek is requested in milliseconds through CAP_PROP_POS_MSEC.
+
+    Args:
+        cap:         An already-open VideoCapture.
+        timestamp_s: Target time in seconds.
+
+    Returns:
+        BGR ndarray (H, W, 3), or None when the read did not yield a frame.
+    """
+    cap.set(cv2.CAP_PROP_POS_MSEC, timestamp_s * 1000.0)
+    ok, frame = cap.read()
+    if ok and frame is not None:
+        return frame
+    logger.debug("grab_frame_at: failed at %.3fs", timestamp_s)
+    return None
+
+
+def grab_frame_at_path(path: Path, timestamp_s: float) -> np.ndarray | None:
+    """
+    One-shot open → seek → grab → release; prefer open_video() for many frames.
+    """
+    with open_video(path) as capture:
+        frame = grab_frame_at(capture, timestamp_s)
+    return frame
+
+
+# ---------------------------------------------------------------------------
+# Middle-frame extraction (representative frame for fingerprinting)
+# ---------------------------------------------------------------------------
+
+def grab_midpoint_frame(
+    cap: cv2.VideoCapture,
+    start_s: float,
+    end_s: float,
+) -> np.ndarray | None:
+    """
+    Grab the frame halfway between *start_s* and *end_s*.
+
+    Args:
+        cap:     Open VideoCapture for the source video.
+        start_s: Interval start in seconds.
+        end_s:   Interval end in seconds.
+
+    Returns:
+        BGR frame or None if decoding failed.
+    """
+    half_span_s = (end_s - start_s) / 2.0
+    return grab_frame_at(cap, start_s + half_span_s)
+
+
+# ---------------------------------------------------------------------------
+# Stepped-frame iterator (used by Deep Scan coarse pass)
+# ---------------------------------------------------------------------------
+
+def iter_frames_stepped(
+    cap: cv2.VideoCapture,
+    start_s: float,
+    end_s: float,
+    step_s: float,
+) -> Iterator[tuple[float, np.ndarray]]:
+    """
+    Yield (timestamp_s, frame) for every *step_s* increment in [start_s, end_s].
+
+    Sample k > 0 sits at round(start_s + k * step_s, 6). Deriving it from k
+    rather than from the previous sample stops rounding error accumulating
+    and keeps the loop advancing even when step_s is below the rounding grid.
+    Frames that fail to decode are silently skipped.
+
+    Args:
+        cap:     Open VideoCapture.
+        start_s: Scan window start in seconds.
+        end_s:   Scan window end   in seconds.
+        step_s:  Step between samples in seconds; ValueError if not > 0.
+    """
+    if step_s <= 0:
+        raise ValueError(f"step_s must be > 0; got {step_s}")
+    index, t = 0, start_s
+    while t <= end_s:
+        frame = grab_frame_at(cap, t)
+        if frame is not None:
+            yield t, frame
+        index += 1
+        t = round(start_s + index * step_s, 6)
diff --git a/src/cv/global_scan.py b/src/cv/global_scan.py
new file mode 100644
index 0000000..89b0930
--- /dev/null
+++ b/src/cv/global_scan.py
@@ -0,0 +1,1509 @@
+import logging
+import cv2
+import numpy as np
+import subprocess as sp
+from typing import Sequence
+import time
+from dataclasses import replace
+
+from src.core.config import AppConfig
+from src.core.models import MatchResult, TrailerBeat
+from src.cv.fingerprinting import text_safe_crop
+from src.cv.frame_extractor import grab_frame_at_path, get_video_info, open_video, grab_frame_at
+
+logger = logging.getLogger(__name__)
+SeedPoint = float | tuple[float, float]
+_REFERENCE_CUT_CACHE: dict[tuple[str, float, float, float], list[float]] = {}
+
+
+def _prepare_template(frame: np.ndarray, cfg: AppConfig) -> np.ndarray:
+    """Feature image of the text-safe, proxy-sized frame with a 10% margin cut off each side."""
+    width, height = cfg.video.proxy_width, cfg.video.proxy_height
+    vibe = cfg.cv.vibe_check
+    safe = text_safe_crop(frame, vibe.crop_top_fraction, vibe.crop_bottom_fraction)
+    proxy = cv2.resize(safe, (width, height), interpolation=cv2.INTER_AREA)
+
+    # Presumably the trimmed border is what lets this template slide inside
+    # a full-size haystack during cv2.matchTemplate — confirm against the
+    # callers that pair it with _prepare_haystack().
+    pad_y, pad_x = int(height * 0.10), int(width * 0.10)
+    inner = proxy[pad_y:height-pad_y, pad_x:width-pad_x]
+    return _feature_image(inner)
+
+
+def _prepare_haystack(frame: np.ndarray, cfg: AppConfig) -> np.ndarray:
+    """Text-safe crop *frame*, resize it to proxy size, and return its feature image."""
+    vibe = cfg.cv.vibe_check
+    safe = text_safe_crop(frame, vibe.crop_top_fraction, vibe.crop_bottom_fraction)
+    # cv2.resize takes its target size as (width, height).
+    proxy_size = (cfg.video.proxy_width, cfg.video.proxy_height)
+    proxy = cv2.resize(safe, proxy_size, interpolation=cv2.INTER_AREA)
+    return _feature_image(proxy)
+
+
+def _center_crop_feature(feature: np.ndarray, cfg: AppConfig) -> np.ndarray:
+    """Trim a 10% margin (rounded down) from every side; *cfg* is not used."""
+    rows, cols = feature.shape[:2]
+    pad_y, pad_x = int(rows * 0.10), int(cols * 0.10)
+    return feature[pad_y:rows-pad_y, pad_x:cols-pad_x]
+
+
+def _feature_image(frame: np.ndarray) -> np.ndarray:
+    """
+    Blend equalised luma with Canny edges into one look-tolerant feature.
+
+    Trailer shots can be desaturated, contrast-shifted or regraded relative
+    to the source movie, so raw BGR pixels are an unreliable match signal;
+    luma plus edges also rejects unrelated scenes with similar colours.
+    """
+    luma = cv2.equalizeHist(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
+    edge_map = cv2.Canny(luma, 60, 140)
+    luma_weight, edge_weight = 0.70, 0.30
+    return cv2.addWeighted(luma, luma_weight, edge_map, edge_weight, 0)
+
+
+def _match_score(frame: np.ndarray, template: np.ndarray, cfg: AppConfig) -> float:
+    """Return the peak TM_CCOEFF_NORMED response of *template* over the haystack of *frame*."""
+    response = cv2.matchTemplate(_prepare_haystack(frame, cfg), template, cv2.TM_CCOEFF_NORMED)
+    peak = cv2.minMaxLoc(response)[1]  # minMaxLoc -> (min, max, min_loc, max_loc)
+    return float(peak)
+
+
+def _fixed_position_score(frame: np.ndarray, template: np.ndarray, cfg: AppConfig) -> float:
+    """
+    Correlate the centre crop of *frame* with *template* at one fixed
+    position; the crop is resized to the template's shape when they differ.
+    """
+    fixed = _fixed_feature(frame, template.shape, cfg)
+    return float(cv2.minMaxLoc(cv2.matchTemplate(fixed, template, cv2.TM_CCOEFF_NORMED))[1])
+
+
+def _fixed_feature(frame: np.ndarray, template_shape: tuple[int, ...], cfg: AppConfig) -> np.ndarray:
+    """Centre-cropped haystack feature of *frame*, resized to *template_shape* when it differs."""
+    fixed = _center_crop_feature(_prepare_haystack(frame, cfg), cfg)
+    same = fixed.shape == template_shape
+    return fixed if same else cv2.resize(fixed, (template_shape[1], template_shape[0]), interpolation=cv2.INTER_AREA)
+
+
+def _corr_same_size(a: np.ndarray, b: np.ndarray) -> float:
+    """
+    TM_CCOEFF_NORMED score of *a* against *b* (*b* resized to *a* first); NaN maps to 0.0.
+    """
+    if a.shape != b.shape:
+        b = cv2.resize(b, (a.shape[1], a.shape[0]), interpolation=cv2.INTER_AREA)
+    peak = cv2.minMaxLoc(cv2.matchTemplate(a, b, cv2.TM_CCOEFF_NORMED))[1]
+    return 0.0 if np.isnan(peak) else float(peak)
+
+
+def _validation_crop(frame: np.ndarray) -> np.ndarray:
+    """Trim dark borders, then drop the top and bottom 5% of the remaining rows."""
+    trimmed = _trim_dark_borders(frame)
+    return trimmed[int(trimmed.shape[0] * 0.05):int(trimmed.shape[0] * 0.95), :]
+
+
+def _trim_dark_borders(frame: np.ndarray) -> np.ndarray:
+    """
+    Cut encoded black matte / pillarbox borders off a frame.
+
+    The reference trailer can carry black bars that the source movie lacks;
+    fixed-position validation should compare picture content, not the
+    container matte.
+
+    A row or column counts as picture when its 90th-percentile luma exceeds
+    18. An axis is trimmed only if at least max(8, 35% of its length) lines
+    qualify, keeping a 2-pixel pad; if the result would be narrower or
+    shorter than 35% of the original, the frame is returned unchanged.
+    """
+    if frame.size == 0:
+        return frame
+    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+    h, w = gray.shape[:2]
+
+    def _active_span(signal: np.ndarray, length: int) -> tuple[int, int]:
+        active = np.where(signal > 18.0)[0]
+        if active.size < max(8, int(length * 0.35)):
+            return 0, length
+        return max(0, int(active[0]) - 2), min(length, int(active[-1]) + 3)
+
+    x0, x1 = _active_span(np.percentile(gray, 90, axis=0), w)
+    y0, y1 = _active_span(np.percentile(gray, 90, axis=1), h)
+    if x1 - x0 < int(w * 0.35) or y1 - y0 < int(h * 0.35):
+        return frame
+    return frame[y0:y1, x0:x1]
+
+
+def _fixed_luma_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray:
+    """Equalised luma of the validation crop at 160x80, z-scored (epsilon-guarded); cfg unused."""
+    luma = cv2.cvtColor(_validation_crop(frame), cv2.COLOR_BGR2GRAY)
+    small = cv2.resize(cv2.equalizeHist(luma), (160, 80), interpolation=cv2.INTER_AREA)
+    small = small.astype(np.float32)
+    return (small - float(np.mean(small))) / (float(np.std(small)) + 1e-6)
+
+
+def _fixed_edge_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray:
+    """Canny edges of the equalised validation crop, downsized to 160x80 and z-scored."""
+    luma = cv2.equalizeHist(cv2.cvtColor(_validation_crop(frame), cv2.COLOR_BGR2GRAY))
+    edge_map = cv2.Canny(luma, 60, 140)
+    small = cv2.resize(edge_map, (160, 80), interpolation=cv2.INTER_AREA).astype(np.float32)
+    centred = small - float(np.mean(small))
+    return centred / (float(np.std(small)) + 1e-6)
+
+
+def _fixed_hist_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray:
+    """Concatenate per-channel 32-bin histograms (each sum-normalised) of the 160x80 crop."""
+    small = cv2.resize(_validation_crop(frame), (160, 80), interpolation=cv2.INTER_AREA)
+
+    def _unit_hist(channel: np.ndarray) -> np.ndarray:
+        counts = cv2.calcHist([channel], [0], None, [32], [0, 256]).astype(np.float32).flatten()
+        return counts / (float(np.sum(counts)) + 1e-6)
+
+    return np.concatenate([_unit_hist(channel) for channel in cv2.split(small)])
+
+
+def _fixed_spatial_hist_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray:
+    """
+    Concatenate 16-bin per-channel histograms over a 4x4 grid of the
+    160x80 validation crop, each sum-normalised, in row-major cell order.
+    """
+    small = cv2.resize(_validation_crop(frame), (160, 80), interpolation=cv2.INTER_AREA)
+    grid = 4
+    cell_h, cell_w = small.shape[0] // grid, small.shape[1] // grid
+    parts = []
+    for gy, gx in np.ndindex(grid, grid):
+        cell = small[gy * cell_h:(gy + 1) * cell_h, gx * cell_w:(gx + 1) * cell_w, :]
+        for channel in cv2.split(cell):
+            counts = cv2.calcHist([channel], [0], None, [16], [0, 256]).astype(np.float32).flatten()
+            parts.append(counts / (float(np.sum(counts)) + 1e-6))
+    return np.concatenate(parts)
+
+
+def _array_corr(a: np.ndarray, b: np.ndarray) -> float:
+    """Mean element-wise product of two same-shaped arrays; 0.0 on shape mismatch."""
+    same_shape = a.shape == b.shape
+    return float(np.mean(a * b)) if same_shape else 0.0
+
+
+def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float:
+    """Ratio sum(min(a, b)) / (sum(max(a, b)) + 1e-6); 0.0 on shape mismatch."""
+    mismatch = a.shape != b.shape
+    return 0.0 if mismatch else float(np.minimum(a, b).sum() / (np.maximum(a, b).sum() + 1e-6))
+
+
+def _fixed_content_features(frame: np.ndarray, cfg: AppConfig) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Return the (luma, edge, histogram, spatial-histogram) feature tuple for *frame*."""
+    extractors = (
+        _fixed_luma_feature, _fixed_edge_feature,
+        _fixed_hist_feature, _fixed_spatial_hist_feature,
+    )
+    return tuple(extract(frame, cfg) for extract in extractors)
+
+
+def _fixed_content_pair_score(
+    ref_features: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
+    source_frame: np.ndarray,
+    cfg: AppConfig,
+) -> float:
+    """
+    Weighted blend of edge/luma correlation (0.24 each), histogram overlap
+    (0.14) and spatial-histogram overlap (0.38) for one reference/source pair.
+    """
+    src_luma, src_edge, src_hist, src_grid = _fixed_content_features(source_frame, cfg)
+    ref_luma, ref_edge, ref_hist, ref_grid = ref_features
+    return (
+        _array_corr(ref_edge, src_edge) * 0.24
+        + _array_corr(ref_luma, src_luma) * 0.24
+        + _hist_intersection(ref_hist, src_hist) * 0.14
+        + _hist_intersection(ref_grid, src_grid) * 0.38
+    )
+
+
+def _prepare_validation_templates(
+    beat: TrailerBeat,
+    cfg: AppConfig,
+) -> list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]]:
+    """
+    Sample (offset_s, features) templates across the matchable span, falling
+    back to _beat_offsets() when fewer than three frames are scoreable.
+    """
+    step_s = max(0.20, cfg.cv.deep_scan.content_align_sample_step_s * 1.5)
+    matchable_s = estimate_matchable_reference_duration(beat, cfg, sample_step_s=step_s)
+    def _collect(offsets):
+        if not offsets:  # nothing to sample: leave the trailer file untouched
+            return []
+        with open_video(beat.trailer_path) as cap:  # one capture per pass, not per frame
+            grabbed = ((off, grab_frame_at(cap, beat.start_s + off)) for off in offsets)
+            return [(off, _fixed_content_features(frame, cfg)) for off, frame in grabbed
+                    if frame is not None and _is_scoreable_reference_frame(frame, cfg)]
+    stepped, t = [], 0.0
+    while t <= matchable_s:
+        stepped.append(t)
+        t = round(t + step_s, 6)
+    templates = _collect(stepped)
+    return templates if len(templates) >= 3 else _collect(_beat_offsets(matchable_s))
+
+
+def _prepare_rerank_templates(
+    beat: TrailerBeat, cfg: AppConfig,
+) -> list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]]:
+    """(offset_s, features) templates at the _beat_offsets() samples; trailer opened once."""
+    offsets = _beat_offsets(estimate_matchable_reference_duration(beat, cfg))
+    if not offsets:  # nothing to sample: leave the trailer file untouched
+        return []
+    with open_video(beat.trailer_path) as cap:  # one capture, not one per frame
+        grabbed = ((off, grab_frame_at(cap, beat.start_s + off)) for off in offsets)
+        return [(off, _fixed_content_features(frame, cfg)) for off, frame in grabbed
+                if frame is not None and _is_scoreable_reference_frame(frame, cfg)]
+
+
+def _fixed_content_sequence_score(
+    cap: cv2.VideoCapture,
+    in_point_s: float,
+    templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]],
+    cfg: AppConfig,
+) -> float:
+    """
+    Blend mean (0.68) and worst (0.32) pair score over all templates; 0.0 when
+    *templates* is empty or a source frame at in_point_s + offset_s is unreadable.
+    """
+    if not templates:
+        return 0.0
+    scores: list[float] = []
+    for offset_s, ref_features in templates:
+        frame = grab_frame_at(cap, in_point_s + offset_s)
+        if frame is None:
+            return 0.0
+        scores.append(_fixed_content_pair_score(ref_features, frame, cfg))
+    return float((sum(scores) / len(scores)) * 0.68 + min(scores) * 0.32)
+
+
+def _reference_internal_cut_offsets(beat: TrailerBeat, cfg: AppConfig) -> list[float]:
+    """Return (cached) offsets in seconds from beat start of hard visual cuts inside one beat."""
+    threshold = cfg.vision.multi_shot_cut_corr_threshold
+    cache_key = (
+        str(beat.trailer_path), round(float(beat.start_s), 3),
+        round(float(beat.end_s), 3), round(float(threshold), 3),
+    )
+    cached = _REFERENCE_CUT_CACHE.get(cache_key)
+    if cached is not None:
+        return cached
+
+    step_s = max(1.0 / cfg.export.edl_frame_rate, 0.08)
+    previous: np.ndarray | None = None
+    cuts: list[float] = []
+    if not (beat.duration_s >= 0.0):  # sweep below would not run; do not touch the file
+        _REFERENCE_CUT_CACHE[cache_key] = cuts
+        return cuts
+    # One capture for the whole sweep instead of an open/release per sample.
+    with open_video(beat.trailer_path) as cap:
+        t = 0.0
+        while t <= beat.duration_s:
+            frame = grab_frame_at(cap, beat.start_s + t)
+            if frame is not None and _is_scoreable_reference_frame(frame, cfg):
+                feature = _prepare_haystack(frame, cfg)
+                is_cut = previous is not None and _corr_same_size(previous, feature) < threshold
+                # Ignore cuts within 0.18 s of either beat edge or 0.24 s of the last cut.
+                if is_cut and 0.18 < t < beat.duration_s - 0.18 and (not cuts or t - cuts[-1] > 0.24):
+                    cuts.append(round(t, 3))
+                previous = feature
+            t = round(t + step_s, 6)
+    if cuts:
+        logger.debug('Beat %d: detected internal trailer cuts at %s', beat.beat_id, cuts)
+    _REFERENCE_CUT_CACHE[cache_key] = cuts
+    return cuts
+
+
+def _scene_fps_estimate(scene, cfg: AppConfig) -> float:
+    """Frames per second implied by the scene's frame and time span, else the EDL frame rate."""
+    span_s = max(0.0, float(scene.end_s) - float(scene.start_s))
+    span_frames = max(0, int(scene.end_frame) - int(scene.start_frame))
+    usable = span_s > 0.0 and span_frames > 0
+    return span_frames / span_s if usable else cfg.export.edl_frame_rate
+
+
+def _contiguous_scene_coverage_duration(
+    beat: TrailerBeat,
+    in_point_s: float,
+    scenes: Sequence | None,
+    matchable_duration_s: float,
+    cfg: AppConfig,
+) -> float:
+    """
+    Return how much of *matchable_duration_s* the source can cover from *in_point_s*.
+
+    A scene boundary may only be crossed when the trailer beat has an internal
+    cut at the same relative offset (within multi_shot_boundary_tolerance_s);
+    otherwise coverage stops there, minus the tail trim. Returns 0.0 when no
+    scenes are given, the duration is not positive, or no scene holds *in_point_s*.
+    """
+    if not scenes or matchable_duration_s <= 0:
+        return 0.0
+    start_idx = next(
+        (i for i, s in enumerate(scenes) if float(s.start_s) <= in_point_s < float(s.end_s)),
+        None,
+    )
+    if start_idx is None:
+        return 0.0
+
+    cut_offsets = _reference_internal_cut_offsets(beat, cfg)
+    target_end = in_point_s + matchable_duration_s
+    covered_end = in_point_s
+    for scene in scenes[start_idx:]:
+        scene_end = float(scene.end_s)
+        tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / _scene_fps_estimate(scene, cfg))
+        if target_end <= scene_end:
+            return matchable_duration_s
+
+        offset_at_boundary = scene_end - in_point_s
+        for cut_offset in cut_offsets:
+            if abs(offset_at_boundary - cut_offset) <= cfg.vision.multi_shot_boundary_tolerance_s:
+                break
+        else:
+            # No reference cut lines up with this boundary: stop before it.
+            return max(0.0, scene_end - in_point_s - tail_s)
+        covered_end = scene_end
+
+    return max(0.0, covered_end - in_point_s)
+
+
+def _rerank_candidates_by_content(
+    beat: TrailerBeat,
+    candidates: list[tuple[float, float]],
+    cfg: AppConfig,
+    scenes: Sequence | None = None,
+    matchable_duration_s: float | None = None,
+) -> list[tuple[float, float, float]]:
+    """
+    Re-rank (coarse_score, t_sec) candidates by fixed-position content score.
+
+    Rank = 0.62 * content + 0.18 * coarse + 0.20 * scene coverage (coverage is
+    1.0 unless *scenes* and a positive *matchable_duration_s* are given).
+    Returns (rank_score, coarse_score, t_sec) tuples, best first; without
+    reference templates the coarse score is reused as rank and order is kept.
+    """
+    templates = _prepare_rerank_templates(beat, cfg)
+    if not templates:
+        return [(score, score, t_sec) for score, t_sec in candidates]
+
+    use_coverage = bool(scenes is not None and matchable_duration_s and matchable_duration_s > 0)
+    reranked: list[tuple[float, float, float]] = []
+    with open_video(cfg.paths.source_movie) as cap:
+        for coarse_score, t_sec in candidates:
+            content_score = _fixed_content_sequence_score(cap, t_sec, templates, cfg)
+            coverage_score = 1.0
+            if use_coverage:
+                usable_s = _contiguous_scene_coverage_duration(beat, t_sec, scenes, matchable_duration_s, cfg)
+                coverage_score = min(1.0, usable_s / matchable_duration_s)
+            rank_score = content_score * 0.62 + coarse_score * 0.18 + coverage_score * 0.20
+            reranked.append((rank_score, coarse_score, t_sec))
+
+    reranked.sort(key=lambda item: item[0], reverse=True)
+    return reranked
+
+
+def _dense_weighted_seed_candidates(
+    beat: TrailerBeat,
+    seed_candidates: list[tuple[float, float]],
+    cfg: AppConfig,
+    scenes: Sequence | None,
+    matchable_duration_s: float,
+) -> list[tuple[float, float]]:
+    """Densely scan the scenes hit by strong seeds; return the top (coarse_score, t_sec) pairs."""
+    if not scenes or not seed_candidates:
+        return []
+    # Keep one entry per scene: the strongest seed scoring above the weighted floor.
+    weighted_floor = cfg.cv.deep_scan.coarse_candidate_threshold + 0.05
+    seeded_scenes: dict[int, tuple[object, float]] = {}
+    for seed_score, seed_t in seed_candidates:
+        if seed_score <= weighted_floor:
+            continue
+        scene = _find_scene_for_time(scenes, seed_t, cfg)
+        if scene is None:
+            continue
+        previous = seeded_scenes.get(scene.scene_id)
+        if previous is None or seed_score > previous[1]:
+            seeded_scenes[scene.scene_id] = (scene, seed_score)
+
+    if not seeded_scenes:
+        return []
+
+    templates = _prepare_rerank_templates(beat, cfg)
+    if not templates:
+        return []
+    # Rows collected in `dense`: (rank_score, coarse_score, t_sec, content_score, scene_id).
+    cut_offsets = _reference_internal_cut_offsets(beat, cfg)
+    dense: list[tuple[float, float, float, float, int]] = []
+    with open_video(cfg.paths.source_movie) as cap:
+        for scene, seed_score in seeded_scenes.values():
+            fps = _source_fps_from_scene(scene) or cfg.export.edl_frame_rate
+            tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / fps)
+            start_s = max(0.0, float(scene.start_s))
+            end_s = max(start_s, float(scene.end_s) - tail_s)
+            if end_s <= start_s:
+                continue
+            span_s = end_s - start_s
+            step_s = max(0.04, cfg.vision.local_scan_step_s)
+            max_points = max(2, cfg.vision.local_scan_max_points_per_scene)
+            point_count = int(span_s / step_s) + 1
+            if point_count > max_points:
+                step_s = span_s / float(max_points - 1)
+            # Uniform sweep of in-points across the tail-trimmed scene span.
+            t_sec = start_s
+            while t_sec <= end_s + 0.001:
+                content_score = _fixed_content_sequence_score(cap, t_sec, templates, cfg)
+                usable_s = max(0.0, float(scene.end_s) - t_sec - tail_s)
+                coverage_score = (
+                    min(1.0, usable_s / matchable_duration_s)
+                    if matchable_duration_s > 0 else 0.0
+                )
+                rank_score = (
+                    content_score * 0.50
+                    + coverage_score * 0.35
+                    + seed_score * 0.15
+                )
+                coarse_score = max(
+                    weighted_floor,
+                    min(0.99, seed_score * 0.80 + content_score * 0.20),
+                )
+                dense.append((rank_score, coarse_score, t_sec, content_score, scene.scene_id))
+                t_sec += step_s  # NOTE(review): plain float accumulation; the 0.001 slack presumably absorbs drift
+            # Extra in-points placed so that a reference cut lands on this scene's start.
+            for cut_offset in cut_offsets:
+                shifted_t = max(0.0, float(scene.start_s) - cut_offset)
+                coverage_score = (
+                    min(
+                        1.0,
+                        _contiguous_scene_coverage_duration(
+                            beat,
+                            shifted_t,
+                            scenes,
+                            matchable_duration_s,
+                            cfg,
+                        ) / matchable_duration_s,
+                    )
+                    if matchable_duration_s > 0 else 0.0
+                )
+                if coverage_score < 0.80:
+                    continue
+                content_score = _fixed_content_sequence_score(cap, shifted_t, templates, cfg)
+                rank_score = (
+                    content_score * 0.56
+                    + coverage_score * 0.34
+                    + seed_score * 0.10
+                )
+                coarse_score = max(
+                    weighted_floor,
+                    min(0.99, seed_score * 0.78 + content_score * 0.22),
+                )
+                dense.append((rank_score, coarse_score, shifted_t, content_score, scene.scene_id))
+    # Best rank first; only the first local_scan_top_candidates rows are returned.
+    dense.sort(key=lambda item: item[0], reverse=True)
+    top = dense[: max(0, cfg.vision.local_scan_top_candidates)]
+    if top:
+        logger.info(
+            'Beat %d: dense vision content scan kept %d/%d candidates; best scene=%d in=%.3fs content=%.3f rank=%.3f.',
+            beat.beat_id,
+            len(top),
+            len(dense),
+            top[0][4],
+            top[0][2],
+            top[0][3],
+            top[0][0],
+        )
+    return [(coarse_score, t_sec) for _, coarse_score, t_sec, _, _ in top]
+
+
+def _beat_offsets(duration_s: float) -> list[float]:
+    """Return sample offsets (seconds) across a beat, always starting at 0.
+
+    Beats shorter than 1.0s yield three offsets, beats shorter than 2.5s
+    yield five, and longer beats yield six. Every offset is a fixed
+    fraction of ``duration_s``.
+    """
+    if duration_s < 1.0:
+        # Very short beats: leading edge plus two interior samples.
+        return [0.0] + [duration_s * ratio for ratio in (0.35, 0.70)]
+    ratios = (
+        (0.00, 0.15, 0.35, 0.55, 0.78)
+        if duration_s < 2.5
+        else (0.00, 0.12, 0.30, 0.50, 0.70, 0.88)
+    )
+    return [duration_s * ratio for ratio in ratios]
+
+
+def _prepare_beat_templates(beat: TrailerBeat, cfg: AppConfig) -> list[tuple[float, np.ndarray]]:
+    """Build ``(offset_s, template)`` pairs from frames sampled across a beat.
+
+    Offsets come from ``_beat_offsets`` applied to the beat's matchable
+    duration. Frames that cannot be read, or that are not scoreable, are
+    skipped, so the returned list may be empty.
+    """
+    prepared: list[tuple[float, np.ndarray]] = []
+    offsets = _beat_offsets(estimate_matchable_reference_duration(beat, cfg))
+    for offset_s in offsets:
+        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s)
+        if frame is not None and _is_scoreable_reference_frame(frame, cfg):
+            prepared.append((offset_s, _prepare_template(frame, cfg)))
+    return prepared
+
+
+def _prepare_beat_templates_stepped(
+    beat: TrailerBeat,
+    cfg: AppConfig,
+    step_s: float = 0.12,
+) -> list[tuple[float, np.ndarray]]:
+    """Build ``(offset_s, template)`` pairs at a regular step across a beat.
+
+    Sampling starts at offset 0 and advances by ``step_s`` until the beat's
+    matchable duration (estimated with the same step) is exceeded.
+    Unreadable or non-scoreable frames are skipped.
+    """
+    matchable_s = estimate_matchable_reference_duration(beat, cfg, sample_step_s=step_s)
+    collected: list[tuple[float, np.ndarray]] = []
+    offset_s = 0.0
+    while offset_s <= matchable_s:
+        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s)
+        usable = frame is not None and _is_scoreable_reference_frame(frame, cfg)
+        if usable:
+            collected.append((offset_s, _prepare_template(frame, cfg)))
+        # Round so repeated additions do not accumulate float noise.
+        offset_s = round(offset_s + step_s, 6)
+    return collected
+
+
+def _prepare_motion_templates(
+    beat: TrailerBeat,
+    cfg: AppConfig,
+    step_s: float = 0.12,
+) -> list[tuple[float, float, np.ndarray, tuple[int, ...]]]:
+    """
+    Collect frame-difference templates from the reference beat.
+
+    Plain image similarity can lock onto the right shot at the wrong moment of
+    a repeated movement. Comparing frame-to-frame deltas lets the refine pass
+    also take the phase and direction of motion into account.
+
+    Returns:
+        A list of ``(offset_s, step_s, abs_delta, feature_shape)`` tuples, one
+        per frame pair where both frames were readable and scoreable.
+    """
+    deltas: list[tuple[float, float, np.ndarray, tuple[int, ...]]] = []
+    # The second frame of each pair sits step_s later, so stop one step early.
+    last_start_s = max(0.0, beat.duration_s - step_s)
+    offset_s = 0.0
+    while offset_s <= last_start_s:
+        first = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s)
+        second = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s + step_s)
+        pair = (first, second)
+        if all(frame is not None for frame in pair) and all(
+            _is_scoreable_reference_frame(frame, cfg) for frame in pair
+        ):
+            first_feat = _prepare_template(first, cfg)
+            second_feat = _prepare_template(second, cfg)
+            deltas.append(
+                (offset_s, step_s, cv2.absdiff(second_feat, first_feat), first_feat.shape)
+            )
+        offset_s = round(offset_s + step_s, 6)
+    return deltas
+
+
+def _is_dark_reference_frame(frame: np.ndarray, cfg: AppConfig) -> bool:
+    """Report whether the cropped frame is essentially black.
+
+    The frame is cropped with ``text_safe_crop`` using the vibe-check crop
+    fractions and converted to grayscale. It counts as dark when the mean
+    gray value is below 28 and the 90th-percentile gray value is below 58.
+    """
+    vibe_cfg = cfg.cv.vibe_check
+    luma = cv2.cvtColor(
+        text_safe_crop(frame, vibe_cfg.crop_top_fraction, vibe_cfg.crop_bottom_fraction),
+        cv2.COLOR_BGR2GRAY,
+    )
+    if not float(np.mean(luma)) < 28.0:
+        return False
+    return float(np.percentile(luma, 90)) < 58.0
+
+
+def _reference_visibility_stats(frame: np.ndarray, cfg: AppConfig) -> tuple[float, float, float]:
+    """Return ``(mean, p90, p90 - p10)`` gray-level statistics of the cropped frame.
+
+    The frame is cropped with ``text_safe_crop`` using the vibe-check crop
+    fractions and converted to grayscale before the statistics are taken.
+    """
+    vibe_cfg = cfg.cv.vibe_check
+    cropped = text_safe_crop(frame, vibe_cfg.crop_top_fraction, vibe_cfg.crop_bottom_fraction)
+    luma = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
+    low, high = (float(np.percentile(luma, q)) for q in (10, 90))
+    return float(np.mean(luma)), high, high - low
+
+
+def _is_scoreable_reference_frame(frame: np.ndarray, cfg: AppConfig) -> bool:
+    """Decide whether a reference frame is visible enough to be scored.
+
+    Rejects frames flagged by ``_is_dark_reference_frame``, frames whose mean
+    and 90th-percentile luma are both under the configured minimums, and
+    frames whose p90 - p10 contrast is below the configured minimum.
+    """
+    if _is_dark_reference_frame(frame, cfg):
+        return False
+
+    mean_luma, p90_luma, contrast = _reference_visibility_stats(frame, cfg)
+    if (
+        mean_luma < cfg.cv.deep_scan.scoreable_luma_mean_min
+        and p90_luma < cfg.cv.deep_scan.scoreable_luma_p90_min
+    ):
+        # Low overall visibility (e.g. mid-fade): not usable for scoring.
+        return False
+    return contrast >= cfg.cv.deep_scan.scoreable_contrast_min
+
+
+def estimate_matchable_reference_duration(
+    beat: TrailerBeat,
+    cfg: AppConfig,
+    sample_step_s: float | None = None,
+) -> float:
+    """
+    Estimate the part of a trailer beat that should be source-matchable.
+
+    Trailer beats often include trailing black/title/credit frames that do not
+    exist in the source movie. Those frames should not force the source match to
+    cover the full beat duration.
+
+    Args:
+        beat: Reference beat; ``trailer_path``, ``start_s`` and ``duration_s``
+            are read.
+        cfg: Application config; supplies the default sampling step and the
+            crop settings used by the dark-frame check.
+        sample_step_s: Sampling interval in seconds. ``None`` selects
+            ``cfg.cv.deep_scan.span_sample_step_s``.
+
+    Returns:
+        The matchable duration in seconds: ``beat.duration_s`` when no frame
+        could be read or no dark run follows visible content, otherwise the
+        start of the dark run plus one step (at least one step, and never
+        more than the beat duration).
+
+    Raises:
+        ValueError: If the effective sampling step is not positive. Without
+            this check the sampling loop would never advance.
+    """
+    step_s = sample_step_s if sample_step_s is not None else cfg.cv.deep_scan.span_sample_step_s
+    if step_s <= 0:
+        raise ValueError(f'sample step must be positive, got {step_s!r}')
+
+    # Classify every sampled frame of the beat as dark / not dark.
+    samples: list[tuple[float, bool]] = []
+    t = 0.0
+    while t <= beat.duration_s:
+        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
+        if frame is not None:
+            samples.append((t, _is_dark_reference_frame(frame, cfg)))
+        t = round(t + step_s, 6)
+
+    if not samples:
+        return beat.duration_s
+
+    dark_run_start: float | None = None
+    saw_visible = False
+    min_dark_break_s = max(0.24, step_s * 2.0)
+    for offset_s, is_dark in samples:
+        if not is_dark:
+            # Visible content cancels any dark run that was being tracked.
+            saw_visible = True
+            dark_run_start = None
+            continue
+
+        # Leading darkness (before any visible frame) is ignored.
+        if saw_visible:
+            if dark_run_start is None:
+                dark_run_start = offset_s
+            if offset_s - dark_run_start >= min_dark_break_s:
+                break
+
+    if dark_run_start is None:
+        return beat.duration_s
+
+    # Keep a small buffer before the first sustained dark/title break so the
+    # source clip does not visibly end before the trailer begins its fade/card.
+    # Long beats can contain later credit/title islands; those should not force
+    # one source clip to validate unrelated images.
+    return max(step_s, min(beat.duration_s, dark_run_start + step_s))
+
+
+def _sequence_score(
+    cap: cv2.VideoCapture,
+    in_point_s: float,
+    templates: list[tuple[float, np.ndarray]],
+    cfg: AppConfig,
+) -> float:
+    """Score a candidate in-point against every beat template.
+
+    Each template is compared with the source frame at
+    ``in_point_s + offset_s``; the per-frame score blends ``_match_score``
+    (55%) with ``_fixed_position_score`` (45%).
+
+    Returns:
+        ``-1.0`` when a source frame is missing or there are no templates,
+        otherwise 70% of the weighted average plus 30% of the worst frame.
+    """
+    early_cutoff_s = 0.16
+    per_frame: list[float] = []
+    boosted: list[float] = []
+    for offset_s, template in templates:
+        frame = grab_frame_at(cap, in_point_s + offset_s)
+        if frame is None:
+            return -1.0
+        floating = _match_score(frame, template, cfg)
+        fixed = _fixed_position_score(frame, template, cfg)
+        blended = (floating * 0.55) + (fixed * 0.45)
+        per_frame.append(blended)
+        # Opening frames dominate perceived sync, so they count 1.35x; a match
+        # that starts a few frames early then loses to a better aligned one.
+        boosted.append(blended * (1.35 if offset_s <= early_cutoff_s else 1.0))
+    if not per_frame:
+        return -1.0
+
+    # Divide by the total weight (1.0 per frame plus 0.35 per early frame),
+    # then mix in the worst frame so one strong frame cannot carry a drifting
+    # sequence.
+    early_count = sum(1 for offset_s, _ in templates if offset_s <= early_cutoff_s)
+    weighted_avg = sum(boosted) / (len(per_frame) + 0.35 * early_count)
+    return float(weighted_avg * 0.70 + min(per_frame) * 0.30)
+
+
+def _content_alignment_templates(
+    beat: TrailerBeat,
+    cfg: AppConfig,
+) -> list[tuple[float, np.ndarray]]:
+    """Build densely sampled ``(offset_s, template)`` pairs for content alignment.
+
+    Offsets start at 0 and advance by the content-align sample step (never
+    less than one EDL frame) up to one step before the end of the matchable
+    part of the beat; that final offset is appended when the stepping did not
+    reach it. Unreadable or non-scoreable frames are skipped. If nothing
+    survives, the result of ``_prepare_beat_templates`` is returned instead.
+    """
+    scan_cfg = cfg.cv.deep_scan
+    matchable_s = estimate_matchable_reference_duration(
+        beat,
+        cfg,
+        sample_step_s=scan_cfg.content_align_sample_step_s,
+    )
+    step_s = max(1.0 / cfg.export.edl_frame_rate, scan_cfg.content_align_sample_step_s)
+    last_offset_s = max(0.0, min(beat.duration_s, matchable_s) - step_s)
+
+    offsets = [0.0]
+    cursor = step_s
+    while cursor <= last_offset_s:
+        offsets.append(round(cursor, 6))
+        cursor = round(cursor + step_s, 6)
+    needs_tail_sample = matchable_s > step_s and offsets[-1] < last_offset_s
+    if needs_tail_sample:
+        offsets.append(round(last_offset_s, 6))
+
+    aligned: list[tuple[float, np.ndarray]] = []
+    for offset_s in offsets:
+        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s)
+        if frame is None or not _is_scoreable_reference_frame(frame, cfg):
+            continue
+        aligned.append((offset_s, _prepare_template(frame, cfg)))
+    return aligned or _prepare_beat_templates(beat, cfg)
+
+
+def _content_alignment_score(
+    cap: cv2.VideoCapture,
+    in_point_s: float,
+    templates: list[tuple[float, np.ndarray]],
+    cfg: AppConfig,
+) -> float:
+    """Score how well the source lines up with the beat from ``in_point_s``.
+
+    Returns:
+        ``-1.0`` when there are no templates or a source frame is missing,
+        otherwise 55% weighted average + 25% worst frame + 20% average of the
+        frames at offsets up to 0.36s.
+    """
+    if not templates:
+        return -1.0
+
+    weighted_sum = 0.0
+    weight_sum = 0.0
+    all_scores: list[float] = []
+    leading_scores: list[float] = []
+
+    for offset_s, template in templates:
+        frame = grab_frame_at(cap, in_point_s + offset_s)
+        if frame is None:
+            return -1.0
+
+        # Offset detection deliberately trusts the fixed frame position more
+        # than free template placement: free placement can make the right shot
+        # look fine even when the movement is a few frames off.
+        fixed = _fixed_position_score(frame, template, cfg)
+        floating = _match_score(frame, template, cfg)
+        blended = fixed * 0.72 + floating * 0.28
+
+        weight = 1.45 if offset_s <= 0.20 else 1.0
+        weighted_sum += blended * weight
+        weight_sum += weight
+        all_scores.append(blended)
+        if offset_s <= 0.36:
+            leading_scores.append(blended)
+
+    if weight_sum > 0:
+        mean_score = weighted_sum / weight_sum
+    else:
+        mean_score = -1.0
+    worst_score = min(all_scores) if all_scores else -1.0
+    if leading_scores:
+        leading_mean = sum(leading_scores) / len(leading_scores)
+    else:
+        leading_mean = mean_score
+    return float(mean_score * 0.55 + worst_score * 0.25 + leading_mean * 0.20)
+
+
+def align_in_point_by_content(
+    beat: TrailerBeat,
+    estimated_in_point_s: float,
+    cfg: AppConfig,
+    search_window_s: float | None = None,
+) -> tuple[float, float]:
+    """
+    Locate the in-point from image content in a window around a rough match.
+
+    The scan is intentionally local: with a plausible candidate shot in hand,
+    stepping frame by frame through a small window with many reference frames
+    is faster and more robust than another global scan or a fixed preroll.
+
+    Args:
+        beat: Reference beat to align.
+        estimated_in_point_s: Rough source in-point; the window is centred here.
+        cfg: Application config.
+        search_window_s: Half-width of the window in seconds. ``None`` selects
+            ``cfg.cv.deep_scan.content_align_window_seconds``.
+
+    Returns:
+        ``(best_in_point_s, score)`` with the score clamped to be >= 0. When
+        no templates can be built the estimate is returned with score 0.0.
+    """
+    templates = _content_alignment_templates(beat, cfg)
+    if not templates:
+        return estimated_in_point_s, 0.0
+
+    with open_video(cfg.paths.source_movie) as cap:
+        fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate
+        frame_step_s = 1.0 / fps
+        if search_window_s is None:
+            window_s = cfg.cv.deep_scan.content_align_window_seconds
+        else:
+            window_s = search_window_s
+        scan_from_s = max(0.0, estimated_in_point_s - window_s)
+        scan_to_s = estimated_in_point_s + window_s
+        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta
+
+        best_in = estimated_in_point_s
+        best_score = -1.0
+        cursor_s = scan_from_s
+        while cursor_s <= scan_to_s:
+            score = _content_alignment_score(cap, cursor_s, templates, cfg)
+            if score > best_score + tie_delta:
+                # Clearly better: take both the score and the position.
+                best_score, best_in = score, cursor_s
+            elif score >= best_score - tie_delta:
+                # Near tie: keep the recorded score, but prefer the position
+                # closer to the original estimate.
+                if abs(cursor_s - estimated_in_point_s) < abs(best_in - estimated_in_point_s):
+                    best_in = cursor_s
+            cursor_s = round(cursor_s + frame_step_s, 6)
+
+    return best_in, max(0.0, best_score)
+
+
+def _motion_phase_score(
+    cap: cv2.VideoCapture,
+    in_point_s: float,
+    motion_templates: list[tuple[float, float, np.ndarray, tuple[int, ...]]],
+    cfg: AppConfig,
+) -> float:
+    """Compare source frame-to-frame deltas with the reference motion templates.
+
+    Returns:
+        ``-1.0`` when a needed source frame is missing, ``0.0`` when there are
+        no motion templates, otherwise 65% of the mean correlation plus 35%
+        of the worst correlation.
+    """
+    phase_scores: list[float] = []
+    for offset_s, step_s, ref_delta, template_shape in motion_templates:
+        first = grab_frame_at(cap, in_point_s + offset_s)
+        second = grab_frame_at(cap, in_point_s + offset_s + step_s)
+        if first is None or second is None:
+            return -1.0
+        first_feat = _fixed_feature(first, template_shape, cfg)
+        second_feat = _fixed_feature(second, template_shape, cfg)
+        source_delta = cv2.absdiff(second_feat, first_feat)
+        phase_scores.append(_corr_same_size(source_delta, ref_delta))
+
+    if not phase_scores:
+        return 0.0
+    mean_score = sum(phase_scores) / len(phase_scores)
+    return float(mean_score * 0.65 + min(phase_scores) * 0.35)
+
+
+def estimate_usable_source_duration(
+    beat: TrailerBeat,
+    in_point_s: float,
+    cfg: AppConfig,
+    sample_step_s: float | None = None,
+    min_keep_s: float = 0.5,
+) -> tuple[float, float]:
+    """
+    Estimate how long the source stays visually aligned with the beat.
+
+    This catches cases where the source dissolves/cuts into the next shot while
+    the trailer beat continues into a title card or black fade.
+
+    Args:
+        beat: Reference beat whose stepped templates are compared to the source.
+        in_point_s: Source in-point (seconds) the beat is assumed to start at.
+        cfg: Application config.
+        sample_step_s: Template sampling step in seconds. ``None`` selects
+            ``cfg.cv.deep_scan.span_sample_step_s``.
+        min_keep_s: Below-threshold samples at offsets before this value are
+            ignored, and the result is raised to at least this value (capped
+            at the beat duration) when the measured span is shorter.
+
+    Returns:
+        (usable_duration_s, average_good_score). ``(beat.duration_s, 0.0)``
+        when no reference templates could be built and ``(0.0, 0.0)`` when no
+        source frame could be scored.
+    """
+    step_s = sample_step_s if sample_step_s is not None else cfg.cv.deep_scan.span_sample_step_s
+    templates = _prepare_beat_templates_stepped(beat, cfg, step_s)
+    if not templates:
+        return beat.duration_s, 0.0
+
+    # Score each reference template against the source frame at the same offset.
+    scores: list[tuple[float, float]] = []
+    source_fps = cfg.export.edl_frame_rate
+    with open_video(cfg.paths.source_movie) as cap:
+        # A reported FPS of 0 falls back to the EDL frame rate.
+        source_fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate
+        for offset_s, template in templates:
+            frame = grab_frame_at(cap, in_point_s + offset_s)
+            if frame is None:
+                # Stop at the first unreadable source frame.
+                break
+            scores.append((offset_s, _match_score(frame, template, cfg)))
+
+    if not scores:
+        return 0.0, 0.0
+
+    # The acceptance threshold is relative to the best score seen early in the
+    # beat (first second or first 35% of the beat, whichever is shorter), with
+    # an absolute floor of 0.34.
+    warmup_scores = [score for offset, score in scores if offset <= min(1.0, beat.duration_s * 0.35)]
+    baseline = max(warmup_scores) if warmup_scores else max(score for _, score in scores)
+    min_score = max(0.34, baseline * 0.48)
+
+    last_good = 0.0
+    bad_run = 0
+    good_scores: list[float] = []
+
+    for offset_s, score in scores:
+        if score >= min_score:
+            last_good = offset_s
+            bad_run = 0
+            good_scores.append(score)
+            continue
+
+        # Weak samples before min_keep_s never count towards the bad run.
+        if offset_s < min_keep_s:
+            continue
+
+        # Three consecutive weak samples end the scan.
+        bad_run += 1
+        if bad_run >= 3:
+            break
+
+    # Back off by the configured number of tail frames from the last good sample.
+    tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps)
+    usable = min(beat.duration_s, max(0.0, last_good - tail_safety_s))
+    if usable < min_keep_s and scores:
+        # Too short: fall back to at least min_keep_s, capped at the beat length.
+        usable = min(beat.duration_s, max(min_keep_s, scores[0][0] + step_s - tail_safety_s))
+
+    avg_good = float(sum(good_scores) / len(good_scores)) if good_scores else 0.0
+    return usable, avg_good
+
+
+def refine_timestamp(template: np.ndarray, t_sec: float, cfg: AppConfig) -> float:
+    """Search +/-0.5s around ``t_sec`` for the source frame best matching ``template``.
+
+    The source movie is stepped one frame at a time and every readable frame
+    is scored with ``_match_score``. A frame replaces the current best when it
+    beats it by more than the configured tie-break delta; a frame within that
+    delta replaces it only when it lies earlier in time.
+
+    Args:
+        template: Prepared reference template to match.
+        t_sec: Rough timestamp (seconds) in the source movie.
+        cfg: Application config.
+
+    Returns:
+        Timestamp (seconds) of the selected frame, or ``t_sec`` when no frame
+        in the window could be read.
+    """
+    best_score = -1.0
+    best_t = t_sec
+    tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta
+
+    with open_video(cfg.paths.source_movie) as cap:
+        # OpenCV reports 0 when the FPS is unknown; fall back to the EDL frame
+        # rate (as align_in_point_by_content does) instead of dividing by zero.
+        fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate
+        step = 1.0 / fps
+        start_t = max(0.0, t_sec - 0.5)
+        end_t = t_sec + 0.5
+
+        t = start_t
+        while t <= end_t:
+            frame = grab_frame_at(cap, t)
+            if frame is not None:
+                max_val = _match_score(frame, template, cfg)
+                if max_val > best_score + tie_delta:
+                    best_score = max_val
+                    best_t = t
+                elif max_val >= best_score - tie_delta and t < best_t:
+                    best_t = t
+            t += step
+
+    return best_t
+
+
+def refine_in_point_with_sequence(
+    beat: TrailerBeat,
+    estimated_in_point_s: float,
+    cfg: AppConfig,
+    search_window_s: float | None = None,
+) -> tuple[float, float]:
+    """
+    Sharpen a rough source in-point using several frames across the beat.
+
+    This is a thin alias that forwards all arguments to
+    ``align_in_point_by_content``.
+
+    Returns:
+        (best_in_point_s, sequence_score)
+    """
+    return align_in_point_by_content(
+        beat,
+        estimated_in_point_s,
+        cfg,
+        search_window_s=search_window_s,
+    )
+
+
+def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig):
+    """Return the scene containing ``t_sec``, or ``None`` if there is none.
+
+    A scene contains ``t_sec`` when ``start_s <= t_sec < end_s``. If ``t_sec``
+    lies within ``scene_boundary_epsilon_s`` of that scene's end and a
+    following scene exists, the following scene is returned instead.
+    """
+    if not scenes:
+        return None
+    for idx, scene in enumerate(scenes):
+        if not (scene.start_s <= t_sec < scene.end_s):
+            continue
+        near_end = scene.end_s - t_sec <= cfg.cv.deep_scan.scene_boundary_epsilon_s
+        if near_end and idx + 1 < len(scenes):
+            return scenes[idx + 1]
+        return scene
+    return None
+
+
+def _source_fps_from_scene(scene) -> float:
+    """Derive frames-per-second from a scene's frame and time span.
+
+    Returns ``0.0`` when either the time span or the frame span is not
+    positive.
+    """
+    span_s = max(0.0, scene.end_s - scene.start_s)
+    frames = max(0, scene.end_frame - scene.start_frame)
+    if span_s > 0 and frames > 0:
+        return frames / span_s
+    return 0.0
+
+
+def _apply_start_preroll(in_point_s: float, source_fps: float, cfg: AppConfig) -> float:
+    """Move an in-point earlier by the configured number of preroll frames.
+
+    The in-point is returned unchanged when ``start_preroll_frames`` is not
+    positive. A falsy ``source_fps`` falls back to the EDL frame rate, and the
+    result never goes below 0.
+    """
+    preroll_frames = cfg.cv.deep_scan.start_preroll_frames
+    if preroll_frames <= 0:
+        return in_point_s
+    fps = source_fps or cfg.export.edl_frame_rate
+    preroll_s = preroll_frames / fps
+    return max(0.0, in_point_s - preroll_s)
+
+
+def _clamp_to_scene_start(in_point_s: float, scene) -> float:
+    """Keep an in-point from falling before the start of ``scene``.
+
+    Without a scene the in-point is returned unchanged.
+    """
+    if scene is None:
+        return in_point_s
+    scene_start_s = float(scene.start_s)
+    return in_point_s if in_point_s > scene_start_s else scene_start_s
+
+
+def _add_top_candidate(
+    candidates: list[tuple[float, float]],
+    score: float,
+    t_sec: float,
+    max_candidates: int,
+    min_distance_s: float,
+) -> list[tuple[float, float]]:
+    """
+    Keep diverse coarse candidates as (score, midpoint_time).
+
+    A single best midpoint frame is too brittle: repeated actors, similar color
+    palettes, cars, forests, and title-card darkness can all create plausible
+    false positives. Keeping a ranked pool lets the multi-frame sequence pass
+    choose the temporally consistent match.
+
+    Args:
+        candidates: Current pool of ``(score, time_s)`` pairs. It is not
+            modified; the updated pool is returned as a new list.
+        score: Score of the new candidate.
+        t_sec: Time (seconds) of the new candidate.
+        max_candidates: Maximum number of entries in the returned pool.
+        min_distance_s: A new candidate closer than this to an existing entry
+            competes with that entry instead of being added separately.
+
+    Returns:
+        A new list sorted by descending score, truncated to
+        ``max_candidates`` entries.
+    """
+    # Work on a copy so the caller's list and the returned pool cannot diverge.
+    pool = list(candidates)
+    for idx, (old_score, old_t) in enumerate(pool):
+        if abs(old_t - t_sec) < min_distance_s:
+            # Only the first nearby entry is considered; it is replaced when
+            # the new candidate scores higher.
+            if score > old_score:
+                pool[idx] = (score, t_sec)
+            break
+    else:
+        pool.append((score, t_sec))
+
+    pool.sort(key=lambda item: item[0], reverse=True)
+    return pool[:max_candidates]
+
+
+def run_global_scan(
+    beats: Sequence[TrailerBeat],
+    cfg: AppConfig,
+    scenes: Sequence | None = None,
+    seed_in_points: dict[int, Sequence[SeedPoint]] | None = None,
+) -> list[MatchResult]:
+    logger.info('[Global Scan] Preparing templates for %d beats...', len(beats))
+    templates = []
+    midpoint_templates = []
+    beat_valid = []
+
+    for b in beats:
+        bf = grab_frame_at_path(cfg.paths.reference_trailer, b.start_s + (b.end_s - b.start_s)/2)
+        if bf is None:
+            midpoint_templates.append(None)
+            templates.append([])
+            beat_valid.append(False)
+            continue
+
+        midpoint_templates.append(_prepare_template(bf, cfg))
+        beat_templates = _prepare_beat_templates(b, cfg)
+        templates.append(beat_templates)
+        beat_valid.append(bool(beat_templates))
+
+    top_candidates: list[list[tuple[float, float]]] = [[] for _ in beats]
+    seed_candidates: list[list[tuple[float, float]]] = [[] for _ in beats]
+    has_weighted_seeds = False
+    for idx, beat in enumerate(beats):
+        for seed in (seed_in_points or {}).get(beat.beat_id, ()):
+            if isinstance(seed, tuple):
+                seed_t = float(seed[0])
+                seed_score = max(
+                    cfg.cv.deep_scan.coarse_candidate_threshold,
+                    min(0.99, float(seed[1])),
+                )
+                has_weighted_seeds = True
+            else:
+                seed_t = float(seed)
+                seed_score = cfg.cv.deep_scan.coarse_candidate_threshold
+            seed_candidate = (
+                seed_score,
+                max(0.0, seed_t),
+            )
+            seed_candidates[idx].append(seed_candidate)
+            top_candidates[idx] = _add_top_candidate(
+                top_candidates[idx],
+                seed_candidate[0],
+                seed_candidate[1],
+                max_candidates=cfg.cv.deep_scan.sequence_candidate_count,
+                min_distance_s=cfg.cv.deep_scan.sequence_min_distance_s,
+            )
+        if (seed_in_points or {}).get(beat.beat_id):
+            logger.info(
+                'Beat %d: added %d seeded in-point candidates.',
+                beat.beat_id,
+                len((seed_in_points or {}).get(beat.beat_id, ())),
+            )
+
+    skip_coarse_scan = (
+        cfg.vision.enabled
+        and cfg.cv.deep_scan.skip_coarse_scan_with_weighted_seeds
+        and has_weighted_seeds
+        and all(top_candidates[i] for i, valid in enumerate(beat_valid) if valid)
+    )
+
+    if skip_coarse_scan:
+        logger.info('[Global Scan] Weighted vision seeds present; skipping full FFmpeg coarse scan.')
+    else:
+        fps = 2.0
+        cmd = [
+            'ffmpeg', '-i', str(cfg.paths.source_movie),
+            '-vf', f'scale={cfg.video.proxy_width}:{cfg.video.proxy_height},fps={fps}',
+            '-f', 'image2pipe', '-vcodec', 'rawvideo', '-pix_fmt', 'bgr24', '-'
+        ]
+        logger.info('[Global Scan] Streaming %s via FFmpeg (%.1f fps) ...', cfg.paths.source_movie.name, fps)
+
+        p = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.DEVNULL)
+        frame_size = cfg.video.proxy_width * cfg.video.proxy_height * 3
+        frame_idx = 0
+        start_t = time.time()
+
+        while True:
+            raw = p.stdout.read(frame_size)
+            if len(raw) != frame_size: break
+
+            frame = np.frombuffer(raw, dtype=np.uint8).reshape((cfg.video.proxy_height, cfg.video.proxy_width, 3))
+            haystack = _prepare_haystack(frame, cfg)
+
+            for i, beat_templates in enumerate(templates):
+                if not beat_valid[i]: continue
+                source_t = frame_idx / fps
+                for beat_offset_s, template in beat_templates:
+                    res = cv2.matchTemplate(haystack, template, cv2.TM_CCOEFF_NORMED)
+                    _, max_val, _, _ = cv2.minMaxLoc(res)
+                    candidate_in_s = source_t - beat_offset_s
+                    if candidate_in_s < 0.0:
+                        continue
+
+                    top_candidates[i] = _add_top_candidate(
+                        top_candidates[i],
+                        float(max_val),
+                        candidate_in_s,
+                        max_candidates=cfg.cv.deep_scan.sequence_candidate_count,
+                        min_distance_s=cfg.cv.deep_scan.sequence_min_distance_s,
+                    )
+
+            frame_idx += 1
+            if frame_idx % 1000 == 0:
+                logger.info('[Global Scan] Processed %d frames (%.1fs movie time)...', frame_idx, frame_idx / fps)
+
+        p.stdout.close()
+        p.wait()
+
+        logger.info('[Global Scan] Finished streaming %d frames in %.1fs.', frame_idx, time.time() - start_t)
+
+    results = []
+    source_info = get_video_info(cfg.paths.source_movie)
+    source_fps = float(source_info['fps']) or 24.0
+
+    for i, b in enumerate(beats):
+        if not beat_valid[i]: continue
+
+        candidates = top_candidates[i]
+        if not candidates:
+            continue
+
+        score = float(candidates[0][0])
+
+        if score >= cfg.cv.deep_scan.coarse_candidate_threshold:
+            matchable_duration_s = estimate_matchable_reference_duration(b, cfg)
+            logger.info(
+                'Beat %d: refining %d temporal candidates (best offset score %.3f, matchable %.2fs / beat %.2fs).',
+                b.beat_id,
+                len(candidates),
+                score,
+                matchable_duration_s,
+                b.duration_s,
+            )
+
+            best_result: MatchResult | None = None
+            best_short_result: MatchResult | None = None
+            best_short_coverage = -1.0
+            best_duration_coverage = -1.0
+            best_content_score = -1.0
+            rejected_short_candidates = 0
+            rejected_content_candidates = 0
+            scan_cfg = cfg.cv.deep_scan
+            content_gate = (
+                min(scan_cfg.provisional_content_threshold, cfg.vision.content_threshold)
+                if skip_coarse_scan and has_weighted_seeds
+                else scan_cfg.provisional_content_threshold
+            )
+
+            candidate_pool = candidates[:scan_cfg.content_rerank_candidate_count]
+            for seed_candidate in seed_candidates[i]:
+                candidate_pool = _add_top_candidate(
+                    candidate_pool,
+                    seed_candidate[0],
+                    seed_candidate[1],
+                    max_candidates=scan_cfg.content_rerank_candidate_count + len(seed_candidates[i]),
+                    min_distance_s=scan_cfg.sequence_min_distance_s,
+                )
+            if skip_coarse_scan and has_weighted_seeds:
+                dense_candidates = _dense_weighted_seed_candidates(
+                    b,
+                    seed_candidates[i],
+                    cfg,
+                    scenes,
+                    matchable_duration_s,
+                )
+                for dense_candidate in dense_candidates:
+                    candidate_pool = _add_top_candidate(
+                        candidate_pool,
+                        dense_candidate[0],
+                        dense_candidate[1],
+                        max_candidates=(
+                            scan_cfg.content_rerank_candidate_count
+                            + len(seed_candidates[i])
+                            + len(dense_candidates)
+                        ),
+                        min_distance_s=max(0.04, cfg.vision.local_scan_step_s * 0.5),
+                    )
+            reranked_candidates = _rerank_candidates_by_content(
+                b,
+                candidate_pool,
+                cfg,
+                scenes=scenes,
+                matchable_duration_s=matchable_duration_s,
+            )
+            refine_limit = (
+                min(scan_cfg.max_refine_candidates, cfg.vision.max_refine_candidates)
+                if skip_coarse_scan and has_weighted_seeds
+                else scan_cfg.max_refine_candidates
+            )
+            refine_candidates = [
+                (coarse_score, in_point_s)
+                for _, coarse_score, in_point_s in reranked_candidates[:refine_limit]
+            ]
+            validation_templates = _prepare_validation_templates(b, cfg)
+            logger.info(
+                'Beat %d: content-reranked top %d / %d candidates.',
+                b.beat_id,
+                len(refine_candidates),
+                len(candidate_pool),
+            )
+
+            for coarse_score, coarse_in_s in refine_candidates:
+                rough_in_s = coarse_in_s
+                is_weighted_seed_candidate = (
+                    skip_coarse_scan
+                    and has_weighted_seeds
+                    and coarse_score > scan_cfg.coarse_candidate_threshold + 0.05
+                )
+                if midpoint_templates[i] is not None and not is_weighted_seed_candidate:
+                    midpoint_t = coarse_in_s + (b.duration_s / 2)
+                    fine_t = refine_timestamp(midpoint_templates[i], midpoint_t, cfg)
+                    rough_in_s = max(0.0, fine_t - (b.duration_s / 2))
+                local_align_window_s = (
+                    min(cfg.vision.local_scan_step_s, cfg.cv.deep_scan.content_align_window_seconds)
+                    if is_weighted_seed_candidate
+                    else None
+                )
+                refined_in_s, sequence_score = refine_in_point_with_sequence(
+                    b,
+                    rough_in_s,
+                    cfg,
+                    search_window_s=local_align_window_s,
+                )
+                scene = _find_scene_for_time(scenes, refined_in_s, cfg)
+                scene_fps = _source_fps_from_scene(scene) if scene is not None else source_fps
+                adjusted_in_s = _apply_start_preroll(refined_in_s, scene_fps, cfg)
+                adjusted_in_s = _clamp_to_scene_start(adjusted_in_s, scene)
+                scene = _find_scene_for_time(scenes, adjusted_in_s, cfg)
+                usable_duration_s, span_score = estimate_usable_source_duration(b, adjusted_in_s, cfg)
+                out_s = adjusted_in_s + usable_duration_s
+                if scene is not None:
+                    out_s = min(out_s, scene.end_s)
+                duration_s = max(0.0, out_s - adjusted_in_s)
+                duration_coverage = min(1.0, duration_s / matchable_duration_s) if matchable_duration_s > 0 else 0.0
+                with open_video(cfg.paths.source_movie) as validation_cap:
+                    original_content_score = _fixed_content_sequence_score(
+                        validation_cap,
+                        adjusted_in_s,
+                        validation_templates,
+                        cfg,
+                    )
+                content_score = original_content_score
+                content_in_s, align_content_score = align_in_point_by_content(
+                    b,
+                    adjusted_in_s,
+                    cfg,
+                    search_window_s=(
+                        local_align_window_s
+                        if local_align_window_s is not None
+                        else min(0.8, cfg.cv.deep_scan.content_align_window_seconds)
+                    ),
+                )
+                if abs(content_in_s - adjusted_in_s) <= cfg.cv.deep_scan.content_align_window_seconds:
+                    with open_video(cfg.paths.source_movie) as validation_cap:
+                        aligned_content_score = _fixed_content_sequence_score(
+                            validation_cap,
+                            content_in_s,
+                            validation_templates,
+                            cfg,
+                        )
+                    if aligned_content_score >= original_content_score + 0.01:
+                        adjusted_in_s = content_in_s
+                        content_score = min(align_content_score, aligned_content_score)
+                        scene = _find_scene_for_time(scenes, adjusted_in_s, cfg)
+                        usable_duration_s = max(0.0, duration_s)
+                        out_s = adjusted_in_s + usable_duration_s
+                        if scene is not None:
+                            out_s = min(out_s, scene.end_s)
+                        duration_s = max(0.0, out_s - adjusted_in_s)
+                        duration_coverage = (
+                            min(1.0, duration_s / matchable_duration_s)
+                            if matchable_duration_s > 0 else 0.0
+                        )
+
+                if is_weighted_seed_candidate and scene is not None and content_score >= content_gate:
+                    contiguous_usable_s = _contiguous_scene_coverage_duration(
+                        b,
+                        adjusted_in_s,
+                        scenes,
+                        matchable_duration_s,
+                        cfg,
+                    )
+                    scene_duration_s = min(b.duration_s, contiguous_usable_s)
+                    if scene_duration_s > duration_s:
+                        usable_duration_s = scene_duration_s
+                        out_s = adjusted_in_s + usable_duration_s
+                        duration_s = usable_duration_s
+                        duration_coverage = (
+                            min(1.0, duration_s / matchable_duration_s)
+                            if matchable_duration_s > 0 else 0.0
+                        )
+                        span_score = max(span_score, content_score)
+
+                final_score = (
+                    sequence_score * scan_cfg.sequence_score_weight
+                    + span_score * scan_cfg.span_score_weight
+                    + coarse_score * scan_cfg.coarse_score_weight
+                    + duration_coverage * scan_cfg.duration_score_weight
+                )
+                final_score = (
+                    final_score * (1.0 - scan_cfg.content_validation_weight)
+                    + content_score * scan_cfg.content_validation_weight
+                )
+                if is_weighted_seed_candidate:
+                    vision_provisional_score = (
+                        content_score * 0.55
+                        + duration_coverage * 0.33
+                        + coarse_score * 0.12
+                    )
+                    final_score = max(final_score, vision_provisional_score)
+                if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate:
+                    final_score = min(final_score, content_score)
+                if content_score < content_gate:
+                    logger.debug(
+                        'Beat %d rejected by content validation in=%.3fs scene=%s content=%.3f min=%.3f',
+                        b.beat_id,
+                        adjusted_in_s,
+                        scene.scene_id if scene is not None else 'none',
+                        content_score,
+                        content_gate,
+                    )
+                    rejected_content_candidates += 1
+                    continue
+                candidate_result = MatchResult(
+                    beat_id=b.beat_id,
+                    scene_id=scene.scene_id if scene is not None else 0,
+                    source_path=cfg.paths.source_movie,
+                    in_point_s=max(0.0, adjusted_in_s),
+                    out_point_s=out_s,
+                    in_point_frame=int(max(0.0, adjusted_in_s) * source_fps),
+                    match_score=final_score,
+                )
+
+                if duration_coverage < scan_cfg.min_duration_coverage:
+                    rejected_short_candidates += 1
+                    logger.debug(
+                        'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
+                        b.beat_id,
+                        adjusted_in_s,
+                        scene.scene_id if scene is not None else 'none',
+                        sequence_score,
+                        span_score,
+                        coarse_score,
+                        content_score,
+                        duration_coverage,
+                        final_score,
+                    )
+                    long_enough_for_review = duration_s >= max(0.5, matchable_duration_s * 0.45)
+                    visually_plausible = (
+                        sequence_score >= scan_cfg.provisional_match_threshold
+                        or final_score >= scan_cfg.provisional_match_threshold
+                    )
+                    if long_enough_for_review and visually_plausible:
+                        if (
+                            best_short_result is None
+                            or candidate_result.match_score
+                            > best_short_result.match_score + scan_cfg.duration_tie_break_score_delta
+                            or (
+                                candidate_result.match_score
+                                >= best_short_result.match_score - scan_cfg.duration_tie_break_score_delta
+                                and duration_coverage > best_short_coverage
+                            )
+                        ):
+                            best_short_result = candidate_result
+                            best_short_coverage = duration_coverage
+                    continue
+
+                logger.debug(
+                    'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
+                    b.beat_id,
+                    adjusted_in_s,
+                    scene.scene_id if scene is not None else 'none',
+                    sequence_score,
+                    span_score,
+                    coarse_score,
+                    content_score,
+                    duration_coverage,
+                    final_score,
+                )
+
+                clearly_better_score = (
+                    best_result is None
+                    or candidate_result.match_score
+                    > best_result.match_score + scan_cfg.duration_tie_break_score_delta
+                )
+                similar_score_better_duration = (
+                    best_result is not None
+                    and candidate_result.match_score
+                    >= best_result.match_score - scan_cfg.duration_tie_break_score_delta
+                    and duration_coverage > best_duration_coverage + 0.03
+                )
+                similar_vision_score_earlier_phase = (
+                    is_weighted_seed_candidate
+                    and best_result is not None
+                    and candidate_result.scene_id == best_result.scene_id
+                    and candidate_result.match_score
+                    >= best_result.match_score - cfg.vision.local_scan_tie_break_score_delta
+                    and content_score >= best_content_score - 0.005
+                    and duration_coverage >= best_duration_coverage - 0.03
+                    and candidate_result.in_point_s < best_result.in_point_s
+                )
+                similar_vision_score_better_phase = (
+                    is_weighted_seed_candidate
+                    and best_result is not None
+                    and candidate_result.scene_id == best_result.scene_id
+                    and candidate_result.match_score
+                    >= best_result.match_score - cfg.vision.local_scan_tie_break_score_delta
+                    and content_score > best_content_score + 0.008
+                    and duration_coverage >= best_duration_coverage - 0.03
+                )
+
+                if (
+                    clearly_better_score
+                    or similar_score_better_duration
+                    or similar_vision_score_earlier_phase
+                    or similar_vision_score_better_phase
+                ):
+                    best_result = candidate_result
+                    best_duration_coverage = duration_coverage
+                    best_content_score = content_score
+
+            if best_result is None:
+                if best_short_result is not None:
+                    logger.warning(
+                        'Beat %d: using short provisional automatic match scene=%d in=%.3fs dur=%.3fs coverage=%.2f score=%.3f',
+                        b.beat_id,
+                        best_short_result.scene_id,
+                        best_short_result.in_point_s,
+                        best_short_result.duration_s,
+                        best_short_coverage,
+                        best_short_result.match_score,
+                    )
+                    best_result = best_short_result
+                    best_duration_coverage = best_short_coverage
+                else:
+                    if rejected_content_candidates > 0 and rejected_short_candidates == 0:
+                        logger.warning(
+                            'Beat %d: NO MATCH after refinement (%d candidates rejected by content validation)',
+                            b.beat_id,
+                            rejected_content_candidates,
+                        )
+                    else:
+                        logger.warning(
+                            'Beat %d: NO MATCH after refinement (%d candidates rejected below %.0f%% duration coverage, %d by content validation)',
+                            b.beat_id,
+                            rejected_short_candidates,
+                            scan_cfg.min_duration_coverage * 100.0,
+                            rejected_content_candidates,
+                        )
+                    continue
+            is_confirmed = best_result.match_score >= cfg.cv.deep_scan.match_threshold
+            if best_result.match_score < cfg.cv.deep_scan.provisional_match_threshold:
+                logger.warning(
+                    'Beat %d: NO MATCH after refinement (best final score %.3f, provisional threshold %.3f)',
+                    b.beat_id,
+                    best_result.match_score,
+                    cfg.cv.deep_scan.provisional_match_threshold,
+                )
+                continue
+            if not is_confirmed:
+                logger.warning(
+                    'Beat %d: provisional automatic match scene=%d in=%.3fs score=%.3f (confirmed threshold %.3f)',
+                    b.beat_id,
+                    best_result.scene_id,
+                    best_result.in_point_s,
+                    best_result.match_score,
+                    cfg.cv.deep_scan.match_threshold,
+                )
+
+            logger.info(
+                'Beat %d: best automatic match scene=%d in=%.3fs dur=%.3fs coverage=%.2f score=%.3f',
+                b.beat_id,
+                best_result.scene_id,
+                best_result.in_point_s,
+                best_result.duration_s,
+                best_duration_coverage,
+                best_result.match_score,
+            )
+
+            results.append(MatchResult(
+                beat_id=b.beat_id,
+                scene_id=best_result.scene_id,
+                source_path=cfg.paths.source_movie,
+                in_point_s=best_result.in_point_s,
+                out_point_s=best_result.out_point_s,
+                in_point_frame=best_result.in_point_frame,
+                match_score=best_result.match_score,
+                is_confirmed=is_confirmed,
+            ))
+        else:
+            logger.warning(
+                'Beat %d: NO MATCH (best coarse score %.3f, coarse threshold %.3f)',
+                b.beat_id,
+                score,
+                cfg.cv.deep_scan.coarse_candidate_threshold,
+            )
+
+    if skip_coarse_scan and not results and cfg.vision.fullscan_fallback:
+        logger.warning(
+            '[Global Scan] Weighted vision-seed pass found no valid matches; retrying with full FFmpeg coarse scan.'
+        )
+        retry_cfg = replace(
+            cfg,
+            cv=replace(
+                cfg.cv,
+                deep_scan=replace(cfg.cv.deep_scan, skip_coarse_scan_with_weighted_seeds=False),
+            ),
+        )
+        return run_global_scan(beats, retry_cfg, scenes=scenes, seed_in_points=seed_in_points)
+
+    return results
diff --git a/src/cv/scene_indexer.py b/src/cv/scene_indexer.py
new file mode 100644
index 0000000..10bb47e
--- /dev/null
+++ b/src/cv/scene_indexer.py
@@ -0,0 +1,229 @@
+"""
+src/cv/scene_indexer.py — Source-movie scene segmentation + fingerprinting
+
+Responsibility:
+  1. Run PySceneDetect on the source movie → list of raw scene boundaries
+  2. For each scene, extract the midpoint frame and fingerprint it
+  3. Optionally run Whisper dialogue on each scene (injected as dependency)
+  4. Persist results to .cache/ as JSON for fast re-runs
+
+Returns: list[Scene] with luma_hist, sat_hist, phash populated.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import pickle
+from pathlib import Path
+from typing import Callable, Sequence
+
+import numpy as np
+
+from src.core.config import AppConfig
+from src.core.models import Scene
+from src.cv.fingerprinting import fingerprint_frame
+from src.cv.frame_extractor import grab_midpoint_frame, open_video
+
+logger = logging.getLogger(__name__)
+
+# Type alias for an optional dialogue-injection callback
+DialogueCallback = Callable[[Scene], Scene]
+
+
+# ---------------------------------------------------------------------------
+# Cache helpers
+# ---------------------------------------------------------------------------
+
+def _cache_path(cfg: AppConfig) -> Path:
+    """Return <cache_dir>/scene_index.json, creating the cache directory when missing."""
+    cfg.paths.cache_dir.mkdir(parents=True, exist_ok=True)
+    return cfg.paths.cache_dir / "scene_index.json"
+
+
+def _scene_to_dict(s: Scene) -> dict:
+    """Flatten a Scene into a JSON-serialisable dict (histogram bytes become hex strings)."""
+    luma_hex = s.luma_hist.hex() if s.luma_hist else None
+    sat_hex = s.sat_hist.hex() if s.sat_hist else None
+    return dict(
+        scene_id=s.scene_id,
+        source_path=str(s.source_path),
+        start_s=s.start_s,
+        end_s=s.end_s,
+        start_frame=s.start_frame,
+        end_frame=s.end_frame,
+        luma_hist=luma_hex, sat_hist=sat_hex, phash=s.phash,
+    )
+
+
+def _scene_from_dict(d: dict) -> Scene:
+    """Rebuild a Scene from a cached dict; missing or empty histogram entries load as None."""
+    luma_hex, sat_hex = d.get("luma_hist"), d.get("sat_hist")
+    return Scene(
+        scene_id=d["scene_id"],
+        source_path=Path(d["source_path"]),
+        start_s=d["start_s"], end_s=d["end_s"],
+        start_frame=d["start_frame"], end_frame=d["end_frame"],
+        luma_hist=bytes.fromhex(luma_hex) if luma_hex else None,
+        sat_hist=bytes.fromhex(sat_hex) if sat_hex else None,
+        phash=d.get("phash"),
+    )
+
+
+def _save_cache(scenes: list[Scene], cfg: AppConfig) -> None:
+    rows = [_scene_to_dict(scene) for scene in scenes]  # JSON-safe dicts, one per scene
+    _cache_path(cfg).write_text(data=json.dumps(rows, indent=2), encoding="utf-8")
+    logger.info("Scene index cached → %s (%d scenes)", _cache_path(cfg), len(scenes))
+
+
+def _load_cache(cfg: AppConfig) -> list[Scene] | None:
+    """Return the cached scene list, or None when the cache file is absent or unreadable."""
+    cache_file = _cache_path(cfg)
+    if cache_file.exists():
+        try:
+            records = json.loads(cache_file.read_text(encoding="utf-8"))
+            loaded = list(map(_scene_from_dict, records))
+            logger.info("Loaded %d scenes from cache (%s)", len(loaded), cache_file)
+            return loaded
+        except Exception as exc:  # best-effort read: any failure just forces a re-index
+            logger.warning("Cache corrupt, re-indexing: %s", exc)
+    return None
+
+
+# ---------------------------------------------------------------------------
+# PySceneDetect integration
+# ---------------------------------------------------------------------------
+
+def _detect_scenes_pyscenedetect(cfg: AppConfig) -> list[tuple[float, float, int, int]]:
+    """
+    Segment the source movie with PySceneDetect's ContentDetector.
+
+    Returns:
+        One (start_s, end_s, start_frame, end_frame) tuple per detected scene.
+
+    Raises:
+        ImportError: if the scenedetect package is not installed.
+    """
+    try:
+        from scenedetect import open_video as sd_open_video, SceneManager
+        from scenedetect.detectors import ContentDetector
+    except ImportError:
+        raise ImportError(
+            "scenedetect is not installed. Run: pip install scenedetect[opencv]"
+        )
+
+    video = sd_open_video(str(cfg.paths.source_movie))
+    manager = SceneManager()
+    detection_cfg = cfg.scene_detection
+    # The configured minimum scene duration (seconds) is converted to whole frames.
+    min_len_frames = int(detection_cfg.min_scene_duration_s * video.frame_rate)
+    manager.add_detector(
+        ContentDetector(threshold=detection_cfg.content_threshold, min_scene_len=min_len_frames)
+    )
+
+    logger.info("Detecting scenes in %s …", cfg.paths.source_movie.name)
+    manager.detect_scenes(video=video, show_progress=True)
+
+    result: list[tuple[float, float, int, int]] = [
+        (
+            start_tc.get_seconds(),
+            end_tc.get_seconds(),
+            start_tc.get_frames(),
+            end_tc.get_frames(),
+        )
+        for start_tc, end_tc in manager.get_scene_list()
+    ]
+
+    logger.info("PySceneDetect found %d scenes.", len(result))
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Fingerprint enrichment
+# ---------------------------------------------------------------------------
+
+def _fingerprint_scenes(
+    raw_scenes: list[tuple[float, float, int, int]],
+    cfg: AppConfig,
+) -> list[Scene]:
+    """
+    Build one Scene per raw boundary, fingerprinted from its midpoint frame.
+
+    A scene whose midpoint frame cannot be decoded is still emitted, just
+    without luma_hist / sat_hist / phash (a warning is logged for it).
+    Scene ids are the positions of the boundaries in raw_scenes.
+    """
+    scenes: list[Scene] = []
+    vc_cfg = cfg.cv.vibe_check
+    total = len(raw_scenes)
+
+    logger.info("Fingerprinting %d scenes …", total)
+
+    with open_video(cfg.paths.source_movie) as cap:
+        for idx, (start_s, end_s, start_frame, end_frame) in enumerate(raw_scenes):
+            frame = grab_midpoint_frame(cap, start_s, end_s)
+            decoded = frame is not None
+
+            fingerprint: dict = {}
+            if decoded:
+                luma_bytes, sat_bytes, phash_hex = fingerprint_frame(frame, vc_cfg)
+                fingerprint = {"luma_hist": luma_bytes, "sat_hist": sat_bytes, "phash": phash_hex}
+            else:
+                logger.warning("Scene %d: midpoint frame decode failed, skipping fingerprint.", idx)
+
+            scenes.append(Scene(
+                scene_id=idx,
+                source_path=cfg.paths.source_movie,
+                start_s=start_s, end_s=end_s,
+                start_frame=start_frame, end_frame=end_frame,
+                **fingerprint,
+            ))
+
+            # Progress is only reported on iterations whose frame decoded.
+            if decoded and (idx + 1) % 50 == 0:
+                logger.info("  … %d / %d scenes fingerprinted", idx + 1, total)
+
+    return scenes
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def build_scene_index(
+    cfg: AppConfig,
+    force_reindex: bool = False,
+    dialogue_callback: DialogueCallback | None = None,
+) -> list[Scene]:
+    """
+    Return the scene index for the source movie, from cache when possible.
+
+    Steps:
+      1. Unless force_reindex is set, try the JSON scene cache first.
+      2. On a miss: detect scenes via PySceneDetect → fingerprint → cache.
+      3. If given, map dialogue_callback over the scenes (cached or fresh).
+         This happens after caching, so its output is never persisted.
+
+    Args:
+        cfg:               Application configuration.
+        force_reindex:     Ignore cache and re-run detection + fingerprinting.
+        dialogue_callback: Optional function Scene → Scene that adds dialogue.
+                           Injected here so this module stays audio-free.
+
+    Returns:
+        List of Scene objects (fingerprints populated where the frame decoded).
+    """
+    scenes = None if force_reindex else _load_cache(cfg)
+
+    if scenes is None:
+        # Cache bypassed or unusable: rebuild the index and persist it.
+        raw = _detect_scenes_pyscenedetect(cfg)
+        scenes = _fingerprint_scenes(raw, cfg)
+        _save_cache(scenes, cfg)
+
+    if dialogue_callback:
+        # Each scene is replaced by whatever the callback returns for it.
+        scenes = [dialogue_callback(scene) for scene in scenes]
+
+    return scenes
+
diff --git a/src/cv/vibe_check.py b/src/cv/vibe_check.py
new file mode 100644
index 0000000..ed1d1fd
--- /dev/null
+++ b/src/cv/vibe_check.py
@@ -0,0 +1,190 @@
+"""
+src/cv/vibe_check.py — Phase 1: Scene-level histogram / pHash filter
+
+Responsibility:
+  Given ONE TrailerBeat (with pre-computed fingerprints) and a list of
+  source Scenes (also fingerprinted), return the Top-K candidates ranked
+  by a combined histogram + pHash score.
+
+This module contains ZERO file I/O and ZERO frame decoding — those live
+in the pipeline layer. Input = model objects, output = sorted VibeHit list.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import replace
+from typing import Sequence
+
+import cv2
+import numpy as np
+
+from src.core.models import Scene, TrailerBeat, VibeHit
+from src.cv.fingerprinting import bytes_to_hist, phash_distance
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Scoring
+# ---------------------------------------------------------------------------
+
+# Weights of the histogram and pHash terms in the combined metric (they sum to 1.0).
+# pHash gets less weight because it's sensitive to text overlays on source.
+_HIST_WEIGHT  = 0.70
+_PHASH_WEIGHT = 0.30
+_PHASH_MAX_BITS = 64  # maximum possible Hamming distance; also used as the "no hash" distance
+
+
+def _hist_combined_score(
+    beat: TrailerBeat,
+    scene: Scene,
+    hist_method: int,
+) -> float:
+    """
+    Mean of the luma and saturation histogram comparison scores.
+
+    HISTCMP_BHATTACHARYYA distances are flipped to ``1 - d`` so that higher
+    means more similar; for every other method the raw cv2.compareHist
+    values are averaged unchanged.  Returns 0.0 when any of the four
+    histograms is missing.
+    NOTE(review): other distance-style methods (e.g. HISTCMP_CHISQR) are not
+    inverted here, so for them a higher value means *less* similar.
+    """
+    histogram_pairs = (
+        (beat.luma_hist, scene.luma_hist),
+        (beat.sat_hist, scene.sat_hist),
+    )
+    if any(ours is None or theirs is None for ours, theirs in histogram_pairs):
+        return 0.0
+
+    luma_score, sat_score = (
+        cv2.compareHist(bytes_to_hist(ours), bytes_to_hist(theirs), hist_method)
+        for ours, theirs in histogram_pairs
+    )
+
+    # Bhattacharyya is a distance in [0, 1]: turn it into a similarity.
+    if hist_method == cv2.HISTCMP_BHATTACHARYYA:
+        luma_score = 1.0 - float(luma_score)
+        sat_score = 1.0 - float(sat_score)
+
+    return float((luma_score + sat_score) / 2.0)
+
+
+def _phash_score(beat: TrailerBeat, scene: Scene) -> float:
+    """
+    Map the pHash Hamming distance between beat and scene onto a similarity.
+
+    Distance 0 gives 1.0 and distance _PHASH_MAX_BITS (64) gives 0.0; a
+    missing hash on either side gives 0.0.
+    """
+    hashes_present = beat.phash is not None and scene.phash is not None
+    if not hashes_present:
+        return 0.0
+    return 1.0 - (phash_distance(beat.phash, scene.phash) / _PHASH_MAX_BITS)
+
+
+def _combined_score(
+    beat: TrailerBeat,
+    scene: Scene,
+    hist_method: int,
+) -> float:
+    """Blend histogram and pHash similarity using _HIST_WEIGHT / _PHASH_WEIGHT."""
+    weighted_hist = _HIST_WEIGHT * _hist_combined_score(beat, scene, hist_method)
+    weighted_phash = _PHASH_WEIGHT * _phash_score(beat, scene)
+    return weighted_hist + weighted_phash
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def run_vibe_check(
+    beat: TrailerBeat,
+    scenes: Sequence[Scene],
+    top_k: int,
+    hist_method: int,
+    phash_max_distance: int,
+) -> list[VibeHit]:
+    """
+    Phase 1: Score all source scenes against one trailer beat and return
+    the top-K candidates for Deep Scan.
+
+    Args:
+        beat:               The trailer beat to match (must have fingerprints).
+        scenes:             All detected scenes from the source movie.
+        top_k:              Maximum number of candidates to return.
+        hist_method:        cv2.HISTCMP_* constant (e.g. 0 = CORREL).
+        phash_max_distance: Scenes with pHash Hamming distance > this value
+                            are excluded before ranking (hard filter).
+
+    Returns:
+        List of VibeHit, sorted by combined_score descending, length ≤ top_k.
+        Empty list if beat has no fingerprints or no scenes pass the filter.
+    """
+    if beat.luma_hist is None and beat.phash is None:
+        logger.warning(
+            "Beat %d has no fingerprints — skipping Vibe Check.", beat.beat_id
+        )
+        return []
+
+    candidates: list[VibeHit] = []
+
+    for scene in scenes:
+        # The Hamming distance is computed once per scene and reused for the
+        # hard filter, the similarity term and the VibeHit record.
+        both_hashed = bool(beat.phash and scene.phash)
+        dist = phash_distance(beat.phash, scene.phash) if both_hashed else _PHASH_MAX_BITS
+        # Hard pHash filter: skip scenes that are too visually distant
+        if both_hashed and dist > phash_max_distance:
+            continue  # fast rejection — avoids full histogram compare
+
+        hist = _hist_combined_score(beat, scene, hist_method)
+        # Same formula as _phash_score(); the helper still handles missing/empty hashes.
+        phash = 1.0 - (dist / _PHASH_MAX_BITS) if both_hashed else _phash_score(beat, scene)
+        combined = _HIST_WEIGHT * hist + _PHASH_WEIGHT * phash
+
+        candidates.append(VibeHit(
+            beat_id=beat.beat_id,
+            scene_id=scene.scene_id,
+            hist_score=round(hist, 4),
+            phash_distance=dist,
+            combined_score=round(combined, 4),
+        ))
+
+    # Sort by combined score, descending; return top-K
+    candidates.sort(key=lambda h: h.combined_score, reverse=True)
+    top = candidates[:top_k]
+
+    logger.info(
+        "Vibe Check beat=%d: %d scenes scored, %d candidates forwarded to Deep Scan. "
+        "Best score: %.3f (scene %s)",
+        beat.beat_id,
+        len(candidates),
+        len(top),
+        top[0].combined_score if top else 0.0,
+        top[0].scene_id if top else "—",
+    )
+
+    return top
+
+
+
+def batch_vibe_check(
+    beats: Sequence[TrailerBeat],
+    scenes: Sequence[Scene],
+    top_k: int,
+    hist_method: int,
+    phash_max_distance: int,
+) -> dict[int, list[VibeHit]]:
+    """
+    Run run_vibe_check() once per beat and collect the hits keyed by beat_id.
+
+    Thin convenience wrapper for the pipeline layer; all arguments are passed through.
+    """
+    hits_by_beat: dict[int, list[VibeHit]] = {}
+    for beat in beats:
+        hits_by_beat[beat.beat_id] = run_vibe_check(
+            beat, scenes, top_k, hist_method, phash_max_distance
+        )
+    return hits_by_beat
diff --git a/src/export/__init__.py b/src/export/__init__.py
new file mode 100644
index 0000000..da61106
--- /dev/null
+++ b/src/export/__init__.py
@@ -0,0 +1 @@
+# src.export package — FCPXML / EDL export
diff --git a/src/export/edl_writer.py b/src/export/edl_writer.py
new file mode 100644
index 0000000..d593b99
--- /dev/null
+++ b/src/export/edl_writer.py
@@ -0,0 +1,114 @@
+"""
+src/export/edl_writer.py — EditTimeline → CMX 3600 EDL
+
+Generates a standard CMX 3600 Edit Decision List compatible with
+Avid, DaVinci Resolve, Premiere Pro, and most NLEs.
+
+CMX 3600 format reference:
+  https://en.wikipedia.org/wiki/Edit_decision_list#CMX_3600
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+from src.core.config import AppConfig
+from src.core.models import EditClip, EditTimeline
+from src.export.timecode import seconds_to_smpte
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# EDL line builders
+# ---------------------------------------------------------------------------
+
+def _edl_header(title: str) -> str:
+    return "\n".join((f"TITLE: {title}", "FCM: NON-DROP FRAME", ""))  # two newline-terminated header lines
+
+
+def _edl_event(
+    event_num: int,
+    clip: EditClip,
+    fps: float,
+) -> str:
+    """
+    Render one clip as a CMX 3600 video cut event (reel AX) plus comment lines.
+
+    Layout of the returned, newline-terminated text:
+        NNN  AX  V  C  <SRC_IN> <SRC_OUT> <REC_IN> <REC_OUT>
+        * FROM CLIP NAME: <source file name>
+        * BEAT <id> | <beat type> | score=<match score>
+    """
+    src_start_s = clip.match.in_point_s
+    rec_start_s = clip.timeline_start_s
+    span_s = clip.source_timeline_duration_s  # same length on the source and record side
+    src_in, src_out, rec_in, rec_out = (
+        seconds_to_smpte(seconds, fps)
+        for seconds in (src_start_s, src_start_s + span_s, rec_start_s, rec_start_s + span_s)
+    )
+
+    beat, match = clip.beat, clip.match
+    return (
+        f"{event_num:03d}  AX       V     C        {src_in} {src_out} {rec_in} {rec_out}\n"
+        f"* FROM CLIP NAME:  {match.source_path.name}\n"
+        f"* BEAT {beat.beat_id:03d} | {beat.beat_type.name} | score={match.match_score:.3f}\n"
+    )
+
+
+def _edl_black_tail_event(event_num: int, clip: EditClip, fps: float) -> str:
+    """Build the BL (black) event covering the record-side gap after a clip's source material."""
+    tail_start_s = clip.timeline_start_s + clip.source_timeline_duration_s
+    rec_in = seconds_to_smpte(tail_start_s, fps)
+    rec_out = seconds_to_smpte(clip.timeline_end_s, fps)
+    event_line = f"{event_num:03d}  BL       V     C        00:00:00:00 00:00:00:00 {rec_in} {rec_out}"
+    comment_line = f"* BEAT {clip.beat.beat_id:03d} TRAILER-ONLY TAIL | add fade/dissolve to black"
+    # Source in/out are zeroed placeholders: black has no source timecode.
+    return f"{event_line}\n* FROM CLIP NAME:  BLACK\n{comment_line}\n"
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def write_edl(
+    timeline: EditTimeline,
+    cfg: AppConfig,
+    output_path: Path | None = None,
+) -> Path:
+    """
+    Write the EditTimeline as a CMX 3600 EDL file.
+
+    Clips are emitted in clip_index order; a clip with trailer_tail_s > 0 is
+    followed by an extra black (BL) event, which also consumes an event number.
+
+    Args:
+        timeline:    EditTimeline from build_timeline().
+        cfg:         Application configuration.
+        output_path: Override destination. Defaults to
+                     <output_dir>/<timeline.title>.edl.
+
+    Returns:
+        Path to the written .edl file.
+    """
+    if output_path is None:
+        output_path = cfg.paths.output_dir / f"{timeline.title}.edl"
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    fps = timeline.frame_rate
+    lines = [_edl_header(timeline.title), "\n"]
+    event_num = 1
+    for clip in sorted(timeline.clips, key=lambda c: c.clip_index):
+        lines.append(_edl_event(event_num, clip, fps))
+        event_num += 1
+        if clip.trailer_tail_s > 0:
+            lines.append("\n")
+            lines.append(_edl_black_tail_event(event_num, clip, fps))
+            event_num += 1
+        lines.append("\n")
+
+    output_path.write_text("\n".join(lines), encoding="utf-8")
+    # event_num is the *next* free number, so event_num - 1 events were written.
+    logger.info("EDL written → %s (%d events)", output_path, event_num - 1)
+    return output_path
diff --git a/src/export/fcpxml_writer.py b/src/export/fcpxml_writer.py
new file mode 100644
index 0000000..bba4098
--- /dev/null
+++ b/src/export/fcpxml_writer.py
@@ -0,0 +1,222 @@
+"""
+src/export/fcpxml_writer.py — EditTimeline → Final Cut Pro XML (FCPXML 1.10)
+
+Generates a standards-compliant FCPXML file that can be imported directly
+into Final Cut Pro X, DaVinci Resolve, or Premiere Pro (via FCPXML plugin).
+
+Spec reference: https://developer.apple.com/documentation/professional_video_applications/fcpxml_reference
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from urllib.parse import quote
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element, SubElement
+
+from src.core.config import AppConfig
+from src.core.models import EditClip, EditTimeline
+from src.export.timecode import (
+    fcpxml_format_name,
+    fcpxml_frame_duration,
+    seconds_to_fcpxml,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Asset registry — one <asset> per unique source file
+# ---------------------------------------------------------------------------
+
+class _AssetRegistry:
+    def __init__(self) -> None:
+        self._assets: dict[Path, str] = {}   # source path → resource id
+        self._counter = 2                    # next id suffix; "r1" is the format
+
+    def get_or_create(self, path: Path) -> str:
+        known = self._assets.get(path)       # id handed out earlier, if any
+        if known is None:
+            known = self._assets[path] = f"r{self._counter}"
+            self._counter += 1
+        return known
+
+    @property
+    def items(self) -> dict[Path, str]:
+        return {**self._assets}              # copy: callers cannot mutate the registry
+
+
+# ---------------------------------------------------------------------------
+# Builder
+# ---------------------------------------------------------------------------
+
+def _path_to_url(path: Path) -> str:
+    """Render *path* as the percent-encoded file:// URL used for asset src."""
+    as_posix = path.as_posix()
+    # A Windows drive path ("C:/foo") carries no leading slash, but the URL
+    # form needs one ("/C:/foo"); POSIX absolute paths already have it.
+    rooted = as_posix if as_posix.startswith("/") else "/" + as_posix
+    return "file://" + quote(rooted, safe="/:@")
+
+
+def build_fcpxml(
+    timeline: EditTimeline,
+    cfg: AppConfig,
+    source_duration_s: float = 7200.0,  # 2-hour fallback if not probed
+) -> ET.ElementTree:
+    """
+    Build a complete FCPXML ElementTree from an EditTimeline.
+
+    <resources> receives one <format> ("r1") and one <asset> per unique source
+    path; the <spine> receives one <clip> per EditClip in clip_index order,
+    followed by a <gap> with a marker when the clip has a trailer-only tail.
+
+    Args:
+        timeline:          Ordered sequence of EditClips.
+        cfg:               Application configuration.
+        source_duration_s: Fallback <asset> duration, used only for source
+                           files whose real duration cannot be probed.
+
+    Returns:
+        xml.etree.ElementTree.ElementTree — call .write() to serialise.
+    """
+    fps = timeline.frame_rate
+
+    # ---- root ---------------------------------------------------------------
+    root = Element("fcpxml", version=cfg.export.fcpxml_version)
+    root.set("xmlns", "http://www.apple.com/dt/FCPXML/1_10")
+
+    # ---- resources ----------------------------------------------------------
+    resources = SubElement(root, "resources")
+    format_id = "r1"
+    SubElement(resources, "format",
+        id=format_id,
+        name=fcpxml_format_name(fps),
+        frameDuration=fcpxml_frame_duration(fps),
+        width="1920",
+        height="1080",
+        colorSpace="1-1-1 (Rec. 709)",
+    )
+
+    # Register every source path first so all <asset>s land in <resources>.
+    registry = _AssetRegistry()
+    for clip in timeline.clips:
+        registry.get_or_create(clip.match.source_path)
+
+    # Resolve the prober lazily and only once; if it is unavailable, every
+    # asset simply uses the source_duration_s fallback.
+    try:
+        from src.cv.frame_extractor import get_video_info
+    except Exception:
+        get_video_info = None
+
+    for path, rid in registry.items.items():
+        asset_duration_s = source_duration_s
+        if get_video_info is not None:
+            try:
+                asset_duration_s = float(get_video_info(path)["duration_s"])
+            except Exception as exc:  # best effort: never abort the export
+                logger.warning("Cannot probe %s (%s); using fallback duration.", path, exc)
+        SubElement(resources, "asset",
+            id=rid,
+            name=path.stem,
+            src=_path_to_url(path),
+            start="0s",
+            duration=seconds_to_fcpxml(asset_duration_s, fps),
+            hasVideo="1",
+            hasAudio="1",
+            format=format_id,
+        )
+
+    # ---- library / event / project ------------------------------------------
+    library = SubElement(root, "library")
+    event   = SubElement(library, "event", name=timeline.title)
+    project = SubElement(event, "project", name=timeline.title)
+    sequence = SubElement(project, "sequence",
+        duration=seconds_to_fcpxml(timeline.total_duration_s, fps),
+        format=format_id,
+        tcStart="0s",
+        tcFormat="NDF",
+        audioLayout="stereo",
+        audioRate="48k",
+    )
+    spine = SubElement(sequence, "spine")
+
+    # ---- clips --------------------------------------------------------------
+    for clip in sorted(timeline.clips, key=lambda c: c.clip_index):
+        # Own local name: the source_duration_s parameter is the asset fallback.
+        matched_s = clip.source_timeline_duration_s
+        clip_elem = SubElement(spine, "clip",
+            name=f"Beat_{clip.beat.beat_id:03d}_{clip.beat.beat_type.name}",
+            ref=registry.get_or_create(clip.match.source_path),
+            # offset = position on the timeline
+            offset=seconds_to_fcpxml(clip.timeline_start_s, fps),
+            # duration = matched source part only; trailer-only tails become gaps.
+            duration=seconds_to_fcpxml(matched_s, fps),
+            # start = in-point inside the source asset
+            start=seconds_to_fcpxml(clip.match.in_point_s, fps),
+        )
+
+        # Inline audio role
+        SubElement(clip_elem, "audio",
+            role="dialogue",
+            srcCh="1, 2",
+            outCh="L, R",
+        )
+
+        if clip.trailer_tail_s > 0:
+            gap = SubElement(spine, "gap",
+                name=f"Beat_{clip.beat.beat_id:03d}_TRAILER_TAIL_BLACK_FADE",
+                offset=seconds_to_fcpxml(clip.timeline_start_s + matched_s, fps),
+                duration=seconds_to_fcpxml(clip.trailer_tail_s, fps),
+                start="0s",
+            )
+            SubElement(gap, "marker",
+                start="0s",
+                value="Trailer-only tail: add fade/dissolve to black here",
+                completed="0",
+            )
+
+    return ET.ElementTree(root)
+
+
+# ---------------------------------------------------------------------------
+# Writer
+# ---------------------------------------------------------------------------
+
+def write_fcpxml(
+    timeline: EditTimeline,
+    cfg: AppConfig,
+    output_path: Path | None = None,
+) -> Path:
+    """
+    Build the FCPXML document for *timeline* and save it to disk.
+
+    The parent directory is created if missing, and the file is written as
+    UTF-8 with a hand-made XML declaration and DOCTYPE in front of the tree.
+
+    Args:
+        timeline:    EditTimeline from build_timeline().
+        cfg:         Application configuration.
+        output_path: Destination file. When None, falls back to
+                     <cfg.paths.output_dir>/<timeline.title>.fcpxml.
+
+    Returns:
+        Path to the written .fcpxml file.
+    """
+    destination = output_path
+    if destination is None:
+        destination = cfg.paths.output_dir / f"{timeline.title}.fcpxml"
+    destination.parent.mkdir(parents=True, exist_ok=True)
+
+    # ElementTree cannot emit a DOCTYPE, so the prolog is prepended by hand to
+    # the body, which is serialised as text without its own declaration.
+    prolog = '<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE fcpxml>\n'
+
+    document = build_fcpxml(timeline, cfg).getroot()
+    body = ET.tostring(document, encoding="unicode", xml_declaration=False)
+    destination.write_text(prolog + body, encoding="utf-8")
+
+    logger.info("FCPXML written → %s (%d clips)", destination, timeline.clip_count)
+    return destination
diff --git a/src/export/timecode.py b/src/export/timecode.py
new file mode 100644
index 0000000..89a6ffd
--- /dev/null
+++ b/src/export/timecode.py
@@ -0,0 +1,146 @@
+"""
+src/export/timecode.py — Timecode / rational-time conversion helpers
+
+FCPXML uses rational fractions ("1001/24000s") for all time values.
+EDL uses SMPTE timecode strings ("HH:MM:SS:FF").
+
+All conversion functions are pure — no I/O, no state.
+"""
+
+from __future__ import annotations
+
+import math
+from fractions import Fraction
+
+
+# ---------------------------------------------------------------------------
+# Common frame-rate denominators
+# ---------------------------------------------------------------------------
+
+_FPS_RATIONAL: dict[float, tuple[int, int]] = {  # nominal fps → exact (num, den)
+    23.976: (24000, 1001),
+    24.0:   (24,    1),
+    25.0:   (25,    1),
+    29.97:  (30000, 1001),
+    30.0:   (30,    1),
+    50.0:   (50,    1),
+    59.94:  (60000, 1001),
+    60.0:   (60,    1),
+}
+
+_TOLERANCE = 0.01  # largest |fps - table entry| still treated as that entry
+
+
+def _fps_to_rational(fps: float) -> tuple[int, int]:
+    """Express *fps* as (numerator, denominator): table first, else Fraction."""
+    near = [pair for ref, pair in _FPS_RATIONAL.items() if abs(fps - ref) < _TOLERANCE]
+    if near:
+        return near[0]  # first table entry within tolerance wins
+    # Unlisted rate: nearest fraction whose denominator does not exceed 1001.
+    approx = Fraction(fps).limit_denominator(1001)
+    return approx.numerator, approx.denominator
+
+
+# ---------------------------------------------------------------------------
+# Seconds → FCPXML rational string
+# ---------------------------------------------------------------------------
+
+def seconds_to_fcpxml(seconds: float, fps: float) -> str:
+    """
+    Convert *seconds* to an FCPXML rational time string, snapped to a frame.
+
+    The value is rounded to the nearest whole frame and written as a reduced
+    fraction of seconds. Example: 10.0s @23.976fps → 240 frames → "1001/100s".
+
+    Args:
+        seconds: Time in seconds (float).
+        fps:     Project frame rate.
+
+    Returns:
+        FCPXML time string, e.g. "1001/100s"; "0s" when it rounds to frame 0.
+    """
+    if seconds == 0.0:
+        return "0s"
+    num, den = _fps_to_rational(fps)          # frames per second = num/den
+    # seconds × (num/den) = frames (float); round to nearest frame
+    frames = round(seconds * num / den)
+    if frames == 0:
+        return "0s"  # sub-half-frame input; previously came out as "0/1s"
+    # frames ÷ (num/den) = frames × den/num  → rational seconds
+    total_num = frames * den
+    total_den = num
+    g = math.gcd(total_num, total_den)
+    return f"{total_num // g}/{total_den // g}s"
+
+
+def seconds_to_frame_count(seconds: float, fps: float) -> int:
+    """Return the whole-frame count nearest to *seconds* at *fps*."""
+    return round(fps * seconds)
+
+
+# ---------------------------------------------------------------------------
+# Seconds → SMPTE timecode (for EDL)
+# ---------------------------------------------------------------------------
+
+def seconds_to_smpte(seconds: float, fps: float, drop_frame: bool = False) -> str:
+    """
+    Format *seconds* as a non-drop-frame SMPTE timecode, "HH:MM:SS:FF".
+
+    Frames are counted at the real rate, then split into fields using the
+    rate rounded to an integer. Drop-frame (;) output is not implemented.
+
+    Args:
+        seconds:    Time in float seconds.
+        fps:        Frame rate (23.976, 24, 25, etc.).
+        drop_frame: Ignored; placeholder for future DF support.
+
+    Returns:
+        "HH:MM:SS:FF" string.
+    """
+    frame_total = seconds_to_frame_count(seconds, fps)
+    frames_per_tc_second = round(fps)  # e.g. 23.976 → 24
+
+    # Peel the fields off from least to most significant.
+    whole_seconds, ff = divmod(frame_total, frames_per_tc_second)
+    whole_minutes, ss = divmod(whole_seconds, 60)
+    hh, mm = divmod(whole_minutes, 60)
+
+    fields = (hh, mm, ss, ff)
+    return ":".join(f"{field:02d}" for field in fields)
+
+
+# ---------------------------------------------------------------------------
+# FCPXML format ID helpers
+# ---------------------------------------------------------------------------
+
+def fcpxml_format_name(fps: float, width: int = 1920, height: int = 1080) -> str:
+    """
+    Return an FCPXML format name string for a given frame rate and resolution.
+
+    Rates within _TOLERANCE of a listed one share its tag, so a probed
+    23.976023... yields "2398" rather than "2397"; unlisted rates fall back
+    to int(fps * 100). *width* is accepted but does not affect the name.
+    Example: fps=23.976, 1080p → "FFVideoFormat1080p2398"
+    """
+    known_tags = {
+        23.976: "2398", 24.0: "24", 25.0: "25", 29.97: "2997",
+        30.0: "30", 50.0: "50", 59.94: "5994", 60.0: "60",
+    }
+    fps_tag = next(
+        (tag for ref, tag in known_tags.items() if abs(fps - ref) < _TOLERANCE),
+        str(int(fps * 100)),
+    )
+    return f"FFVideoFormat{height}p{fps_tag}"
+
+
+def fcpxml_frame_duration(fps: float) -> str:
+    """
+    Return the FCPXML frameDuration attribute value for *fps*.
+
+    One frame lasts 1/fps seconds, i.e. the rate's fraction turned upside
+    down: 23.976fps = 24000/1001 frames per second → "1001/24000s".
+    """
+    frames, per_seconds = _fps_to_rational(fps)   # fps = frames / per_seconds
+    common = math.gcd(frames, per_seconds)
+    seconds_part, frames_part = per_seconds // common, frames // common
+    return f"{seconds_part}/{frames_part}s"
diff --git a/src/llm/__init__.py b/src/llm/__init__.py
new file mode 100644
index 0000000..a20d165
--- /dev/null
+++ b/src/llm/__init__.py
@@ -0,0 +1 @@
+# src.llm package — Thematic segmentation / dramaturgy (NO vision matching)
diff --git a/src/llm/dramaturg.py b/src/llm/dramaturg.py
new file mode 100644
index 0000000..defcd18
--- /dev/null
+++ b/src/llm/dramaturg.py
@@ -0,0 +1,202 @@
+"""
+src/llm/dramaturg.py — LLM-based thematic beat classification (OpenRouter)
+
+Responsibility:
+  - Receive a list of TrailerBeat objects (with dialogue lines attached)
+  - Send a single structured prompt to the LLM
+  - Parse the JSON response to assign BeatType to each beat
+
+IMPORTANT: This module does ZERO visual analysis.
+           It classifies narrative dramaturgy from dialogue text only.
+           Visual matching is handled exclusively by the CV engine.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import replace
+from typing import Sequence
+
+from src.core.config import AppConfig
+from src.core.models import BeatType, TrailerBeat
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Prompt builder
+# ---------------------------------------------------------------------------
+
+_SYSTEM_PROMPT = """You are a film trailer editor and narrative analyst.
+Your task is to classify each beat of a trailer into one of these dramatic roles:
+  HOOK        - Opening attention grabber (first impression, shocking image, logo)
+  SETUP       - World/character introduction
+  CONFLICT    - Inciting incident, rising tension, threat revealed
+  CLIMAX      - Peak action/emotion, highest stakes
+  RESOLUTION  - Cool-down, tagline, final title card
+
+You will receive a JSON array of beats with their index and dialogue text.
+Respond ONLY with a valid JSON array, one object per beat, with keys:
+  "beat_id" (int) and "beat_type" (one of the strings above).
+Do NOT include any explanation or markdown fences."""  # sent as the "system" message by _call_llm()
+
+_USER_TEMPLATE = """Classify the following {n} trailer beats:
+
+{beats_json}"""  # classify_beats() fills in n and beats_json
+
+
+def _build_beats_payload(beats: Sequence[TrailerBeat]) -> str:
+    """Serialise id, rounded duration and joined dialogue of each beat as JSON."""
+    def row(beat: TrailerBeat) -> dict:
+        spoken = " / ".join(line.text for line in beat.dialogue)
+        return {
+            "beat_id":  beat.beat_id,
+            "duration": round(beat.duration_s, 2),
+            "dialogue": spoken or "(no dialogue)",
+        }
+    return json.dumps([row(b) for b in beats], ensure_ascii=False, indent=2)
+
+
+# ---------------------------------------------------------------------------
+# OpenRouter / OpenAI-compatible HTTP client
+# ---------------------------------------------------------------------------
+
+def _call_llm(prompt_user: str, cfg: AppConfig) -> str:
+    """
+    Send a chat completion request to the configured LLM provider.
+
+    Supports: openrouter, openai, ollama (all use the OpenAI-compatible API).
+    The Authorization header is only sent when an API key is configured.
+
+    Returns:
+        The raw text content of the first assistant message.
+
+    Raises:
+        RuntimeError: Missing API key, HTTP error status, unreachable
+            endpoint / timeout, or a reply without choices[0].message.content.
+    """
+    import urllib.request
+    import urllib.error
+
+    llm = cfg.llm
+    if llm.provider in ("openrouter", "openai") and not llm.api_key:
+        raise RuntimeError(
+            f"LLM provider is '{llm.provider}' but no API key found. "
+            "Set OPENROUTER_API_KEY (or OPENAI_API_KEY) in your .env file."
+        )
+
+    headers = {"Content-Type": "application/json"}
+    if llm.api_key:  # keyless providers must not receive "Bearer None"
+        headers["Authorization"] = f"Bearer {llm.api_key}"
+    if llm.provider == "openrouter":
+        headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
+        headers["X-Title"]      = "AI Trailer Generator v2"
+
+    body = json.dumps({
+        "model": llm.model,
+        "messages": [
+            {"role": "system", "content": _SYSTEM_PROMPT},
+            {"role": "user",   "content": prompt_user},
+        ],
+        "temperature": llm.temperature,
+        "max_tokens":  llm.max_tokens,
+    }).encode("utf-8")
+    url = f"{llm.base_url.rstrip('/')}/chat/completions"
+    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
+
+    try:
+        with urllib.request.urlopen(req, timeout=llm.timeout_seconds) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+        return data["choices"][0]["message"]["content"]
+    except urllib.error.HTTPError as exc:  # must precede URLError (its parent)
+        body_text = exc.read().decode(errors="replace")
+        raise RuntimeError(f"LLM HTTP {exc.code} from {url}:\n{body_text}") from exc
+    except (urllib.error.URLError, TimeoutError) as exc:
+        raise RuntimeError(f"LLM request to {url} failed: {exc}") from exc
+    except (KeyError, IndexError, TypeError, ValueError) as exc:
+        raise RuntimeError(f"LLM reply from {url} is malformed: {exc}") from exc
+
+
+# ---------------------------------------------------------------------------
+# Response parser
+# ---------------------------------------------------------------------------
+
+_BEAT_TYPE_MAP: dict[str, BeatType] = {bt.name: bt for bt in BeatType}
+
+
+def _parse_response(raw: str, beats: Sequence[TrailerBeat]) -> dict[int, BeatType]:
+    """
+    Parse the LLM JSON array response into a beat_id → BeatType mapping.
+
+    Every requested beat starts as BeatType.UNKNOWN. A malformed entry is
+    skipped on its own (never raising); an unusable reply leaves all UNKNOWN.
+    """
+    clean = raw.strip()  # accidental markdown fences are removed next
+    if clean.startswith("```"):
+        clean = "\n".join(clean.split("\n")[1:])
+    if clean.endswith("```"):
+        clean = clean[: clean.rfind("```")]
+    result: dict[int, BeatType] = {b.beat_id: BeatType.UNKNOWN for b in beats}
+    try:
+        parsed = json.loads(clean.strip())
+        if not isinstance(parsed, list):
+            raise ValueError("Expected JSON array at top level.")
+    except ValueError as exc:  # JSONDecodeError is a ValueError subclass
+        logger.warning("LLM response parse error (%s) — all beats → UNKNOWN.\nRaw: %s", exc, raw[:300])
+        return result
+
+    for item in parsed:
+        try:
+            bid = int(item["beat_id"])
+            name = str(item.get("beat_type", "UNKNOWN")).upper()
+        except (AttributeError, KeyError, TypeError, ValueError) as exc:
+            logger.warning("Skipping malformed LLM beat entry %r (%s).", item, exc)
+            continue
+        result[bid] = _BEAT_TYPE_MAP.get(name, BeatType.UNKNOWN)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def classify_beats(
+    beats: Sequence[TrailerBeat],
+    cfg: AppConfig,
+) -> list[TrailerBeat]:
+    """
+    Use the LLM to assign a BeatType to each TrailerBeat.
+
+    Args:
+        beats: TrailerBeat list (dialogue should be populated for best results).
+        cfg:   Application configuration (llm section + api key).
+
+    Returns:
+        New list of TrailerBeat objects with beat_type set.
+        If the request or the parsing of its reply fails, the beats are
+        returned unchanged in a new list (no exception raised).
+    """
+    if not beats:
+        return list(beats)
+
+    logger.info(
+        "Classifying %d beats via %s / %s …",
+        len(beats), cfg.llm.provider, cfg.llm.model,
+    )
+
+    payload = _build_beats_payload(beats)
+    prompt  = _USER_TEMPLATE.format(n=len(beats), beats_json=payload)
+
+    # Parsing sits inside the guard too: an oddly shaped reply must degrade to
+    # "unclassified" exactly like a transport failure, not crash the pipeline.
+    try:
+        type_map = _parse_response(_call_llm(prompt, cfg), beats)
+    except Exception as exc:
+        logger.error("LLM classification failed: %s — keeping BeatType.UNKNOWN.", exc)
+        return list(beats)
+
+    enriched = [replace(b, beat_type=type_map.get(b.beat_id, BeatType.UNKNOWN)) for b in beats]
+    classified = sum(1 for b in enriched if b.beat_type != BeatType.UNKNOWN)
+    logger.info("Beat classification done: %d / %d classified.", classified, len(beats))
+    return enriched
diff --git a/src/llm/vision_cache.py b/src/llm/vision_cache.py
new file mode 100644
index 0000000..0e9c7e1
--- /dev/null
+++ b/src/llm/vision_cache.py
@@ -0,0 +1,316 @@
+"""
+Cached vision descriptions for ambiguous trailer/source matching.
+
+This module is deliberately conservative: it never writes a final match and it
+does not replace CV. It describes a small number of 3-frame beat/scene samples,
+caches those descriptions, and returns extra source in-point seeds for the CV
+scanner to verify.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+import logging
+import re
+import urllib.error
+import urllib.request
+from dataclasses import asdict
+from pathlib import Path
+from typing import Sequence
+
+import cv2
+
+from src.core.config import AppConfig
+from src.core.models import Scene, TrailerBeat
+
+logger = logging.getLogger(__name__)
+
+_CACHE_VERSION = 1  # _load_cache() discards cache files carrying any other "version"
+_STOPWORDS = {  # tokens that _terms() drops before descriptions are compared
+    "the", "and", "with", "from", "that", "this", "there", "their", "into",
+    "scene", "frame", "image", "shot", "video", "visible", "looks", "appears",
+    "eine", "einer", "einem", "einen", "und", "oder", "mit", "der", "die", "das",
+}
+
+_SYSTEM_PROMPT = """You describe film shots for automatic matching.
+Return only compact JSON with these keys:
+subject, setting, composition, action_phase, distinctive_objects, lighting_color, negatives.
+Focus on stable visual facts and spatial layout. Ignore timecode overlays, subtitles, logos, compression, aspect ratio, and color grading differences."""
+
+
+def _cache_path(cfg: AppConfig) -> Path:
+    return cfg.paths.cache_dir.joinpath("vision_descriptions.json")  # one JSON file for all descriptions
+
+
+def _load_cache(cfg: AppConfig) -> dict:
+    """Load the description cache; any unusable file yields a fresh empty one."""
+    path, data = _cache_path(cfg), None
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        pass  # first run: nothing cached yet, not worth a warning
+    except (OSError, ValueError):  # unreadable, wrong encoding or invalid JSON
+        logger.warning("Vision cache is unreadable; rebuilding: %s", path)
+    usable = isinstance(data, dict) and isinstance(data.get("items"), dict)
+    usable = usable and data.get("version") == _CACHE_VERSION
+    return data if usable else {"version": _CACHE_VERSION, "items": {}}
+
+
+def _save_cache(cfg: AppConfig, cache: dict) -> None:
+    destination = _cache_path(cfg)
+    destination.parent.mkdir(exist_ok=True, parents=True)  # first run: no cache dir yet
+    destination.write_text(json.dumps(cache, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def _sample_times(start_s: float, end_s: float) -> list[float]:
+    """Pick three timestamps (near start, middle, near end) inside the span."""
+    span_s = max(0.04, end_s - start_s)  # spans shorter than 0.04 s count as 0.04 s
+    lead = min(span_s * 0.12, max(0.0, span_s - 0.04))
+    trail = max(0.0, span_s - min(span_s * 0.12, 0.20))
+
+    return [start_s + lead, start_s + span_s * 0.50, start_s + trail]
+
+
+def _frame_data_url(video_path: Path, t_s: float) -> str | None:
+    """Grab the frame at *t_s* seconds as a JPEG data: URL (None on any failure)."""
+    capture = cv2.VideoCapture(str(video_path))
+    try:
+        if not capture.isOpened():
+            return None
+        capture.set(cv2.CAP_PROP_POS_MSEC, 1000.0 * max(0.0, t_s))
+        grabbed, image = capture.read()
+        if not grabbed or image is None:
+            return None
+        height, width = image.shape[:2]
+        if width > 640:  # shrink to 640 px wide, keeping the aspect ratio
+            image = cv2.resize(image, (640, int(height * (640 / width))), interpolation=cv2.INTER_AREA)
+        encoded_ok, jpeg = cv2.imencode(".jpg", image, [int(cv2.IMWRITE_JPEG_QUALITY), 72])
+        if not encoded_ok:
+            return None
+        return "data:image/jpeg;base64," + base64.b64encode(jpeg.tobytes()).decode("ascii")
+    finally:
+        capture.release()
+
+
+def _call_vision_model(label: str, image_urls: list[str], cfg: AppConfig) -> str:
+    """Describe one multi-frame sample; every failure surfaces as RuntimeError."""
+    vision = cfg.vision
+    if vision.provider in ("openai", "openrouter") and not vision.api_key:
+        raise RuntimeError(
+            "Vision is enabled but no API key is available. Set VISION_API_KEY, "
+            "OPENROUTER_API_KEY, OPENAI_API_KEY, or LLM_API_KEY."
+        )
+
+    instruction = (
+        f"Describe this 3-frame sample for matching. Label: {label}. "
+        "The frames are start, middle, and end of the same beat/scene."
+    )
+    content: list[dict] = [{"type": "text", "text": instruction}]
+    content.extend(
+        {"type": "image_url", "image_url": {"url": url, "detail": "low"}}
+        for url in image_urls
+    )
+
+    headers = {"Content-Type": "application/json"}
+    if vision.api_key:  # keyless providers must not receive "Bearer None"
+        headers["Authorization"] = f"Bearer {vision.api_key}"
+    if vision.provider == "openrouter":
+        headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
+        headers["X-Title"] = "AI Trailer Generator v2"
+    body = json.dumps({
+        "model": vision.model,
+        "messages": [
+            {"role": "system", "content": _SYSTEM_PROMPT},
+            {"role": "user", "content": content},
+        ],
+        "temperature": vision.temperature,
+        "max_tokens": vision.max_tokens,
+    }).encode("utf-8")
+    url = f"{vision.base_url.rstrip('/')}/chat/completions"
+    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
+    try:
+        with urllib.request.urlopen(req, timeout=vision.timeout_seconds) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+        return str(data["choices"][0]["message"]["content"]).strip()
+    except urllib.error.HTTPError as exc:  # must precede URLError (its parent)
+        body_text = exc.read().decode(errors="replace")
+        raise RuntimeError(f"Vision HTTP {exc.code} from {url}:\n{body_text}") from exc
+    except (urllib.error.URLError, TimeoutError) as exc:
+        raise RuntimeError(f"Vision request to {url} failed: {exc}") from exc
+    except (KeyError, IndexError, TypeError, ValueError) as exc:
+        raise RuntimeError(f"Vision reply from {url} is malformed: {exc}") from exc
+
+
+def _description_key(kind: str, item_id: int, start_s: float, end_s: float, cfg: AppConfig) -> str:
+    """Compose the cache key: sample identity, time span, model and file mtime."""
+    paths = cfg.paths
+    video = paths.source_movie if kind != "beat" else paths.reference_trailer
+    try:
+        stamp = int(video.stat().st_mtime)
+    except OSError:
+        stamp = 0  # file missing or unreadable: fall back to a constant stamp
+    span = f"{start_s:.3f}:{end_s:.3f}"
+    model = f"{cfg.vision.provider}:{cfg.vision.model}"
+    return f"{kind}:{item_id}:{span}:{model}:{stamp}"
+
+
+def _describe_sample(
+    *,
+    kind: str,
+    item_id: int,
+    label: str,
+    video_path: Path,
+    start_s: float,
+    end_s: float,
+    cfg: AppConfig,
+    cache: dict,
+    budget: list[int],
+) -> str | None:
+    """
+    Return the cached or freshly requested description of one sample, else None.
+
+    A miss queries the model only while budget[0] is positive and at least two
+    sample frames could be read; the result is cached and budget[0] decremented.
+    """
+    cache_key = _description_key(kind, item_id, start_s, end_s, cfg)
+    hit = cache["items"].get(cache_key)
+    if hit:
+        return str(hit.get("description", ""))
+    if budget[0] <= 0:
+        return None
+
+    frames = [_frame_data_url(video_path, t) for t in _sample_times(start_s, end_s)]
+    image_urls = [url for url in frames if url is not None]
+    if len(image_urls) < 2:
+        return None
+
+    description = _call_vision_model(label, image_urls, cfg)
+    cache["items"][cache_key] = dict(
+        kind=kind, item_id=item_id, start_s=start_s, end_s=end_s,
+        label=label, description=description,
+    )
+    budget[0] -= 1
+    return description
+
+
+def _terms(text: str) -> set[str]:
+    tokens = re.findall(r"[a-zA-Z][a-zA-Z0-9_'-]{2,}", text.lower())  # 3+ chars, letter first
+    return set(tokens) - _STOPWORDS
+
+
+def _text_similarity(a: str, b: str) -> float:
+    """Score shared terms over the smaller term set, with a floor of 8 terms."""
+    terms_a, terms_b = _terms(a), _terms(b)
+    if not (terms_a and terms_b):
+        return 0.0
+    shared = terms_a & terms_b
+    return float(len(shared) / max(8, min(len(terms_a), len(terms_b))))
+
+
+def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
+    """Spread up to *max_points* seeds evenly from scene start to 0.2 s before its end."""
+    first = scene.start_s
+    last = max(first, scene.end_s - 0.2)
+    if max_points <= 1 or scene.duration_s <= 0 or last <= first:
+        return [first]
+    stride = (last - first) / max(1, max_points - 1)
+    return [first + stride * k for k in range(max_points)]
+
+
+def build_vision_seed_in_points(
+    beats: Sequence[TrailerBeat],
+    scenes: Sequence[Scene],
+    cfg: AppConfig,
+) -> dict[int, list[tuple[float, float]]]:
+    """
+    Return extra in-point seeds from cached vision descriptions.
+
+    The function is intentionally small-budget: for each beat it describes the
+    beat once and only a few top scene-level candidates. Existing descriptions
+    are read from cache and cost nothing. The cache is saved even when a
+    vision call raises, so descriptions already obtained are not lost.
+
+    Returns:
+        beat_id → sorted (in_point_s, seed_score) pairs; unmatched beats are omitted.
+    """
+    if not cfg.vision.enabled:
+        return {}
+    if not beats or not scenes:
+        return {}
+
+    from src.cv.vibe_check import run_vibe_check
+
+    cache = _load_cache(cfg)
+    budget = [cfg.vision.max_new_descriptions_per_run]
+    scenes_by_id = {scene.scene_id: scene for scene in scenes}
+    seeds: dict[int, list[tuple[float, float]]] = {}
+
+    try:
+        for beat in beats:
+            beat_desc = _describe_sample(
+                kind="beat",
+                item_id=beat.beat_id,
+                label=f"trailer beat {beat.beat_id}",
+                video_path=beat.trailer_path,
+                start_s=beat.start_s,
+                end_s=beat.end_s,
+                cfg=cfg,
+                cache=cache,
+                budget=budget,
+            )
+            if not beat_desc:
+                continue
+
+            hits = run_vibe_check(
+                beat,
+                scenes,
+                top_k=cfg.vision.scene_candidate_top_k,
+                hist_method=cfg.cv.vibe_check.hist_compare_method,
+                phash_max_distance=64,
+            )
+
+            ranked: list[tuple[float, Scene]] = []
+            for hit in hits:
+                scene = scenes_by_id.get(hit.scene_id)
+                if scene is None:
+                    continue
+                scene_desc = _describe_sample(
+                    kind="scene",
+                    item_id=scene.scene_id,
+                    label=f"source scene {scene.scene_id}",
+                    video_path=scene.source_path,
+                    start_s=scene.start_s,
+                    end_s=scene.end_s,
+                    cfg=cfg,
+                    cache=cache,
+                    budget=budget,
+                )
+                if not scene_desc:
+                    continue
+                score = _text_similarity(beat_desc, scene_desc)
+                if score >= cfg.vision.similarity_threshold:
+                    ranked.append((score, scene))
+
+            ranked.sort(key=lambda item: item[0], reverse=True)
+            # Keyed by rounded in-point so overlapping seeds keep the best score.
+            merged: dict[float, float] = {}
+            for score, scene in ranked[:cfg.vision.max_seed_scenes]:
+                logger.info(
+                    "Beat %d: vision seed scene=%d score=%.3f",
+                    beat.beat_id, scene.scene_id, score,
+                )
+                weighted_score = max(
+                    cfg.cv.deep_scan.coarse_candidate_threshold,
+                    min(0.98, cfg.vision.seed_score * (0.75 + min(1.0, score) * 0.25)),
+                )
+                for point in _scene_seed_points(scene, cfg.vision.seed_points_per_scene):
+                    key = round(max(0.0, point), 3)
+                    merged[key] = max(weighted_score, merged.get(key, 0.0))
+
+            if merged:
+                seeds[beat.beat_id] = sorted(merged.items())
+    finally:
+        # Persist on every exit path: each new description costs an API call.
+        _save_cache(cfg, cache)
+    return seeds
diff --git a/src/pipeline/__init__.py b/src/pipeline/__init__.py
new file mode 100644
index 0000000..53af47b
--- /dev/null
+++ b/src/pipeline/__init__.py
@@ -0,0 +1,3 @@
+"""
+src/pipeline/__init__.py — Orchestration layer
+"""
diff --git a/src/pipeline/matcher.py b/src/pipeline/matcher.py
new file mode 100644
index 0000000..431c10a
--- /dev/null
+++ b/src/pipeline/matcher.py
@@ -0,0 +1,291 @@
+"""
+src/pipeline/matcher.py — Top-level CV matching orchestrator
+
+This is the single entry point for the full 2-phase CV pipeline:
+
+  Phase 0: Load / build scene index (PySceneDetect + fingerprinting)
+  Phase 1: Vibe Check — histogram + pHash filter → Top-K candidates per beat
+  Phase 2: Deep Scan — template matching → frame-accurate MatchResult per beat
+
+Usage:
+    from src.core.config import load_config
+    from src.pipeline.matcher import run_matching
+
+    cfg     = load_config()
+    beats   = [...]          # list[TrailerBeat] from trailer analysis
+    results = run_matching(cfg, beats)
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Sequence
+
+from src.core.config import AppConfig
+from src.core.models import MatchResult, Scene, TrailerBeat
+
+logger = logging.getLogger(__name__)
+SeedPoint = float | tuple[float, float]  # a bare time in seconds, or a (time, score) pair
+
+
+def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
+    """Return evenly spaced seed times from the scene start to 0.2 s before its end."""
+    first = scene.start_s
+    span = 0.0 if scene.duration_s <= 0 else max(first, scene.end_s - 0.2) - first
+    if max_points <= 1 or span <= 0:  # nothing to spread: seed the scene start only
+        return [first]
+    stride = span / max(1, max_points - 1)
+    return [first + stride * idx for idx in range(max_points)]
+
+
+def _build_scene_seed_in_points(
+    beats: Sequence[TrailerBeat],
+    scenes: Sequence[Scene],
+    cfg: AppConfig,
+) -> dict[int, list[float]]:
+    """Map each beat id to sorted, de-duplicated seed times sampled from its vibe-check hits."""
+    from src.cv.vibe_check import run_vibe_check
+
+    deep_cfg = cfg.cv.deep_scan
+    scene_lookup = {scene.scene_id: scene for scene in scenes}
+    seeds: dict[int, list[float]] = {}
+    for beat in beats:
+        hits = run_vibe_check(
+            beat,
+            scenes,
+            top_k=deep_cfg.scene_seed_top_k,
+            hist_method=cfg.cv.vibe_check.hist_compare_method,
+            phash_max_distance=64,
+        )
+        hit_scenes = [scene_lookup[hit.scene_id] for hit in hits if hit.scene_id in scene_lookup]
+        unique_times = {
+            round(max(0.0, t_sec), 3)  # clamp at 0, millisecond resolution
+            for scene in hit_scenes
+            for t_sec in _scene_seed_points(scene, deep_cfg.scene_seed_points_per_scene)
+        }
+        if unique_times:
+            seeds[beat.beat_id] = sorted(unique_times)
+            logger.info(
+                "Beat %d: added %d scene-level seed candidates from %d source scenes.",
+                beat.beat_id, len(seeds[beat.beat_id]), len(hits),
+            )
+    return seeds
+
+
+def _merge_seed_in_points(
+    *seed_maps: dict[int, Sequence[SeedPoint]] | None,
+) -> dict[int, list[SeedPoint]]:
+    """
+    Merge per-beat seed maps into one map with sorted, de-duplicated times.
+
+    A point is a bare time or a (time, score) pair; a pair may be a tuple or a
+    list (e.g. after a JSON round trip). Times are clamped at 0 and rounded to
+    3 decimals. For a repeated time the highest score given is kept, and an
+    unscored duplicate never erases a score. Empty or None maps are skipped.
+    """
+    merged: dict[int, dict[float, float | None]] = {}
+    for seed_map in filter(None, seed_maps):
+        for beat_id, points in seed_map.items():
+            beat_points = merged.setdefault(beat_id, {})
+            for point in points:
+                scored = isinstance(point, (tuple, list))
+                t_sec = round(max(0.0, float(point[0] if scored else point)), 3)
+                new_score = float(point[1]) if scored else None
+                known = [s for s in (beat_points.get(t_sec), new_score) if s is not None]
+                beat_points[t_sec] = max(known, default=None)
+
+    return {
+        beat_id: [
+            t_sec if score is None else (t_sec, score)
+            for t_sec, score in sorted(points.items())
+        ]
+        for beat_id, points in merged.items()
+    }
+
+
+# ---------------------------------------------------------------------------
+# Beat fingerprinting
+# ---------------------------------------------------------------------------
+
+def fingerprint_beats(
+    beats: Sequence[TrailerBeat],
+    cfg: AppConfig,
+) -> list[TrailerBeat]:
+    """
+    Enrich every TrailerBeat with its visual fingerprint (histogram + pHash).
+
+    Grabs the frame at each beat's midpoint from ``beat.trailer_path`` and
+    fingerprints it with the ``cfg.cv.vibe_check`` settings.
+
+    Args:
+        beats: TrailerBeat list (fingerprints will be None initially).
+        cfg:   Application configuration.
+
+    Returns:
+        New list in input order; beats that cannot be decoded pass through unchanged.
+    """
+    from dataclasses import replace
+    from src.cv.fingerprinting import fingerprint_frame
+    from src.cv.frame_extractor import grab_frame_at_path
+
+    vc_cfg = cfg.cv.vibe_check
+    enriched: list[TrailerBeat] = []
+
+    for beat in beats:
+        frame = grab_frame_at_path(beat.trailer_path, beat.midpoint_s)
+        if frame is None:
+            logger.warning("Beat %d: cannot decode midpoint frame, leaving unfingerpinted.", beat.beat_id)
+            enriched.append(beat)
+            continue
+        luma_b, sat_b, phash = fingerprint_frame(frame, vc_cfg)
+        enriched.append(replace(beat, luma_hist=luma_b, sat_hist=sat_b, phash=phash))
+
+    done = sum(1 for b in enriched if b.phash is not None)  # a falsy-but-valid hash must still count
+    logger.info("Fingerprinted %d / %d beats.", done, len(beats))
+    return enriched
+
+
+# ---------------------------------------------------------------------------
+# Main pipeline entry point
+# ---------------------------------------------------------------------------
+
+def run_matching(
+    cfg: AppConfig,
+    beats: Sequence[TrailerBeat],
+    force_reindex: bool = False,
+    seed_in_points: dict[int, Sequence[SeedPoint]] | None = None,
+) -> list[MatchResult]:
+    """
+    Run the CV matching pipeline: scene index, beat fingerprints, seeded global scan.
+
+    Args:
+        cfg:            Application configuration.
+        beats:          All trailer beats to source (must have trailer_path set).
+        force_reindex:  Forwarded to build_scene_index to request a rebuild of the index.
+        seed_in_points: Optional caller-supplied seeds per beat id (bare times or
+                        (time, score) pairs), merged with the generated seeds.
+
+    Returns:
+        The MatchResult list returned by run_global_scan (presumably matched
+        beats only; the final log line reports results vs. beats).
+    """
+    from src.cv.scene_indexer import build_scene_index
+
+    logger.info("=" * 60)
+    logger.info("AI Trailer Generator v2 — CV Matching Pipeline")
+    logger.info("Source : %s", cfg.paths.source_movie.name)
+    logger.info("Trailer: %s", cfg.paths.reference_trailer.name)
+    logger.info("Beats  : %d", len(beats))
+    logger.info("=" * 60)
+
+    # ------------------------------------------------------------------
+    # Phase 0: Scene index
+    # ------------------------------------------------------------------
+    logger.info("[Phase 0] Building scene index …")
+    scenes: list[Scene] = build_scene_index(cfg, force_reindex=force_reindex)
+    logger.info("[Phase 0] %d scenes indexed.", len(scenes))
+
+    # ------------------------------------------------------------------
+    # Phase 0b: Fingerprint the beats
+    # ------------------------------------------------------------------
+    logger.info("[Phase 0b] Fingerprinting %d trailer beats …", len(beats))
+    beats = fingerprint_beats(beats, cfg)
+
+    # ------------------------------------------------------------------
+    # Phase 1 & 2: Global scan, seeded by caller, scene-level and vision hints
+    # ------------------------------------------------------------------
+    logger.info("[Phase 1 & 2] Running FFmpeg Global Scan for %d beats ...", len(beats))
+    from src.cv.global_scan import run_global_scan
+    scene_seed_in_points = _build_scene_seed_in_points(beats, scenes, cfg)
+    vision_seed_in_points: dict[int, list[tuple[float, float]]] = {}
+    if cfg.vision.enabled:
+        try:
+            from src.llm.vision_cache import build_vision_seed_in_points
+            vision_seed_in_points = build_vision_seed_in_points(beats, scenes, cfg)
+        except Exception as exc:
+            # Best effort: vision hints are optional, but keep the traceback in the log.
+            logger.exception("Vision seeding failed: %s — continuing with CV-only seeds.", exc)
+    results = run_global_scan(
+        beats,
+        cfg,
+        scenes=scenes,
+        seed_in_points=_merge_seed_in_points(seed_in_points, scene_seed_in_points, vision_seed_in_points),
+    )
+
+    logger.info("[Phase 1 & 2] Done. %d / %d beats matched.", len(results), len(beats))
+    logger.info("=" * 60)
+
+    return results
+
+
+# ---------------------------------------------------------------------------
+# Convenience: build an EditTimeline from match results
+# ---------------------------------------------------------------------------
+
+def build_timeline(
+    beats: Sequence[TrailerBeat],
+    results: Sequence[MatchResult],
+    cfg: AppConfig,
+) -> "src.core.models.EditTimeline":  # type: ignore[name-defined]
+    """
+    Assemble beats and their match results into an EditTimeline.
+
+    The timeline cursor advances by each beat's own duration, so a beat
+    without a match leaves a gap of that length instead of a clip.
+
+    Args:
+        beats:   Trailer beats, in timeline order.
+        results: MatchResult objects, looked up by beat_id.
+        cfg:     Supplies the timeline title and frame rate.
+
+    Returns:
+        EditTimeline holding one EditClip per matched beat.
+    """
+    from src.core.models import EditClip, EditTimeline
+
+    match_for: dict[int, MatchResult] = {}
+    for result in results:
+        match_for[result.beat_id] = result
+
+    clips: list[EditClip] = []
+    position = 0.0
+    for beat in beats:
+        beat_start, position = position, position + beat.duration_s
+        match = match_for.get(beat.beat_id)
+        if match is None:
+            logger.warning("Beat %d has no match — gap in timeline.", beat.beat_id)
+            continue
+
+        # Source material covers at most the beat; a zero-length match is
+        # treated as covering the whole beat.
+        available = max(0.0, match.duration_s)
+        covered = min(beat.duration_s, available) if available > 0 else beat.duration_s
+        tail = max(0.0, beat.duration_s - covered)
+        if tail > 0:
+            logger.warning(
+                "Beat %d uses %.2fs source + %.2fs generated trailer tail.",
+                beat.beat_id, covered, tail,
+            )
+        clips.append(
+            EditClip(
+                clip_index=len(clips),
+                beat=beat,
+                match=match,
+                timeline_start_s=beat_start,
+                timeline_end_s=position,
+                source_duration_s=covered,
+                trailer_tail_s=tail,
+            )
+        )
+
+    timeline = EditTimeline(
+        title=cfg.paths.reference_trailer.stem,
+        frame_rate=cfg.export.edl_frame_rate,
+        clips=tuple(clips),
+    )
+
+    logger.info(
+        "Timeline built: %d clips, total duration %.2fs",
+        timeline.clip_count, timeline.total_duration_s,
+    )
+    return timeline
diff --git a/src/pipeline/reporter.py b/src/pipeline/reporter.py
new file mode 100644
index 0000000..a84610d
--- /dev/null
+++ b/src/pipeline/reporter.py
@@ -0,0 +1,427 @@
+"""
+src/pipeline/reporter.py — Visual Match Report Generator
+
+Generates an HTML file containing side-by-side video clips of:
+  Left:  The original beat from the reference trailer
+  Right: The matched scene from the source movie
+
+This allows instant visual verification of the CV pipeline's results.
+"""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+from pathlib import Path
+
+from src.core.config import AppConfig
+
+logger = logging.getLogger(__name__)
+
+
+def _extract_clip(video_path: Path, start_s: float, duration_s: float, out_path: Path) -> None:
+    """
+    Cut a silent, 640-px-wide H.264 preview of ``duration_s`` seconds starting at ``start_s``.
+    ffmpeg failures are logged, not raised.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Two-stage seek: a fast input seek lands up to 2 s before the target, then
+    # an output seek decodes forward to the exact position. Seeking only before
+    # "-i" can stop on a nearby keyframe and shift the preview by a few frames.
+    coarse_seek_s = max(0.0, start_s - (2.0 if start_s >= 2.0 else 0.0))
+    fine_seek_s = start_s - coarse_seek_s
+
+    seek_args = ["-ss", str(coarse_seek_s), "-i", str(video_path), "-ss", str(fine_seek_s)]
+    encode_args = [
+        "-t", str(duration_s),
+        "-map", "0:v:0",
+        "-c:v", "libx264",
+        "-preset", "ultrafast",
+        "-crf", "28",
+        "-vf", "scale=640:-2",   # 640 px wide, height follows the aspect ratio
+        "-an",                   # drop audio
+        "-movflags", "+faststart",
+    ]
+    argv = ["ffmpeg", "-y", "-loglevel", "error", *seek_args, *encode_args, str(out_path)]
+
+    proc = subprocess.run(argv, capture_output=True)
+    if proc.returncode == 0:
+        return
+    logger.error(
+        "ffmpeg clip extraction failed for %s:\n%s",
+        out_path.name, proc.stderr.decode(errors="replace")
+    )
+
+
+def _extract_clip_with_black_tail(
+    video_path: Path,
+    start_s: float,
+    source_duration_s: float,
+    total_duration_s: float,
+    out_path: Path,
+) -> None:
+    """
+    Extract a source preview and append black frames for trailer-only tails.
+
+    When the tail (total minus source duration) is at most 0.02 s the work is
+    delegated to _extract_clip(). Otherwise the source part and a black tail
+    are rendered to temporary files beside ``out_path`` and concatenated.
+    ffmpeg failures are logged, not raised, and abort the remaining steps; the
+    temporary files are removed on every exit path.
+
+    Args:
+        video_path:        Video the source part is cut from.
+        start_s:           Start of the source part, in seconds.
+        source_duration_s: Length of the source part, in seconds.
+        total_duration_s:  Length of the finished preview, in seconds.
+        out_path:          Destination file for the finished preview.
+    """
+    tail_s = max(0.0, total_duration_s - source_duration_s)
+    if tail_s <= 0.02:
+        _extract_clip(video_path, start_s, source_duration_s, out_path)
+        return
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    source_tmp = out_path.with_name(f"{out_path.stem}_source_tmp.mp4")
+    tail_tmp = out_path.with_name(f"{out_path.stem}_tail_tmp.mp4")
+    preroll_s = 2.0 if start_s >= 2.0 else 0.0
+    input_seek_s = max(0.0, start_s - preroll_s)
+    accurate_seek_s = start_s - input_seek_s
+
+    ffmpeg = ["ffmpeg", "-y", "-loglevel", "error"]
+    encode = ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28", "-an", "-movflags", "+faststart"]
+    # Each step is (label for the error log, ffmpeg argv); they run in order.
+    steps = [
+        (
+            # Same two-stage accurate seek as _extract_clip(); the output is
+            # forced to 640x360 @ 25 fps, the size and rate of the black tail.
+            "source preview extraction",
+            [
+                *ffmpeg,
+                "-ss", str(input_seek_s),
+                "-i", str(video_path),
+                "-ss", str(accurate_seek_s),
+                "-t", str(source_duration_s),
+                "-map", "0:v:0",
+                "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
+                *encode,
+                str(source_tmp),
+            ],
+        ),
+        (
+            "black tail render",
+            [
+                *ffmpeg,
+                "-f", "lavfi",
+                "-i", f"color=c=black:s=640x360:r=25:d={tail_s}",
+                *encode,
+                str(tail_tmp),
+            ],
+        ),
+        (
+            "tailed preview concat",
+            [
+                *ffmpeg,
+                "-i", str(source_tmp),
+                "-i", str(tail_tmp),
+                "-filter_complex", "[0:v][1:v]concat=n=2:v=1:a=0[v]",
+                "-map", "[v]",
+                *encode,
+                str(out_path),
+            ],
+        ),
+    ]
+    try:
+        for label, cmd in steps:
+            result = subprocess.run(cmd, capture_output=True)
+            if result.returncode != 0:
+                logger.error(
+                    "ffmpeg %s failed for %s:\n%s",
+                    label, out_path.name, result.stderr.decode(errors="replace"),
+                )
+                return
+    finally:
+        # Reached on success, on the early return above and on exceptions.
+        for tmp in (source_tmp, tail_tmp):
+            try:
+                tmp.unlink(missing_ok=True)
+            except OSError:
+                logger.debug("Could not remove temporary file %s", tmp)
+
+
+def _extract_segmented_clip(
+    video_path: Path,
+    segments: list,
+    total_duration_s: float,
+    out_path: Path,
+) -> None:
+    """
+    Render a beat-length source preview from multiple matched source islands.
+
+    Segments are placed by ``trailer_offset_s``; the gaps before, between and
+    after them are filled with black. A source segment that fails to render is
+    replaced by black of the same length so later segments keep their offset,
+    and an empty segment list yields an all-black preview. Parts are rendered
+    to temporary files beside ``out_path`` and concatenated; ffmpeg failures
+    are logged, not raised.
+
+    Args:
+        video_path:       Video the source segments are cut from.
+        segments:         Objects exposing trailer_offset_s, in_point_s and
+                          duration_s, all in seconds.
+        total_duration_s: Length of the finished preview, in seconds.
+        out_path:         Destination file for the finished preview.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_paths: list[Path] = []
+    encode = ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28", "-an", "-movflags", "+faststart"]
+
+    def render(suffix: str, label: str, args: list[str]) -> bool:
+        """Render one numbered temporary part; keep it and return True on success."""
+        tmp = out_path.with_name(f"{out_path.stem}_part_{len(tmp_paths):03d}_{suffix}.mp4")
+        cmd = ["ffmpeg", "-y", "-loglevel", "error", *args, *encode, str(tmp)]
+        result = subprocess.run(cmd, capture_output=True)
+        if result.returncode == 0 and tmp.exists():
+            tmp_paths.append(tmp)
+            return True
+        logger.error("ffmpeg %s segment render failed:\n%s", label, result.stderr.decode(errors="replace"))
+        return False
+
+    def add_black(duration_s: float) -> None:
+        if duration_s > 0.02:
+            render("black", "black", ["-f", "lavfi", "-i", f"color=c=black:s=640x360:r=25:d={duration_s}"])
+
+    def add_source(start_s: float, duration_s: float) -> None:
+        if duration_s <= 0.02:
+            return
+        preroll_s = 2.0 if start_s >= 2.0 else 0.0
+        input_seek_s = max(0.0, start_s - preroll_s)
+        args = [
+            "-ss", str(input_seek_s),
+            "-i", str(video_path),
+            "-ss", str(start_s - input_seek_s),
+            "-t", str(duration_s),
+            "-map", "0:v:0",
+            "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
+        ]
+        if not render("src", "source", args):
+            # Keep the preview beat-length: stand in black for the lost segment.
+            add_black(duration_s)
+
+    cursor = 0.0
+    for segment in sorted(segments or [], key=lambda s: s.trailer_offset_s):
+        offset_s = max(0.0, float(segment.trailer_offset_s))
+        duration_s = max(0.0, float(segment.duration_s))
+        add_black(offset_s - cursor)
+        add_source(float(segment.in_point_s), duration_s)
+        cursor = max(cursor, offset_s + duration_s)
+    add_black(total_duration_s - cursor)
+
+    # Nothing rendered (e.g. every ffmpeg call failed): there is nothing to concatenate.
+    if not tmp_paths:
+        logger.error("No preview parts could be rendered for %s.", out_path.name)
+        return
+    if len(tmp_paths) == 1:
+        tmp_paths[0].replace(out_path)
+        return
+
+    inputs = [arg for tmp in tmp_paths for arg in ("-i", str(tmp))]
+    labels = "".join(f"[{idx}:v]" for idx in range(len(tmp_paths)))
+    cmd = [
+        "ffmpeg", "-y", "-loglevel", "error",
+        *inputs,
+        "-filter_complex", f"{labels}concat=n={len(tmp_paths)}:v=1:a=0[v]",
+        "-map", "[v]",
+        *encode,
+        str(out_path),
+    ]
+    result = subprocess.run(cmd, capture_output=True)
+    if result.returncode != 0:
+        logger.error("ffmpeg segmented preview concat failed:\n%s", result.stderr.decode(errors="replace"))
+
+    for tmp in tmp_paths:
+        try:
+            tmp.unlink(missing_ok=True)
+        except OSError:
+            logger.debug("Could not remove temporary file %s", tmp)
+
+
+def _build_frame_locked_compare(ref_path: Path, src_path: Path, out_path: Path) -> None:
+    """
+    Stack the reference clip (left) and the source clip (right) into one video.
+
+    Each input is brought to 25 fps and fitted into a padded 640x360 frame
+    before stacking. ffmpeg failures are logged, not raised.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # The same filter chain runs on both inputs, so hstack gets equal-sized frames.
+    per_input = ",".join([
+        "fps=25",
+        "scale=640:360:force_original_aspect_ratio=decrease",
+        "pad=640:360:(ow-iw)/2:(oh-ih)/2",
+        "setsar=1",
+        "setpts=PTS-STARTPTS",
+    ])
+    graph = f"[0:v]{per_input}[ref];[1:v]{per_input}[src];[ref][src]hstack=inputs=2[v]"
+
+    argv = ["ffmpeg", "-y", "-loglevel", "error", "-i", str(ref_path), "-i", str(src_path)]
+    argv += ["-filter_complex", graph, "-map", "[v]"]
+    argv += ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28"]
+    argv += ["-an", "-movflags", "+faststart", str(out_path)]
+
+    proc = subprocess.run(argv, capture_output=True)
+    if proc.returncode == 0:
+        return
+    logger.error(
+        "ffmpeg compare render failed for %s:\n%s",
+        out_path.name,
+        proc.stderr.decode(errors="replace"),
+    )
+
+
+def generate_report(beats: list, results: list, cfg: AppConfig) -> Path:
+    """
+    Write report/match_report.html under cfg.paths.output_dir, plus preview clips for every beat.
+    Matched beats get a side-by-side compare video, unmatched ones only the reference clip. Returns the .html path.
+    """
+    report_dir = cfg.paths.output_dir / "report"
+    report_dir.mkdir(parents=True, exist_ok=True)
+
+    html_path = report_dir / "match_report.html"
+    results_by_beat = {r.beat_id: r for r in results}  # beat_id -> result, for per-beat lookup
+
+    logger.info("Generating report clips in %s (this might take a moment) ...", report_dir)
+
+    html = [
+        "<!DOCTYPE html>",
+        "<html><head><meta charset='utf-8'><title>AI Trailer Match Report</title>",
+        "<style>",
+        "body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: #0f0f0f; color: #e0e0e0; margin: 40px; }",
+        "h1 { color: #fff; border-bottom: 1px solid #333; padding-bottom: 10px; }",
+        ".stats { font-size: 1.2em; margin-bottom: 30px; color: #aaa; }",
+        ".beat-row { display: flex; margin-bottom: 30px; background: #1a1a1a; padding: 20px; border-radius: 12px; border: 1px solid #333; }",
+        ".info { width: 250px; padding-right: 20px; flex-shrink: 0; }",
+        ".info h3 { margin-top: 0; color: #fff; }",
+        ".video-container { display: flex; gap: 20px; flex-grow: 1; }",
+        ".videos { flex-grow: 1; }",
+        ".compare { margin-bottom: 18px; }",
+        ".video-col { flex: 1; }",
+        ".video-col p { margin-top: 0; font-weight: bold; color: #888; }",
+        "video { width: 100%; border-radius: 6px; box-shadow: 0 4px 6px rgba(0,0,0,0.5); background: #000; }",
+        ".status-match { color: #4ade80; font-weight: bold; font-size: 1.1em; }",
+        ".status-miss { color: #f87171; font-weight: bold; font-size: 1.1em; }",
+        ".score { font-family: monospace; font-size: 1.1em; color: #60a5fa; }",
+        ".code-hint { background: #000; padding: 10px; border-radius: 4px; font-family: monospace; font-size: 0.9em; margin-top: 15px; color: #a3e635; }",
+        "</style></head><body>",
+        f"<h1>AI Trailer Generator — Match Report</h1>",
+        f"<div class='stats'>Total Beats: {len(beats)} | Matched: {len(results)}</div>",
+        "<script>",  # NOTE(review): syncBeat needs two <video> tags per row; the rows built below hold at most one
+        "function syncBeat(row) {",
+        "  const vids = row.querySelectorAll('video');",
+        "  if (vids.length < 2) return;",
+        "  const ref = vids[0];",
+        "  const src = vids[1];",
+        "  let syncing = false;",
+        "  function align() {",
+        "    if (syncing) return;",
+        "    syncing = true;",
+        "    const target = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02));",
+        "    if (Math.abs(src.currentTime - target) > 0.035) src.currentTime = target;",
+        "    if (ref.paused && !src.paused) src.pause();",
+        "    if (!ref.paused && src.paused) src.play().catch(() => {});",
+        "    syncing = false;",
+        "  }",
+        "  ref.addEventListener('play', () => { src.currentTime = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02)); src.play().catch(() => {}); });",
+        "  ref.addEventListener('pause', () => src.pause());",
+        "  ref.addEventListener('seeked', () => { src.currentTime = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02)); });",
+        "  ref.addEventListener('timeupdate', align);",
+        "}",
+        "document.addEventListener('DOMContentLoaded', () => document.querySelectorAll('.beat-row').forEach(syncBeat));",
+        "</script>"
+    ]
+
+    for beat in beats:
+        res = results_by_beat.get(beat.beat_id)  # None when the beat has no match
+
+        # Reference clip: this beat's span of the trailer (shown on its own when unmatched)
+        ref_mp4 = report_dir / f"beat_{beat.beat_id:03d}_ref.mp4"
+        _extract_clip(beat.trailer_path, beat.start_s, beat.duration_s, ref_mp4)
+
+        html.append("<div class='beat-row'>")
+
+        # Info panel: beat metadata and match statistics
+        html.append("<div class='info'>")
+        html.append(f"<h3>Beat {beat.beat_id:03d}</h3>")
+        html.append(f"<p><b>Type:</b> {beat.beat_type.name}</p>")
+        html.append(f"<p><b>Trailer:</b> {beat.start_s:.2f}s &rarr; {beat.end_s:.2f}s</p>")
+
+        if res:
+            segments = list(getattr(res, "segments", ()) or [])  # [] when the attribute is missing or empty
+            source_duration = sum(max(0.0, float(s.duration_s)) for s in segments)
+            if not segments:
+                source_duration = max(0.0, res.out_point_s - res.in_point_s)  # unsegmented: in/out span
+            preview_duration = min(beat.duration_s, source_duration) if source_duration > 0 else beat.duration_s
+            last_segment_end = max(
+                (float(s.trailer_offset_s) + float(s.duration_s) for s in segments),
+                default=preview_duration,
+            )
+            trailer_tail_s = max(0.0, beat.duration_s - last_segment_end)  # beat time after the last matched part
+            if getattr(res, "is_confirmed", True):  # results without the flag count as confirmed
+                html.append("<p class='status-match'>MATCHED</p>")
+            else:
+                html.append("<p style='color: #fbbf24; font-weight: bold; font-size: 1.1em;'>PROVISIONAL MATCH</p>")
+            html.append(f"<p><b>Scene ID:</b> {res.scene_id}</p>")
+            html.append(f"<p><b>Movie In:</b> {res.in_point_s:.2f}s</p>")
+            html.append(f"<p><b>Source Dur:</b> {source_duration:.2f}s</p>")
+            if len(segments) > 1:
+                html.append(f"<p><b>Segments:</b> {len(segments)} matched visual islands</p>")
+            if trailer_tail_s > 0:
+                html.append(f"<p><b>Unmatched Tail:</b> {trailer_tail_s:.2f}s placeholder</p>")
+            html.append(f"<p><b>Score:</b> <span class='score'>{res.match_score:.3f}</span></p>")
+            if trailer_tail_s > 0:
+                html.append("<p style='color: #fbbf24; font-size: 0.9em;'>Some trailer frames are still unmatched; report fills only those gaps with placeholder black.</p>")
+
+            # Flag scores below 0.80 for manual verification
+            if res.match_score < 0.80:
+                html.append("<p style='color: #fbbf24; font-size: 0.9em;'>⚠️ Score below 0.80. Verify visually.</p>")
+
+            # Source preview (black where unmatched), then the side-by-side compare video
+            src_mp4 = report_dir / f"beat_{beat.beat_id:03d}_src.mp4"
+            compare_mp4 = report_dir / f"beat_{beat.beat_id:03d}_compare.mp4"
+            if segments:
+                _extract_segmented_clip(res.source_path, segments, beat.duration_s, src_mp4)
+            else:
+                _extract_clip_with_black_tail(
+                    res.source_path,
+                    res.in_point_s,
+                    preview_duration,
+                    beat.duration_s,
+                    src_mp4,
+                )
+            _build_frame_locked_compare(ref_mp4, src_mp4, compare_mp4)
+        else:
+            html.append("<p class='status-miss'>NO MATCH</p>")
+            src_mp4 = None
+            compare_mp4 = None
+
+        html.append(f"<div class='code-hint'>python cli.py rematch --beat {beat.beat_id}</div>")
+        html.append("</div>") # /info
+
+        # Video panel: compare video for a match, otherwise the reference clip plus a placeholder
+        html.append("<div class='videos'>")
+        if compare_mp4:
+            html.append(f"<div class='compare'><p>Frame-Locked Compare</p><video src='{compare_mp4.name}' controls loop muted autoplay></video></div>")
+        else:
+            html.append("<div class='video-container'>")
+            html.append(f"<div class='video-col'><p>Reference Trailer</p><video src='{ref_mp4.name}' controls loop muted autoplay></video></div>")
+            html.append("<div class='video-col'><p>Matched Source</p><div style='width: 100%; aspect-ratio: 16/9; background: #222; display: flex; align-items: center; justify-content: center; border-radius: 6px; color: #555;'>No Match</div></div>")
+            html.append("</div>") # /video-container
+        html.append("</div>") # /videos
+        html.append("</div>") # /beat-row
+
+    html.append("</body></html>")
+
+    html_path.write_text("\n".join(html), encoding="utf-8")
+    return html_path
diff --git a/src/pipeline/trailer_analyzer.py b/src/pipeline/trailer_analyzer.py
new file mode 100644
index 0000000..a2e16b9
--- /dev/null
+++ b/src/pipeline/trailer_analyzer.py
@@ -0,0 +1,175 @@
+"""
+src/pipeline/trailer_analyzer.py — Reference trailer → list[TrailerBeat]
+
+Responsibility:
+  1. Run PySceneDetect on the REFERENCE TRAILER (not the source movie)
+     to detect cut boundaries → raw beat intervals
+  2. Fingerprint the midpoint frame of each beat (for Vibe Check)
+  3. Transcribe dialogue per beat via Whisper (optional, injected)
+  4. Optionally classify BeatType via the LLM dramaturg (injected)
+
+Returns: list[TrailerBeat] ready to feed into run_matching().
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import replace
+from pathlib import Path
+from typing import Callable, Sequence
+
+from src.core.config import AppConfig
+from src.core.models import BeatType, DialogueLine, TrailerBeat
+from src.cv.fingerprinting import fingerprint_frame
+from src.cv.frame_extractor import grab_midpoint_frame, open_video
+
+logger = logging.getLogger(__name__)
+
+# Injection type aliases — keeps this module free of hard audio/LLM imports
+TranscribeCallback = Callable[[Path, float, float, float], list[DialogueLine]]
+ClassifyCallback   = Callable[[list[TrailerBeat]], list[TrailerBeat]]
+
+
+# ---------------------------------------------------------------------------
+# Step 1: Scene detection on the reference trailer
+# ---------------------------------------------------------------------------
+
+def _detect_trailer_beats(cfg: AppConfig) -> list[tuple[float, float, int, int]]:
+    """
+    Detect cut boundaries in the reference trailer with PySceneDetect.
+
+    Returns:
+        One ``(start_s, end_s, start_frame, end_frame)`` tuple per detected beat.
+
+    Raises:
+        ImportError: scenedetect is not installed (original error is chained).
+    """
+    try:
+        from scenedetect import open_video as sd_open_video, SceneManager
+        from scenedetect.detectors import ContentDetector
+    except ImportError as exc:
+        raise ImportError("pip install scenedetect[opencv]") from exc
+
+    trailer_path = cfg.paths.reference_trailer
+    video = sd_open_video(str(trailer_path))
+    # Convert the configured minimum duration (seconds) into a frame count.
+    min_len_frames = int(cfg.scene_detection.min_scene_duration_s * video.frame_rate)
+    manager = SceneManager()
+    manager.add_detector(
+        ContentDetector(
+            threshold=cfg.scene_detection.content_threshold,
+            min_scene_len=min_len_frames,
+        )
+    )
+
+    logger.info("Detecting beats in reference trailer: %s …", trailer_path.name)
+    manager.detect_scenes(video=video, show_progress=False)
+
+    cuts = manager.get_scene_list()
+    result = [(s.get_seconds(), e.get_seconds(), s.get_frames(), e.get_frames()) for s, e in cuts]
+    logger.info("Detected %d beats in reference trailer.", len(result))
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Step 2: Fingerprint beats
+# ---------------------------------------------------------------------------
+
+def _fingerprint_beats(
+    raw_beats: list[tuple[float, float, int, int]],
+    cfg: AppConfig,
+) -> list[TrailerBeat]:
+    """
+    Build one TrailerBeat per raw interval, fingerprinting its midpoint frame.
+
+    A beat whose midpoint frame cannot be decoded is still emitted (after a
+    warning), just without luma_hist / sat_hist / phash arguments.
+    """
+    vibe_cfg = cfg.cv.vibe_check
+    source = cfg.paths.reference_trailer
+    out: list[TrailerBeat] = []
+
+    with open_video(source) as cap:
+        for beat_no, (t0, t1, f0, f1) in enumerate(raw_beats):
+            mid_frame = grab_midpoint_frame(cap, t0, t1)
+            # Fingerprint keywords are passed only when a frame was decoded;
+            # otherwise TrailerBeat is constructed without them.
+            fp_fields: dict[str, object] = {}
+            if mid_frame is None:
+                logger.warning("Beat %d: midpoint frame decode failed.", beat_no)
+            else:
+                luma, sat, phash = fingerprint_frame(mid_frame, vibe_cfg)
+                fp_fields = {"luma_hist": luma, "sat_hist": sat, "phash": phash}
+
+            out.append(TrailerBeat(
+                beat_id=beat_no,
+                trailer_path=source,
+                start_s=t0, end_s=t1,
+                start_frame=f0, end_frame=f1,
+                **fp_fields,
+            ))
+
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def analyze_reference_trailer(
+    cfg: AppConfig,
+    transcribe_callback: TranscribeCallback | None = None,
+    classify_callback:   ClassifyCallback   | None = None,
+) -> list[TrailerBeat]:
+    """
+    Run the reference-trailer pipeline: detect, fingerprint, enrich.
+
+    Both callbacks are best-effort: an exception raised by either one is
+    logged as a warning and the affected beat(s) are kept unchanged.
+
+    Args:
+        cfg:                   Application configuration.
+        transcribe_callback:   Optional fn(path, start_s, end_s, offset_s)
+                               → list[DialogueLine]; called once per beat
+                               with the beat's start_s as offset_s.
+        classify_callback:     Optional fn(beats) → beats; called once
+                               with the full beat list.
+
+    Returns:
+        The TrailerBeat list after every enabled step has been applied.
+    """
+    # Steps 1 + 2 — cut detection, then midpoint fingerprints
+    beats = _fingerprint_beats(_detect_trailer_beats(cfg), cfg)
+
+    # Step 3 — dialogue (optional)
+    if transcribe_callback is not None:
+        with_dialogue: list[TrailerBeat] = []
+        for current in beats:
+            updated = current  # fallback when transcription raises
+            try:
+                spoken = transcribe_callback(
+                    current.trailer_path,
+                    current.start_s,
+                    current.end_s,
+                    current.start_s,   # time_offset so timestamps are absolute
+                )
+                updated = replace(current, dialogue=tuple(spoken))
+            except Exception as exc:
+                logger.warning("Beat %d transcription failed: %s", current.beat_id, exc)
+            with_dialogue.append(updated)
+        beats = with_dialogue
+
+    # Step 4 — LLM dramaturgy (optional)
+    if classify_callback is not None:
+        try:
+            beats = classify_callback(beats)
+        except Exception as exc:
+            logger.warning("Beat classification failed: %s — keeping UNKNOWN.", exc)
+
+    n_dialogue = sum(1 for b in beats if b.dialogue)
+    n_classified = sum(1 for b in beats if b.beat_type != BeatType.UNKNOWN)
+    logger.info(
+        "Trailer analysis complete: %d beats, %d with dialogue, %d classified.",
+        len(beats), n_dialogue, n_classified,
+    )
+    return beats
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..65140f2
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+# tests package
diff --git a/tests/test_config.py b/tests/test_config.py
new file mode 100644
index 0000000..f0b728b
--- /dev/null
+++ b/tests/test_config.py
@@ -0,0 +1,144 @@
+"""
+tests/test_config.py — Smoke tests for config loading and model integrity.
+
+Run with:  pytest tests/test_config.py -v
+"""
+
+from pathlib import Path
+import pytest
+
+from src.core.config import load_config, AppConfig
+from src.core.models import (
+    Scene, TrailerBeat, MatchResult, VibeHit,
+    EditClip, EditTimeline, BeatType, DialogueLine,
+)
+
+
+CONFIG_PATH = Path(__file__).parents[1] / "config.toml"  # config.toml in the directory above tests/
+
+
+# ---------------------------------------------------------------------------
+# Config loader
+# ---------------------------------------------------------------------------
+
+class TestConfigLoader:
+    def test_loads_without_error(self) -> None:
+        # Loading the repository config must yield an AppConfig instance.
+        assert isinstance(load_config(CONFIG_PATH), AppConfig)
+
+    def test_project_meta(self) -> None:
+        config = load_config(CONFIG_PATH)
+        assert config.version == "2.0.0"
+        assert config.log_level in ("DEBUG", "INFO", "WARNING", "ERROR")
+
+    def test_cv_thresholds_in_range(self) -> None:
+        deep_scan = load_config(CONFIG_PATH).cv.deep_scan
+        # Threshold lies strictly between 0 and 1; the step must be positive.
+        assert 0.0 < deep_scan.match_threshold < 1.0
+        assert deep_scan.coarse_step_seconds > 0
+
+    def test_vibe_check_crop_fractions(self) -> None:
+        vibe = load_config(CONFIG_PATH).cv.vibe_check
+        top, bottom = vibe.crop_top_fraction, vibe.crop_bottom_fraction
+        assert 0.0 < top < 1.0
+        assert 0.0 < bottom < 1.0
+        assert top + bottom < 1.0
+
+    def test_missing_config_raises(self, tmp_path: Path) -> None:
+        with pytest.raises(FileNotFoundError):
+            load_config(tmp_path / "nonexistent.toml")
+
+    def test_paths_are_path_objects(self) -> None:
+        paths = load_config(CONFIG_PATH).paths
+        assert isinstance(paths.source_movie, Path)
+        assert isinstance(paths.reference_trailer, Path)
+
+
+# ---------------------------------------------------------------------------
+# Data models — construction & properties
+# ---------------------------------------------------------------------------
+
+class TestSceneModel:
+    def test_duration(self) -> None:
+        # 10.0 s → 25.5 s: 15.5 s long, centred on 17.75 s.
+        scene = Scene(
+            scene_id=0, source_path=Path("dummy.mp4"),
+            start_s=10.0, end_s=25.5,
+            start_frame=240, end_frame=612,
+        )
+        assert scene.duration_s == pytest.approx(15.5)
+        assert scene.midpoint_s == pytest.approx(17.75)
+
+    def test_immutable(self) -> None:
+        scene = Scene(
+            scene_id=0,
+            source_path=Path("x.mp4"),
+            start_s=0.0,
+            end_s=1.0,
+            start_frame=0, end_frame=24,
+        )
+        with pytest.raises(Exception):  # FrozenInstanceError
+            scene.scene_id = 99  # type: ignore[misc]
+
+
+class TestTrailerBeatModel:
+    """Checks on TrailerBeat construction defaults."""
+    def test_beat_type_default(self) -> None:
+        # beat_type is deliberately not passed, so the model default applies.
+        required = dict(beat_id=0, trailer_path=Path("trailer.mp4"),
+                        start_s=0.0, end_s=3.0, start_frame=0, end_frame=72)
+        beat = TrailerBeat(**required)
+        assert beat.beat_type == BeatType.UNKNOWN
+
+
+class TestMatchResultModel:
+    def test_duration_computed(self) -> None:
+        result = MatchResult(
+            beat_id=0, scene_id=3,
+            source_path=Path("movie.mp4"),
+            in_point_s=120.0, out_point_s=123.5,
+            in_point_frame=2880, match_score=0.87,
+        )
+        assert result.duration_s == pytest.approx(3.5)
+
+    def test_repr_contains_key_info(self) -> None:
+        result = MatchResult(
+            beat_id=1,
+            scene_id=7,
+            source_path=Path("movie.mp4"),
+            in_point_s=60.0, out_point_s=63.0,
+            in_point_frame=1440,
+            match_score=0.91,
+        )
+        text = repr(result)
+        for fragment in ("beat=1", "scene=7"):
+            assert fragment in text
+
+
+class TestEditTimeline:
+    def _make_clip(self, idx: int, t_start: float, t_end: float) -> EditClip:
+        """Build an EditClip placed at [t_start, t_end] on the timeline."""
+        length_s = t_end - t_start
+        trailer_beat = TrailerBeat(
+            beat_id=idx, trailer_path=Path("t.mp4"),
+            start_s=t_start, end_s=t_end, start_frame=0, end_frame=1,
+        )
+        source_match = MatchResult(
+            beat_id=idx, scene_id=0, source_path=Path("m.mp4"),
+            in_point_s=0.0, out_point_s=length_s,
+            in_point_frame=0, match_score=0.9,
+        )
+        return EditClip(
+            clip_index=idx, beat=trailer_beat, match=source_match,
+            timeline_start_s=t_start, timeline_end_s=t_end,
+        )
+
+    def test_total_duration(self) -> None:
+        # Two back-to-back clips: 0–5 s and 5–9 s.
+        pair = tuple(self._make_clip(i, a, b) for i, (a, b) in enumerate([(0.0, 5.0), (5.0, 9.0)]))
+        timeline = EditTimeline(title="Test Trailer", frame_rate=23.976, clips=pair)
+        assert timeline.total_duration_s == pytest.approx(9.0)
+        assert timeline.clip_count == 2
+
+    def test_empty_timeline(self) -> None:
+        assert EditTimeline(title="Empty", frame_rate=24.0, clips=()).total_duration_s == 0.0
diff --git a/tests/test_deep_scan.py b/tests/test_deep_scan.py
new file mode 100644
index 0000000..c220ad3
--- /dev/null
+++ b/tests/test_deep_scan.py
@@ -0,0 +1,140 @@
+"""
+tests/test_deep_scan.py — Unit tests for frame_extractor and deep_scan
+
+Uses small synthetic videos written to a temp file with cv2.VideoWriter, so no
+real video files are required. Tests cover the pure logic, not hardware decoding.
+"""
+
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+
+import cv2
+import numpy as np
+import pytest
+
+from src.cv.frame_extractor import (
+    get_video_info,
+    grab_frame_at,
+    iter_frames_stepped,
+    open_video,
+)
+from src.cv.fingerprinting import text_safe_crop
+
+
+# ---------------------------------------------------------------------------
+# Helpers: build a tiny synthetic video on disk
+# ---------------------------------------------------------------------------
+
+FPS    = 24   # frame rate passed to cv2.VideoWriter for the synthetic clip
+WIDTH  = 320  # synthetic frame width
+HEIGHT = 240  # synthetic frame height
+SECS   = 3    # clip length: FPS * SECS frames are written
+
+
+def _make_synthetic_video(path: Path, color_bgr: tuple[int, int, int] = (0, 128, 255)) -> Path:
+    """Write FPS * SECS single-colour frames to *path* and return it; asserts the writer opened."""
+    writer = cv2.VideoWriter(str(path), cv2.VideoWriter_fourcc(*"mp4v"), float(FPS), (WIDTH, HEIGHT))
+    assert writer.isOpened(), f"cv2.VideoWriter could not open {path} (mp4v codec unavailable?)"
+    frame = np.full((HEIGHT, WIDTH, 3), color_bgr, dtype=np.uint8)
+    for _ in range(FPS * SECS):
+        writer.write(frame)
+    writer.release()
+    return path
+
+
+@pytest.fixture
+def synthetic_video(tmp_path: Path) -> Path:  # default-colour clip in the per-test temp dir
+    return _make_synthetic_video(path=tmp_path.joinpath("test.mp4"))
+
+
+# ---------------------------------------------------------------------------
+# open_video
+# ---------------------------------------------------------------------------
+
+class TestOpenVideo:
+    def test_opens_valid_file(self, synthetic_video: Path) -> None:
+        with open_video(synthetic_video) as capture:
+            assert capture.isOpened()
+
+    def test_raises_on_missing_file(self, tmp_path: Path) -> None:
+        missing = tmp_path / "ghost.mp4"  # never created
+        with pytest.raises(FileNotFoundError), open_video(missing):
+            pass
+
+
+# ---------------------------------------------------------------------------
+# get_video_info
+# ---------------------------------------------------------------------------
+
+class TestGetVideoInfo:
+    def test_returns_correct_fps(self, synthetic_video: Path) -> None:
+        # rel=0.05 leaves room for small deviations in the reported rate.
+        assert get_video_info(synthetic_video)["fps"] == pytest.approx(FPS, rel=0.05)
+
+    def test_duration_approx(self, synthetic_video: Path) -> None:
+        reported = get_video_info(synthetic_video)["duration_s"]
+        assert reported == pytest.approx(SECS, rel=0.1)
+
+    def test_resolution(self, synthetic_video: Path) -> None:
+        meta = get_video_info(synthetic_video)
+        assert meta["width"] == WIDTH
+        assert meta["height"] == HEIGHT
+
+
+# ---------------------------------------------------------------------------
+# grab_frame_at
+# ---------------------------------------------------------------------------
+
+class TestGrabFrameAt:
+    def test_returns_ndarray(self, synthetic_video: Path) -> None:
+        with open_video(synthetic_video) as capture:
+            grabbed = grab_frame_at(capture, 1.0)
+        assert grabbed is not None
+        assert isinstance(grabbed, np.ndarray)
+        assert grabbed.shape == (HEIGHT, WIDTH, 3)
+
+    def test_returns_none_past_end(self, synthetic_video: Path) -> None:
+        # Seeking far beyond the clip must not raise. Depending on the codec
+        # the result is None or a repeated last frame, so both are accepted.
+        with open_video(synthetic_video) as capture:
+            grabbed = grab_frame_at(capture, 9999.0)
+        assert grabbed is None or isinstance(grabbed, np.ndarray)
+
+
+# ---------------------------------------------------------------------------
+# iter_frames_stepped
+# ---------------------------------------------------------------------------
+
+class TestIterFramesStepped:
+    def test_yields_correct_count(self, synthetic_video: Path) -> None:
+        # Stepping 0.5 s from 0.0 to 1.0 is expected to visit 0.0, 0.5 and 1.0.
+        with open_video(synthetic_video) as capture:
+            samples = list(iter_frames_stepped(capture, 0.0, 1.0, 0.5))
+        assert len(samples) == 3
+
+    def test_timestamps_increasing(self, synthetic_video: Path) -> None:
+        with open_video(synthetic_video) as capture:
+            samples = list(iter_frames_stepped(capture, 0.0, 2.0, 0.5))
+        stamps = [stamp for stamp, _frame in samples]
+        assert stamps == sorted(stamps)
+
+    def test_invalid_step_raises(self, synthetic_video: Path) -> None:
+        # A step of 0.0 must be rejected with a ValueError mentioning step_s.
+        with open_video(synthetic_video) as capture, pytest.raises(ValueError, match="step_s"):
+            list(iter_frames_stepped(capture, 0.0, 1.0, 0.0))
+
+
+# ---------------------------------------------------------------------------
+# text_safe_crop integration (sanity: cropped height consistent)
+# ---------------------------------------------------------------------------
+
+class TestCropSanity:
+    def test_crop_reduces_height(self, synthetic_video: Path) -> None:
+        with open_video(synthetic_video) as capture:
+            original = grab_frame_at(capture, 0.5)
+        assert original is not None
+        crop_h, crop_w = text_safe_crop(original, 0.15, 0.30).shape[:2]
+        assert crop_h < original.shape[0]  # rows were removed
+        assert crop_w == original.shape[1]  # width unchanged
diff --git a/tests/test_export.py b/tests/test_export.py
new file mode 100644
index 0000000..bd24791
--- /dev/null
+++ b/tests/test_export.py
@@ -0,0 +1,218 @@
+"""
+tests/test_export.py — Unit tests for timecode conversion and export writers
+
+Tests use synthetic EditTimeline objects (no real video files needed).
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from src.export.timecode import (
+    seconds_to_fcpxml,
+    seconds_to_smpte,
+    fcpxml_frame_duration,
+    fcpxml_format_name,
+    seconds_to_frame_count,
+)
+
+
+# ---------------------------------------------------------------------------
+# Timecode helpers
+# ---------------------------------------------------------------------------
+
+class TestSecondsToFcpxml:
+    def test_zero(self) -> None:
+        assert seconds_to_fcpxml(0.0, 24.0) == "0s"
+
+    def test_one_second_at_24fps(self) -> None:
+        # 1.0 s @ 24 fps → 24 frames → 24/24 s, reduced to 1/1 s
+        encoded = seconds_to_fcpxml(1.0, 24.0)
+        assert encoded == "1/1s"
+
+    def test_one_second_at_23976(self) -> None:
+        # The exact fraction for 23.976 fps is not pinned here; only check
+        # that the output is shaped like a rational "<num>/<den>s" string.
+        encoded = seconds_to_fcpxml(1.0, 23.976)
+        assert encoded.endswith("s")
+        assert "/" in encoded
+
+    def test_ten_seconds_at_25fps(self) -> None:
+        # 10 s @ 25 fps → 250 frames → 250/25 s, reduced to 10/1 s
+        assert seconds_to_fcpxml(10.0, 25.0) == "10/1s"
+
+    def test_rational_is_reduced(self) -> None:
+        from math import gcd
+
+        # Numerator and denominator must be coprime (never e.g. 24/24s).
+        numerator, denominator = seconds_to_fcpxml(1.0, 24.0).rstrip("s").split("/")
+        assert gcd(int(numerator), int(denominator)) == 1
+
+
+class TestSecondsToSmpte:
+    def test_zero(self) -> None:
+        assert seconds_to_smpte(0.0, 24.0) == "00:00:00:00"
+
+    def test_one_minute(self) -> None:
+        assert seconds_to_smpte(60.0, 25.0) == "00:01:00:00"
+
+    def test_one_hour(self) -> None:
+        assert seconds_to_smpte(3600.0, 24.0) == "01:00:00:00"
+
+    def test_frames_overflow(self) -> None:
+        # At 25 fps, 26 frames roll over to one second plus one frame.
+        timecode = seconds_to_smpte(26 / 25, 25.0)
+        assert timecode == "00:00:01:01"
+
+    def test_format_length(self) -> None:
+        # Expect four colon-separated fields of two characters each.
+        fields = seconds_to_smpte(123.456, 23.976).split(":")
+        assert len(fields) == 4
+        assert all(len(field) == 2 for field in fields)
+
+
+class TestFcpxmlHelpers:
+    def test_frame_duration_24fps(self) -> None:
+        # One frame at 24 fps lasts 1/24 s.
+        assert fcpxml_frame_duration(24.0) == "1/24s"
+
+    def test_frame_duration_23976(self) -> None:
+        # 23.976 fps is expected as the rational 1001/24000 s.
+        frame_dur = fcpxml_frame_duration(23.976)
+        assert frame_dur == "1001/24000s"
+
+    def test_format_name_1080p_2398(self) -> None:
+        label = fcpxml_format_name(23.976, 1920, 1080)
+        for token in ("1080", "2398"):
+            assert token in label
+
+    def test_frame_count_roundtrip(self) -> None:
+        # 10 s at 25 fps is exactly 250 frames.
+        rate, length_s = 25.0, 10.0
+        assert seconds_to_frame_count(length_s, rate) == 250
+
+
+# ---------------------------------------------------------------------------
+# EDL writer (string output)
+# ---------------------------------------------------------------------------
+
+class TestEdlWriter:
+    def _make_timeline(self) -> "src.core.models.EditTimeline":  # type: ignore
+        """Return a one-clip 25 fps timeline titled "TestTrailer"."""
+        from src.core.models import (
+            BeatType, EditClip, EditTimeline, MatchResult, TrailerBeat,
+        )
+
+        hook_beat = TrailerBeat(
+            beat_id=0,
+            trailer_path=Path("trailer.mp4"),
+            start_s=0.0, end_s=5.0,
+            start_frame=0, end_frame=120,
+            beat_type=BeatType.HOOK,
+        )
+        movie_match = MatchResult(
+            beat_id=0,
+            scene_id=3,
+            source_path=Path("movie.mp4"),
+            in_point_s=30.0, out_point_s=35.0,
+            in_point_frame=720, match_score=0.88,
+        )
+        only_clip = EditClip(
+            clip_index=0, beat=hook_beat, match=movie_match,
+            timeline_start_s=0.0, timeline_end_s=5.0,
+        )
+        return EditTimeline(
+            title="TestTrailer", frame_rate=25.0, clips=(only_clip,),
+        )
+
+    def test_edl_contains_title(self, tmp_path: Path) -> None:
+        from src.core.config import load_config
+        from src.export.edl_writer import write_edl
+
+        # The written EDL must carry the timeline title in a TITLE: line.
+        config = load_config()
+        written = write_edl(self._make_timeline(), config, output_path=tmp_path / "test.edl")
+        assert "TITLE: TestTrailer" in written.read_text(encoding="utf-8")
+
+    def test_edl_has_event_line(self, tmp_path: Path) -> None:
+        from src.core.config import load_config
+        from src.export.edl_writer import write_edl
+
+        config = load_config()
+        written = write_edl(self._make_timeline(), config, output_path=tmp_path / "test.edl")
+        edl_text = written.read_text(encoding="utf-8")
+        assert "001" in edl_text   # event number
+        assert "AX" in edl_text    # reel name
+
+
+# ---------------------------------------------------------------------------
+# FCPXML writer (XML structure)
+# ---------------------------------------------------------------------------
+
+class TestFcpxmlWriter:
+    def _make_timeline(self) -> "src.core.models.EditTimeline":  # type: ignore
+        """Return a one-clip 25 fps timeline whose source path is B:/Proxy/movie.mp4."""
+        from src.core.models import (
+            BeatType, EditClip, EditTimeline, MatchResult, TrailerBeat,
+        )
+
+        hook_beat = TrailerBeat(
+            beat_id=0,
+            trailer_path=Path("trailer.mp4"),
+            start_s=0.0,
+            end_s=5.0,
+            start_frame=0,
+            end_frame=120,
+            beat_type=BeatType.HOOK,
+        )
+        proxy_match = MatchResult(
+            beat_id=0,
+            scene_id=3,
+            source_path=Path("B:/Proxy/movie.mp4"),
+            in_point_s=30.0,
+            out_point_s=35.0,
+            in_point_frame=720,
+            match_score=0.88,
+        )
+        only_clip = EditClip(
+            clip_index=0,
+            beat=hook_beat, match=proxy_match,
+            timeline_start_s=0.0, timeline_end_s=5.0,
+        )
+        return EditTimeline(
+            title="TestTrailer", frame_rate=25.0, clips=(only_clip,),
+        )
+
+    def _write_and_parse(self, tmp_path: Path):
+        """Write the sample timeline as FCPXML and return the parsed root element."""
+        from xml.etree import ElementTree as ET
+        from src.core.config import load_config
+        from src.export.fcpxml_writer import write_fcpxml
+
+        config = load_config()
+        written = write_fcpxml(self._make_timeline(), config, output_path=tmp_path / "test.fcpxml")
+        raw_text = written.read_text(encoding="utf-8")
+
+        # ElementTree is handed the document minus any <!DOCTYPE ...> line.
+        kept_lines = [
+            line for line in raw_text.splitlines()
+            if not line.strip().startswith("<!DOCTYPE")
+        ]
+        return ET.fromstring("\n".join(kept_lines))
+
+    def test_fcpxml_is_valid_xml(self, tmp_path: Path) -> None:
+        root = self._write_and_parse(tmp_path)
+        # Compare the root tag without any "{namespace}" prefix.
+        bare_tag = root.tag.split("}")[-1] if "}" in root.tag else root.tag
+        assert bare_tag == "fcpxml"
+
+    def test_fcpxml_has_spine(self, tmp_path: Path) -> None:
+        root = self._write_and_parse(tmp_path)
+        # Bind a prefix to the FCPXML namespace so find() can resolve it.
+        namespaces = {"fcp": "http://www.apple.com/dt/FCPXML/1_10"}
+        spine = root.find(".//fcp:spine", namespaces)
+        assert spine is not None
+        clips = list(spine)
+        assert len(clips) == 1
diff --git a/tests/test_fingerprinting.py b/tests/test_fingerprinting.py
new file mode 100644
index 0000000..090241c
--- /dev/null
+++ b/tests/test_fingerprinting.py
@@ -0,0 +1,112 @@
+"""
+tests/test_fingerprinting.py — Unit tests for src/cv/fingerprinting.py
+
+Tests run WITHOUT requiring real video files.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from src.cv.fingerprinting import (
+    text_safe_crop,
+    extract_hs_histograms,
+    compare_histograms,
+    hist_to_bytes,
+    bytes_to_hist,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def solid_blue_frame() -> np.ndarray:
+    """Return a 256×256 uint8 BGR frame filled with pure blue."""
+    side = 256
+    blue_bgr = (255, 0, 0)  # BGR order: blue channel first
+    return np.full((side, side, 3), blue_bgr, dtype=np.uint8)
+
+
+@pytest.fixture
+def solid_red_frame() -> np.ndarray:
+    """Return a 256×256 uint8 BGR frame filled with pure red."""
+    side = 256
+    red_bgr = (0, 0, 255)  # BGR order: red channel last
+    return np.full((side, side, 3), red_bgr, dtype=np.uint8)
+
+
+# ---------------------------------------------------------------------------
+# text_safe_crop
+# ---------------------------------------------------------------------------
+
+class TestTextSafeCrop:
+    def test_removes_correct_rows(self, solid_blue_frame: np.ndarray) -> None:
+        top, bottom = 0.15, 0.30
+        kept = text_safe_crop(solid_blue_frame, crop_top=top, crop_bottom=bottom)
+        rows = solid_blue_frame.shape[0]  # 256
+        # Expected height: bottom cut row minus top cut row.
+        assert kept.shape[0] == int(rows * (1.0 - bottom)) - int(rows * top)
+
+    def test_zero_crop_returns_same_size(self, solid_blue_frame: np.ndarray) -> None:
+        assert text_safe_crop(solid_blue_frame, crop_top=0.0, crop_bottom=0.0).shape == solid_blue_frame.shape
+
+    def test_invalid_top_raises(self, solid_blue_frame: np.ndarray) -> None:
+        with pytest.raises(ValueError, match="crop_top"):  # a top fraction of 1.0 is rejected
+            text_safe_crop(solid_blue_frame, crop_top=1.0, crop_bottom=0.0)
+
+    def test_invalid_bottom_raises(self, solid_blue_frame: np.ndarray) -> None:
+        with pytest.raises(ValueError, match="crop_bottom"):  # negative fraction is rejected
+            text_safe_crop(solid_blue_frame, crop_top=0.0, crop_bottom=-0.1)
+
+    def test_overlapping_crops_raise(self, solid_blue_frame: np.ndarray) -> None:
+        with pytest.raises(ValueError, match="must be < 1.0"):  # 0.6 + 0.5 exceeds the frame
+            text_safe_crop(solid_blue_frame, crop_top=0.6, crop_bottom=0.5)
+
+
+# ---------------------------------------------------------------------------
+# Histograms
+# ---------------------------------------------------------------------------
+
+class TestHistograms:
+    """extract_hs_histograms / compare_histograms checks on solid-colour frames."""
+
+    def test_output_shape(self, solid_blue_frame: np.ndarray) -> None:
+        # Each histogram is 1-D with as many entries as the requested bins.
+        hue, sat = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60)
+        assert hue.shape == (50,)
+        assert sat.shape == (60,)
+
+    def test_normalised(self, solid_blue_frame: np.ndarray) -> None:
+        hue, sat = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60)
+        # L2-normalised → norm ≈ 1.0
+        assert np.linalg.norm(hue) == pytest.approx(1.0, abs=1e-5)
+        assert np.linalg.norm(sat) == pytest.approx(1.0, abs=1e-5)
+
+    def test_same_frame_correl_is_one(self, solid_blue_frame: np.ndarray) -> None:
+        import cv2
+        hue, _ = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60)
+        # A histogram correlated with itself must score 1.0.
+        assert compare_histograms(hue, hue, method=cv2.HISTCMP_CORREL) == pytest.approx(1.0, abs=1e-5)
+
+    def test_different_frames_correl_lower(
+        self, solid_blue_frame: np.ndarray, solid_red_frame: np.ndarray,
+    ) -> None:
+        import cv2
+        hue_blue, _ = extract_hs_histograms(solid_blue_frame, 50, 60)
+        hue_red, _ = extract_hs_histograms(solid_red_frame, 50, 60)
+        score = compare_histograms(hue_blue, hue_red, method=cv2.HISTCMP_CORREL)
+        assert score < 1.0
+
+
+# ---------------------------------------------------------------------------
+# Serialisation round-trip
+# ---------------------------------------------------------------------------
+
+class TestSerialisation:
+    def test_round_trip(self, solid_blue_frame: np.ndarray) -> None:
+        original, _unused = extract_hs_histograms(solid_blue_frame, 50, 60)
+        payload = hist_to_bytes(original)  # serialised form, decoded again below
+        np.testing.assert_array_almost_equal(original, bytes_to_hist(payload))