Initial project import

This commit is contained in:
Melbar
2026-05-02 09:07:41 +02:00
commit 8e1bcf142f
38 changed files with 7928 additions and 0 deletions
+15
View File
@@ -0,0 +1,15 @@
# =============================================================================
# AI Trailer Generator v2 — Environment Variables
# =============================================================================
# Copy this file to .env and fill in your actual keys.
# .env is listed in .gitignore and will NEVER be committed.
# =============================================================================
# OpenRouter API key (required when [llm] provider = "openrouter")
OPENROUTER_API_KEY=sk-or-v1-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# OpenAI API key (required when [llm] provider = "openai")
# OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Universal fallback (used if provider-specific key is not set)
# LLM_API_KEY=
+44
View File
@@ -0,0 +1,44 @@
# ---------------------------------------------------------------------------
# AI Trailer Generator v2 — .gitignore
# ---------------------------------------------------------------------------
# Python
__pycache__/
*.py[cod]
*.pyo
*.pyd
*.egg-info/
dist/
build/
*.whl
.venv/
venv/
.mypy_cache/
.ruff_cache/
.pytest_cache/
# Project-generated artefacts (potentially huge)
.cache/
output/
proxy/
*.mp4
*.mov
*.mxf
*.wav
*.mp3
*.jpg
*.jpeg
*.png
# IDE
.vscode/
.idea/
*.swp
# OS
.DS_Store
Thumbs.db
# Secrets / local overrides
.env
config.local.toml
+384
View File
@@ -0,0 +1,384 @@
# AI Trailer Generator v2
**Frame-accurate trailer reconstruction via pure Computer Vision**
> Man gibt einen Reference Trailer und den dazugehörigen Quellfilm hinein und bekommt eine fertige FCPXML/EDL heraus, die den Trailer framegenau aus dem Quellfilm nachbaut.
---
## Das Kernprinzip
Standardmäßig kein LLM für visuelles Matching. Optional kann ein Vision-Layer
gecachte 3-Frame-Beschreibungen als zusätzliche Suchanker liefern; der finale
Match bleibt aber CV-verifiziert.
| Phase | Was passiert | Technologie |
|-------|-------------|-------------|
| **0 — Prep** | Reference Trailer analysieren & Beats extrahieren | PySceneDetect + OpenCV |
| **1 — Global Scan**| Gesamten Quellfilm via FFmpeg-Stream (2 FPS) gegen alle Beats scannen | FFmpeg Pipe + Luma-Histogramm |
| **1b — Optional Vision Seeds** | Unsichere Top-K Szenen mit 3-Frame-Beschreibungen cachen | OpenAI-kompatibles Vision-LLM |
| **2 — Refine** | Beste Treffer auf Frame-Ebene präzisieren | OpenCV `matchTemplate` |
| **3 — Dramaturgie** | Narrative BeatType-Klassifikation aus Dialog-Text | OpenRouter LLM |
| **4 — Export** | Timeline → FCPXML 1.10 oder CMX 3600 EDL | xml.etree + eigener Timecode-Layer |
**Text-Safe Crop:** Obere 15% und untere 30% des Frames werden vor jedem Vergleich ausgeblendet, um Title Cards, Logos und Letterbox zu ignorieren.
---
## Voraussetzungen
- Python **3.11+**
- [ffmpeg](https://ffmpeg.org/download.html) im PATH (für den globalen FFmpeg-Scan, die Report-Previews und die Whisper-Audio-Extraktion)
- CUDA-fähige GPU empfohlen (für faster-whisper; CPU funktioniert auch)
---
## Setup
### 1. Virtual Environment erstellen & aktivieren
```powershell
# Im Projektordner
python -m venv .venv
.\.venv\Scripts\Activate.ps1
# Falls ExecutionPolicy blockiert:
# Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
```
### 2. Abhängigkeiten installieren
```powershell
pip install -r requirements.txt
```
### 3. API-Key konfigurieren
```powershell
# .env aus dem Template kopieren
Copy-Item .env.example .env
# Dann .env öffnen und den echten Key eintragen:
# OPENROUTER_API_KEY=sk-or-v1-...
```
### 4. Videodateien eintragen
`config.toml` öffnen und die Pfade anpassen:
```toml
[paths]
source_movie = "B:/Proxy/DeinFilm_FTR.mp4"
reference_trailer = "F:/Encodings/DeinFilm_Trailer.mp4"
```
---
## Verwendung
```powershell
# Vollständige Pipeline (analyze → match → report → export)
python cli.py run
# Ohne Whisper-Transkription (schneller)
python cli.py run --no-audio
# Ohne LLM-Klassifikation
python cli.py run --no-audio --no-llm
# Schrittweise
python cli.py analyze # Reference Trailer → Beats erkennen
python cli.py match # Globaler FFmpeg Scan (Szenen-unabhängig)
python cli.py report # HTML Report mit Video-Vergleich bauen
python cli.py export --format both # FCPXML + EDL ausgeben
# Gezielt nur einen Beat bearbeiten (empfohlen für erste Iterationen)
python cli.py match --beat 5
python cli.py match --beat 5 --vision # optionale gecachte Vision-Seeds
python cli.py report --beat 5
python cli.py export --beat 5 --format both
# Fehlerhafte Matches korrigieren
python cli.py rematch --beat 5 --threshold 0.50 # Schwelle anpassen (Globaler Scan wird für diesen Beat wiederholt)
python cli.py rematch --beat 5 --refine # Cached Match per lokalem Bildinhalt-Offset nachschärfen
```
Der HTML-Report regeneriert seine Preview-Clips bei jedem Lauf mit genauer
FFmpeg-Nachsuche und synchronisiert die beiden Video-Player pro Beat. Dadurch
ist der Report zur Frame-Prüfung geeignet und zeigt keine alten gecachten
Preview-Clips.
Source-Previews bekommen bei Trailer-only-Tails denselben schwarzen Tail wie der
Export, damit der Browser nicht einen zu kurzen Source-Clip gegen den längeren
Referenzbeat weiterspult oder loopt.
Zur Synchronprüfung rendert der Report ein einzelnes Frame-Locked-Compare-Video
mit Referenz und Source in demselben MP4-Stream. Dieses Compare-Video ist
maßgeblich, weil zwei getrennte Browser-Videoelemente nie zuverlässig
framegenau synchron bleiben.
Wenn ein Trailer-Beat am Ende eine Blende, Schwarzfläche oder Textkarte enthält,
die im Source-Film nicht als normaler Shot vorhanden ist, endet der Source-Match
am letzten stabil passenden Frame. Exportierte Timelines behalten trotzdem die
volle Beat-Länge und fügen danach automatisch einen schwarzen Trailer-Tail mit
Marker für Fade/Dissolve ein.
Gezielte Ein-Beat-Matches nutzen zusätzlich vorhandene automatische Nachbarbeats
aus dem Cache als zeitliche Suchanker. Das hilft bei aufeinanderfolgenden Shots,
ohne manuelle Szenen oder Timecodes zu kuratieren.
Bei `match --beat N` wird ein alter Cache-Treffer für genau diesen Beat entfernt
und nur ein neu gefundener automatischer Treffer wieder eingetragen. Ein
fehlgeschlagener neuer Lauf kann dadurch keinen alten falschen Report-Treffer
stehen lassen.
Der globale Bildvergleich arbeitet auf kontrast-normalisierten Luma- und
Kantenfeatures statt auf rohen Farb-Pixeln. Dadurch bleiben Schwarzweiß- oder
anders gegradete Trailerbilder mit dem Source-Material vergleichbar, während
unähnliche Farbshots schlechter ranken.
Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen
groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets
verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller
als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls.
Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese
Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst
den Inpoint bestimmt.
`rematch --refine` nutzt denselben lokalen FFmpeg/Pillow-Aligner und schreibt
den korrigierten Inpoint direkt zurück in `.cache/match_results.json`.
Zusätzlich werden aus den besten szenenweiten Luma/Histogramm-Kandidaten
mehrere Inpoint-Suchanker erzeugt. Diese Scene-Seeds verwenden keine harte
pHash-Sperre, weil pHash bei stark anders gegradeten Trailerbildern echte
Matches zu früh ausschließen kann.
Optional kann `python cli.py match --beat N --vision` einen Vision-Layer
zuschalten. Dann werden pro Trailer-Beat und pro wenigen Scene-Level-Kandidaten
je drei Frames (Anfang, Mitte, Ende) von einem visionfähigen OpenAI-kompatiblen
Modell beschrieben. Die Beschreibungen liegen in
`.cache/vision_descriptions.json` und werden wiederverwendet. Vision erzeugt
nur zusätzliche Suchanker; der eigentliche Match muss weiterhin durch CV,
Content-Reranking, Timing und Duration-Coverage bestätigt werden.
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
Vision-Szenen echte Treffer nicht verdrängen. Für schnelle Experimente kann
`skip_coarse_scan_with_weighted_seeds = true` gesetzt werden.
Gewichtete Vision-Seeds werden nicht zuerst durch den alten Midpoint-Template
Refine verschoben; sie gehen direkt in die lokale Content-Alignment-Prüfung.
Das schützt wiederholte Gesprächseinstellungen, bei denen ähnliche Momente
mehrfach in derselben Szene vorkommen.
Innerhalb der automatisch von Vision vorgeschlagenen Szenen läuft zusätzlich
eine dichte lokale Bildsequenzsuche. Sie misst den Phasenversatz in kleinen
Zeitschritten direkt am Bildinhalt und bevorzugt Kandidaten mit genügend
Restdauer in derselben Source-Szene. Das ist kein manueller Override: Vision
grenzt nur Suchbereiche ein, die Auswahl bleibt Content-, Timing- und
Coverage-getrieben.
Nach einem dichten Vision-Treffer darf der spätere lokale Aligner nur noch im
Bereich dieses Scan-Schritts nachjustieren. So kann ein korrekt gefundener
Bewegungsmoment nicht wieder um viele Frames in eine ähnlich aussehende Phase
derselben Szene verschoben werden.
Wenn mehrere Vision-Kandidaten in derselben Source-Szene ähnlich gut scoren
und die Beat-Dauer abdecken, bevorzugt der Matcher die frühere Phase. Das
verhindert, dass ein späterer, minimal stärkerer Standbildtreffer die
Bewegungsphase des Trailers sichtbar überholt.
Enthält ein Trailerbeat selbst einen harten Umschnitt, werden Kandidaten an
angrenzenden Source-Szenengrenzen zusätzlich als zusammenhängender Multi-Shot-
Span geprüft. Ein Match darf dann über eine Source-Szenengrenze laufen, aber
nur wenn die relative Source-Grenze zeitlich zu einem erkannten Trailer-Umschnitt
passt. So kann ein Beat aus Frage/Antwort-Shots vollständig erfasst werden,
ohne Szenen willkürlich zusammenzukleben.
Auch der lokale Content-Aligner darf einen Inpoint nur noch übernehmen, wenn
die feste Whole-Frame-/Spatial-Validation dadurch besser wird.
Vor dem teuren Frame-Refine wird der gesamte Kandidatenpool mit einer schnellen
festen Inhaltsprüfung neu sortiert. Dadurch können korrekte Treffer aus
wiederholten Einstellungen einer Szene nach oben kommen, auch wenn ein freier
Template-Peak an anderer Stelle numerisch stärker war. Suchanker bleiben im
Pool erhalten, dürfen aber erst nach der Inhaltsprüfung nach oben rücken. Wenn
ein Kandidat visuell plausibel ist, aber wegen Trailerblende oder kurzem
Source-Span die normale Coverage knapp verfehlt, wird er als provisional Match
behalten statt als `NO MATCH` verworfen.
Dieses Reranking berücksichtigt zusätzlich die verbleibende Szenenlänge ab dem
Kandidaten-Inpoint. Dadurch werden zu späte ähnliche Gesprächsphasen innerhalb
derselben Szene nicht mehr vor frühere, tragfähigere Phasen sortiert.
Das Inhalts-Reranking nutzt bewusst nur wenige repräsentative Referenzframes und
eine begrenzte Kandidatenzahl. So bleiben wiederholte Szenen auffindbar, ohne
dass der Lauf durch tausende Random-Seeks minutenlang festhängt.
Confirmed Matches werden zusätzlich durch eine feste nahezu-Whole-Frame-Prüfung
aus Luma, Kanten, Farbhistogramm und räumlichen 4x4-Farbhistogrammen gedeckelt.
Dadurch kann ein freier Template-Hit mit ähnlicher Fenster-/Gesichtsstruktur
nicht mehr als sicherer Match gelten, wenn die Gesamtkomposition oder die
Bewegungsphase sichtbar eine andere Szene ist.
Für gewichtete Vision-Kandidaten gibt es zusätzlich eine eigene Provisional-
Bewertung aus Content-Score, Restdauer und Seed-Stärke. Dadurch können echte,
aber durch Trailer-Grading/Crop numerisch schwache Treffer im Report landen,
ohne als confirmed Match durchzugehen.
Die Cache-Normalisierung für Report/Export verwendet dieselbe niedrigere
Content-Untergrenze für nicht bestätigte Vision-Provisional-Treffer, damit ein
gerade gefundener automatischer Match nicht beim Report-Aufbau wieder
weggefiltert wird.
Sie übernimmt auch die Multi-Shot-Coverage-Regel: gecachte Treffer, die passend
zu internen Trailer-Umschnitten über angrenzende Source-Szenen laufen, werden
nicht mehr auf die erste Source-Szene zurückgekürzt.
Gezielte Einzel-Beat-Matches gewichten außerdem die automatisch aus Nachbarbeats
abgeleiteten Continuity-Seeds. Wenn ein Beat direkt an einen bereits passenden
Vorgänger anschließt, kann ein späterer ähnlich aussehender Moment derselben
Dialogszene den erwarteten Anschluss nicht mehr nur wegen eines höheren
Standbildscores verdrängen.
Diese Continuity-Seeds sind aber nur Suchanker: in derselben Szene darf ein
späterer Inpoint gewinnen, wenn die mehrframeige Content-Prüfung die
Bewegungsphase klar besser trifft. Dadurch bleiben Anschlussmatches stabil,
ohne Hand-/Kopfbewegungen auf einen falschen Zeitpunkt festzunageln.
Continuity- und Vision-Seeds allein schalten den globalen FFmpeg-Scan
standardmäßig nicht ab. Sie sind Suchanker, keine Beweise; der volle CV-Scan
bleibt aktiv, damit semantisch plausible, aber falsche Vision-Treffer echte
Bildmatches nicht verdrängen.
Lange Trailerbeats werden nicht mehr automatisch über ihre gesamte Beat-Länge
gegen einen einzigen Source-Clip validiert. Sobald nach einem sichtbaren
Source-Abschnitt eine anhaltende Schwarzblende oder Titel-/Credit-Insel beginnt,
endet der matchbare Referenzbereich dort; zwei aufeinanderfolgende dunkle
Samples reichen dafür. Spätere Text-/Creditbilder im selben Beat gehen damit
nicht mehr in Reranking, Validation oder Span-Schätzung ein.
Zusätzlich werden sehr dunkle, kontrastarme oder noch nicht sauber
auf-/abgeblendete Referenzframes aus Score, Inhalts-Reranking,
Phasen-Alignment und Motion-Templates herausgenommen. Blenden sollen bestimmen,
wie der Clip später exportiert wird, aber nicht, ob der Bildinhalt als Match
gilt.
Treffer unter `provisional_content_threshold` werden gar nicht mehr gespeichert
oder aus alten Cache-Ergebnissen übernommen. Das verhindert, dass offensichtlich
falsche Szenen im Report als Match-Kandidat weiterleben.
### Log-Level
```powershell
python cli.py run --log-level DEBUG
```
---
## Projektstruktur
```
ai_trailer_2026/
├── config.toml ← Alle Parameter (kein Hardcoding im Code)
├── .env ← API-Keys (NICHT committen)
├── cli.py ← Einstiegspunkt
├── src/
│ ├── core/
│ │ ├── config.py load_config() → AppConfig (frozen dataclasses)
│ │ └── models.py Scene, TrailerBeat, VibeHit, MatchResult, EditTimeline
│ ├── cv/
│ │ ├── fingerprinting.py Text-Safe Crop · HS-Histogramme · pHash
│ │ ├── vibe_check.py Phase 1: Histogram+pHash Filter
│ │ ├── scene_indexer.py PySceneDetect → Fingerprint → JSON-Cache
│ │ ├── frame_extractor.py VideoCapture-Wrapper
│ │ └── deep_scan.py Phase 2: Coarse+Refine Template-Matching
│ ├── audio/
│ │ └── transcriber.py faster-whisper Transkription
│ ├── llm/
│ │ ├── dramaturg.py OpenRouter → BeatType (Dialog/Dramaturgie)
│ │ └── vision_cache.py optionale gecachte 3-Frame Vision-Seeds
│ ├── pipeline/
│ │ ├── trailer_analyzer.py Reference-Trailer → TrailerBeat[]
│ │ └── matcher.py Orchestrierung + EditTimeline-Builder
│ └── export/
│ ├── timecode.py Sekunden ↔ FCPXML-Rational ↔ SMPTE
│ ├── fcpxml_writer.py FCPXML 1.10
│ └── edl_writer.py CMX 3600 EDL
├── output/ ← FCPXML/EDL Output (gitignored)
├── .cache/ ← Szenen-Index + Match-Ergebnisse (gitignored)
└── tests/ 52 Unit-Tests (pytest)
```
---
## Cache-Verhalten
Damit nicht bei jedem Lauf der gesamte Quellfilm neu analysiert werden muss:
| Datei | Inhalt | Neu bauen mit |
|-------|--------|---------------|
| `.cache/scene_index.json` | Alle Quellfilm-Szenen + Fingerprints | `--force-reindex` |
| `.cache/trailer_beats.json` | Erkannte Trailer-Beats | `python cli.py analyze` erneut |
| `.cache/match_results.json` | CV-Matching-Ergebnisse | `python cli.py match` erneut |
| `.cache/vision_descriptions.json` | Optionale 3-Frame Vision-Beschreibungen für Beats/Szenen | löschen oder anderes Vision-Modell konfigurieren |
---
## Tests
```powershell
pytest tests/ -v
```
Alle Tests laufen ohne echte Videodateien (synthetische Frames via numpy/OpenCV).
---
## Konfiguration (Auszug)
Alle Werte in `config.toml` — keine hardgecodeten Konstanten im Code.
```toml
[cv.vibe_check]
top_k_candidates = 10 # Top-K Kandidaten für Deep Scan
phash_max_distance = 12 # Hamming-Distanz Schwelle (0–64)
crop_top_fraction = 0.15 # Obere 15% ausblenden (Logos)
crop_bottom_fraction = 0.30 # Untere 30% ausblenden (Letterbox/Subs)
[cv.deep_scan]
coarse_step_seconds = 0.5 # Scan-Schrittgröße (Coarse Pass)
match_threshold = 0.65 # Mindestscore für bestätigte automatische Matches
provisional_match_threshold = 0.45 # Niedrigere automatische Kandidaten im Report zeigen
coarse_candidate_threshold = 0.50 # Niedrigeres Gate vor Multi-Frame-Refine
refine_window_seconds = 0.6 # Suchfenster für framegenaue Inpoint-Feinjustage
refine_step_seconds = 0.04 # ~1 Frame bei 25fps (Refine Pass)
content_align_window_seconds = 0.48 # Lokales Suchfenster um einen groben Treffer
content_align_sample_step_s = 0.28 # Referenzframes für direkten Bildinhalt-Offset
content_validation_weight = 0.35 # Gewicht der festen Whole-Frame-/Spatial-Endprüfung
provisional_content_threshold = 0.42 # Untergrenze für Report-/Cache-Kandidaten
start_tie_break_score_delta = 0.015 # Bei fast gleichen Scores früheren Inpoint wählen
start_preroll_frames = 0 # Kein pauschaler Start-Ausgleich; Offset kommt aus Bildinhalt
sequence_candidate_count = 240 # Breiter Kandidatenpool vor Inhalts-Reranking
max_refine_candidates = 6 # Teurer Frame-Refine läuft nur auf den besten Inhaltskandidaten
scene_seed_top_k = 30 # Scene-Level-Kandidaten als zusätzliche Suchanker
scene_seed_points_per_scene = 6 # Inpoint-Samples pro Scene-Level-Kandidat
content_rerank_candidate_count = 100 # Grobe Kandidaten vor Inhalts-Reranking
skip_coarse_scan_with_weighted_seeds = false # Vision-Seeds nur als Hinweise; Vollscan bleibt robust
sequence_score_weight = 0.55 # Gewicht für mehrere zeitliche Vergleichsframes
span_score_weight = 0.15 # Gewicht für Stabilität bis zum Beat-Ende
coarse_score_weight = 0.10 # Gewicht des groben Midpoint-Treffers
duration_score_weight = 0.20 # Gewicht für nutzbare Länge des Source-Treffers
duration_tie_break_score_delta = 0.03 # Bei ähnlichem Score längeren Treffer bevorzugen
min_duration_coverage = 0.65 # Treffer muss mindestens 65% des matchbaren Referenzanteils tragen
continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0] # Suchanker um gematchte Nachbarbeats
span_sample_step_s = 0.08 # Schrittweite für End-/Drift-Erkennung
trim_tail_frames = 4 # Sicherheitsabstand gegen kurze Blitzer am Ende
scene_boundary_epsilon_s = 0.12 # Szenengrenzen-Toleranz gegen 1-2 Frame Cut-Drift
scoreable_luma_mean_min = 24.0 # Zu dunkle/Fade-Frames nicht scoren
scoreable_luma_p90_min = 58.0 # Helle Bildanteile müssen sichtbar genug sein
scoreable_contrast_min = 24.0 # Kontrastarme Blenden/Titelinseln ignorieren
[vision]
enabled = false # Kostenkontrolle: per CLI mit --vision aktivierbar
model = "google/gemma-4-31b-it" # Muss ein visionfähiges OpenAI-kompatibles Modell sein
scene_candidate_top_k = 8 # Nur wenige Top-Szenen pro Beat beschreiben
max_new_descriptions_per_run = 12 # API-Kosten pro Lauf begrenzen
max_seed_scenes = 3 # Nur beste Vision-Szenen als Suchanker weitergeben
seed_points_per_scene = 12 # Inpoint-Samples pro Vision-Szene
seed_score = 0.88 # Vision-Seeds bekommen mehr Priorität als normale Scene-Seeds
max_refine_candidates = 6 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene
local_scan_step_s = 0.12 # Dichte lokale Bildsuche in Vision-Szenen
local_scan_max_points_per_scene = 180 # Laufzeitgrenze pro Source-Szene
local_scan_top_candidates = 18 # Beste lokale Kandidaten gehen ins Refinement
local_scan_tie_break_score_delta = 0.08 # Ähnliche Vision-Treffer: frühere Phase bevorzugen
multi_shot_cut_corr_threshold = 0.20 # Interne Trailer-Umschnitte erkennen
multi_shot_boundary_tolerance_s = 0.20 # Source-Grenze muss zum Trailer-Cut passen
fullscan_fallback = false # Nur relevant, wenn skip_coarse_scan_with_weighted_seeds=true ist
content_threshold = 0.22 # Lockeres Content-Gate nur für gewichtete Vision-Seeds
similarity_threshold = 0.18 # Mindest-Textähnlichkeit für Vision-Seeds
```
---
## Lizenz
Internes Tool — nicht für den öffentlichen Vertrieb.
+899
View File
@@ -0,0 +1,899 @@
"""
cli.py — AI Trailer Generator v2 — Command-Line Interface
Usage:
python cli.py analyze [--config CONFIG] [--no-audio] [--no-llm]
python cli.py match [--config CONFIG] [--force-reindex]
python cli.py rematch --beat N [--threshold F] [--refine]
python cli.py report [--config CONFIG]
python cli.py run [--config CONFIG] [--force-reindex] [--no-audio] [--no-llm]
python cli.py export [--config CONFIG] [--format fcpxml|edl|both]
On --no-audio / --no-llm:
These flags do NOT affect matching quality.
Whisper and the LLM only assign narrative labels (HOOK/SETUP/CLIMAX)
to beats in the export metadata. The CV pipeline is identical either way.
Use them for fast iterations: they skip large model downloads.
All heavy imports are deferred so --help is instant.
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
from pathlib import Path
# ---------------------------------------------------------------------------
# Logging setup
# ---------------------------------------------------------------------------
def _setup_logging(level: str = "INFO") -> None:
    """Configure root logging on stdout and make the console UTF-8 safe.

    Args:
        level: Name of a ``logging`` level (case-insensitive). Names that do
            not resolve to a numeric level fall back to ``INFO``.
    """
    # Force UTF-8 for Windows console emoji printing. A replaced stdout
    # (``None`` under pythonw, ``io.StringIO`` when redirected) has no
    # ``reconfigure``; skip it instead of crashing before logging exists.
    stream = sys.stdout
    current_encoding = (getattr(stream, "encoding", None) or "").lower()
    reconfigure = getattr(stream, "reconfigure", None)
    if current_encoding != "utf-8" and callable(reconfigure):
        reconfigure(encoding="utf-8")

    # getattr() also finds non-level attributes (e.g. "basicConfig" resolves
    # to a function); only accept real numeric levels.
    numeric_level = getattr(logging, level.upper(), None)
    if not isinstance(numeric_level, int):
        numeric_level = logging.INFO

    logging.basicConfig(
        # The ": " separator keeps logger name and message from running together.
        format="%(asctime)s %(levelname)-8s %(name)s: %(message)s",
        datefmt="%H:%M:%S",
        level=numeric_level,
        stream=sys.stdout,
    )
    # Pillow is chatty at DEBUG/INFO; keep it quiet regardless of the chosen level.
    logging.getLogger("PIL").setLevel(logging.WARNING)
def _ensure_utf8_console() -> None:
    """Make argparse help safe on Windows before logging is configured.

    Switches ``sys.stdout`` to UTF-8 when it reports a different encoding.
    Streams that cannot be reconfigured (``None``, ``io.StringIO``, custom
    capture objects) are left untouched instead of raising.
    """
    stream = sys.stdout
    # Compare case-insensitively: some streams report "UTF-8".
    current_encoding = (getattr(stream, "encoding", None) or "").lower()
    reconfigure = getattr(stream, "reconfigure", None)
    if current_encoding != "utf-8" and callable(reconfigure):
        reconfigure(encoding="utf-8")
# ---------------------------------------------------------------------------
# Cache helpers (match results ↔ JSON)
# ---------------------------------------------------------------------------
def _results_cache_path(cfg: "AppConfig") -> Path:  # type: ignore[name-defined]
    """Return the path of the JSON file holding cached match results."""
    cache_dir = cfg.paths.cache_dir
    results_file = cache_dir / "match_results.json"
    return results_file
def _save_results(results: list, cfg: "AppConfig") -> None:  # type: ignore[name-defined]
    """Serialize match results to the JSON cache file.

    Args:
        results: Match result objects. Each must expose the attributes written
            below; ``segments`` is optional and defaults to no segments.
        cfg: Application config; only used to resolve the cache file path.

    The file is written to a sibling temp file first and then swapped into
    place, so an interrupted run cannot leave a truncated cache that a later
    load would fail to parse.
    """
    data = [
        {
            "beat_id": r.beat_id,
            "scene_id": r.scene_id,
            # Path objects are not JSON-serializable; store them as strings.
            "source_path": str(r.source_path),
            "in_point_s": r.in_point_s,
            "out_point_s": r.out_point_s,
            "in_point_frame": r.in_point_frame,
            "match_score": r.match_score,
            "match_location": list(r.match_location),
            "is_confirmed": r.is_confirmed,
            "segments": [
                {
                    "trailer_offset_s": s.trailer_offset_s,
                    "duration_s": s.duration_s,
                    "scene_id": s.scene_id,
                    "in_point_s": s.in_point_s,
                    "out_point_s": s.out_point_s,
                    "match_score": s.match_score,
                    "is_confirmed": s.is_confirmed,
                }
                # Results without a ``segments`` attribute are written with an empty list.
                for s in getattr(r, "segments", ())
            ],
        }
        for r in results
    ]
    p = _results_cache_path(cfg)
    p.parent.mkdir(parents=True, exist_ok=True)
    # Atomic swap: readers see either the old cache or the complete new one.
    tmp = p.with_name(p.name + ".tmp")
    tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
    tmp.replace(p)
    logging.getLogger(__name__).info("Match results cached → %s", p)
def _load_results(cfg: "AppConfig") -> list:  # type: ignore[name-defined]
    """Read cached match results back into ``MatchResult`` objects.

    Raises:
        FileNotFoundError: If the cache file does not exist yet.
    """
    from src.core.models import MatchResult, MatchSegment

    cache_file = _results_cache_path(cfg)
    if not cache_file.exists():
        raise FileNotFoundError(f"No cached results at {cache_file}. Run 'match' first.")

    def _segment_from(entry: dict):
        # Segment fields are coerced explicitly to their numeric/bool types.
        return MatchSegment(
            trailer_offset_s=float(entry["trailer_offset_s"]),
            duration_s=float(entry["duration_s"]),
            scene_id=int(entry["scene_id"]),
            in_point_s=float(entry["in_point_s"]),
            out_point_s=float(entry["out_point_s"]),
            match_score=float(entry["match_score"]),
            is_confirmed=bool(entry.get("is_confirmed", True)),
        )

    loaded = []
    for record in json.loads(cache_file.read_text(encoding="utf-8")):
        loaded.append(
            MatchResult(
                beat_id=record["beat_id"],
                scene_id=record["scene_id"],
                source_path=Path(record["source_path"]),
                in_point_s=record["in_point_s"],
                out_point_s=record["out_point_s"],
                in_point_frame=record["in_point_frame"],
                match_score=record["match_score"],
                match_location=tuple(record["match_location"]),
                # Entries without the flag are treated as confirmed.
                is_confirmed=record.get("is_confirmed", True),
                segments=tuple(_segment_from(s) for s in record.get("segments", ())),
            )
        )
    return loaded
def _load_scene_cache_light(cfg) -> list[dict]:
    """Load the raw scene index JSON, or return an empty list if it is missing."""
    index_file = cfg.paths.cache_dir / "scene_index.json"
    if index_file.exists():
        return json.loads(index_file.read_text(encoding="utf-8"))
    return []
def _scene_fps_light(scene: dict, cfg) -> float:
    """Derive a scene's frame rate from its frame and time span.

    Falls back to ``cfg.export.edl_frame_rate`` when the scene has no positive
    duration or no positive frame count.
    """
    span_s = float(scene["end_s"]) - float(scene["start_s"])
    frames = int(scene["end_frame"]) - int(scene["start_frame"])
    if not (span_s > 0 and frames > 0):
        return cfg.export.edl_frame_rate
    return frames / span_s
def _scene_for_time_light(scenes: list[dict], t_sec: float, cfg) -> dict | None:
    """Find the scene dict covering ``t_sec``.

    A time within ``scene_boundary_epsilon_s`` of the covering scene's end is
    attributed to the following scene when one exists. Returns ``None`` when no
    scene's half-open ``[start_s, end_s)`` interval contains ``t_sec``.
    """
    for position, candidate in enumerate(scenes):
        if not (float(candidate["start_s"]) <= t_sec < float(candidate["end_s"])):
            continue
        remaining_s = float(candidate["end_s"]) - t_sec
        near_cut = remaining_s <= cfg.cv.deep_scan.scene_boundary_epsilon_s
        if near_cut and position + 1 < len(scenes):
            return scenes[position + 1]
        return candidate
    return None
def _scene_by_id_light(scenes: list[dict], scene_id: int) -> dict | None:
    """Return the first scene dict whose ``scene_id`` equals ``scene_id``, else ``None``."""
    for candidate in scenes:
        if int(candidate["scene_id"]) == scene_id:
            return candidate
    return None
def _contiguous_duration_light(beat, in_point_s: float, scenes: list[dict], cfg, matchable_duration_s: float) -> float:
    """Estimate how much source time from ``in_point_s`` a match may cover.

    Walks the scenes starting at the one containing ``in_point_s``. A scene
    boundary may be crossed only if its offset from the in-point lies within
    ``cfg.vision.multi_shot_boundary_tolerance_s`` of one of the beat's internal
    cut offsets. Otherwise the span ends at that boundary, shortened by
    ``trim_tail_frames``.

    Returns:
        ``matchable_duration_s`` if the whole span fits, a shorter non-negative
        duration if a boundary stops it, or ``0.0`` for a non-positive request
        or an in-point outside every scene.
    """
    if matchable_duration_s <= 0:
        return 0.0

    # Best effort: if the cut detector cannot be imported or fails, behave as
    # if the beat had no internal cuts.
    try:
        from src.cv.global_scan import _reference_internal_cut_offsets
        cut_offsets = _reference_internal_cut_offsets(beat, cfg)
    except Exception:
        cut_offsets = []

    first_idx = next(
        (
            i
            for i, sc in enumerate(scenes)
            if float(sc["start_s"]) <= in_point_s < float(sc["end_s"])
        ),
        None,
    )
    if first_idx is None:
        return 0.0

    wanted_end = in_point_s + matchable_duration_s
    reached_end = in_point_s
    for sc in scenes[first_idx:]:
        scene_end = float(sc["end_s"])
        if wanted_end <= scene_end:
            return matchable_duration_s

        offset_at_boundary = scene_end - in_point_s
        boundary_matches_cut = any(
            abs(offset_at_boundary - cut_offset) <= cfg.vision.multi_shot_boundary_tolerance_s
            for cut_offset in cut_offsets
        )
        if boundary_matches_cut:
            # The boundary lines up with a trailer cut: continue into the next scene.
            reached_end = scene_end
            continue

        trim_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / _scene_fps_light(sc, cfg))
        return max(0.0, scene_end - in_point_s - trim_s)

    # Ran out of scenes before reaching the wanted end.
    return max(0.0, reached_end - in_point_s)
def _normalize_cached_results(beats: list, results: list, cfg) -> list:
"""
Re-apply current generic timing rules to cached results.
This keeps old automatic cache entries from preserving obsolete scene-boundary
or tail-trim behavior without introducing manual per-beat truth.

Args:
beats: Trailer beats; indexed by ``beat_id`` to find the beat of each result.
results: Cached match results to re-validate.
cfg: Application config; ``cv.deep_scan`` and ``vision`` settings are read here.

Returns:
A new list holding the surviving (possibly adjusted) results. When the
scene cache is empty or missing, ``results`` itself is returned unchanged.
"""
from dataclasses import replace
scenes = _load_scene_cache_light(cfg)
# Without a scene index there is nothing to reconcile the results against.
if not scenes:
return results
beats_by_id = {b.beat_id: b for b in beats}
normalized = []
for result in results:
beat = beats_by_id.get(result.beat_id)
# Entries scoring below the provisional match threshold are dropped.
if result.match_score < cfg.cv.deep_scan.provisional_match_threshold:
continue
# ``scene`` is located by time, ``declared_scene`` by the id stored in the result.
scene = _scene_for_time_light(scenes, result.in_point_s, cfg)
declared_scene = _scene_by_id_light(scenes, result.scene_id)
# If the automatic matcher selected a scene but its in-point sits just
# before that scene's detected start, treat this as scene-boundary drift
# and clamp to the declared scene. This is generic: no beat IDs, no
# manual timestamps, just consistent scene/time reconciliation.
if declared_scene is not None:
declared_start = float(declared_scene["start_s"])
declared_end = float(declared_scene["end_s"])
declared_fps = _scene_fps_light(declared_scene, cfg)
# Tolerance = boundary epsilon plus the configured preroll expressed in seconds.
boundary_tolerance_s = (
cfg.cv.deep_scan.scene_boundary_epsilon_s
+ cfg.cv.deep_scan.start_preroll_frames / declared_fps
)
if declared_start - boundary_tolerance_s <= result.in_point_s < declared_end:
scene = declared_scene
# Without both a beat and a scene the entry cannot be re-validated; keep it as cached.
if beat is None or scene is None:
normalized.append(result)
continue
fps = _scene_fps_light(scene, cfg)
adjusted_in_s = result.in_point_s
scene_changed = int(scene["scene_id"]) != result.scene_id
starts_before_scene = result.in_point_s < float(scene["start_s"])
# NOTE(review): 0.12 is hard-coded here and again below; it appears to flag
# near-zero-length results — confirm whether it should come from config.
if scene_changed or starts_before_scene or result.duration_s <= 0.12:
adjusted_in_s = max(0.0, result.in_point_s - (cfg.cv.deep_scan.start_preroll_frames / fps))
adjusted_in_s = max(float(scene["start_s"]), adjusted_in_s)
scene = _scene_for_time_light(scenes, adjusted_in_s, cfg) or scene
fps = _scene_fps_light(scene, cfg)
# Default to the full beat duration; the estimator below may replace it.
matchable_duration_s = beat.duration_s
try:
from src.cv.global_scan import estimate_matchable_reference_duration
matchable_duration_s = estimate_matchable_reference_duration(beat, cfg)
except Exception:
# Best effort: an import or estimation failure keeps the full beat duration.
pass
tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / fps)
# Longest span available inside the current scene after trimming the tail.
single_scene_duration_s = max(0.0, min(beat.duration_s, float(scene["end_s"]) - adjusted_in_s) - tail_s)
# Span that may also run across scene boundaries (see _contiguous_duration_light).
contiguous_duration_s = _contiguous_duration_light(
beat,
adjusted_in_s,
scenes,
cfg,
matchable_duration_s,
)
max_duration_s = max(single_scene_duration_s, min(beat.duration_s, contiguous_duration_s))
normalized_result = result
# Rewrite the entry when its scene/in-point changed, it is near zero length,
# or its out-point exceeds the allowed span by more than one frame.
if (
scene_changed
or starts_before_scene
or result.duration_s <= 0.12
or result.out_point_s > adjusted_in_s + max_duration_s + (1.0 / fps)
):
normalized_result = replace(
result,
scene_id=int(scene["scene_id"]),
in_point_s=adjusted_in_s,
out_point_s=adjusted_in_s + max_duration_s,
in_point_frame=int(adjusted_in_s * fps),
)
# Share of the matchable reference duration that the result covers.
coverage = (
max(0.0, normalized_result.duration_s) / matchable_duration_s
if matchable_duration_s > 0 else 0.0
)
# Entries covering too little of the matchable reference are dropped.
if coverage < cfg.cv.deep_scan.min_duration_coverage:
continue
try:
from src.cv.content_align import align_cached_match_by_content
# Only the second returned value (the content score) is used here.
# NOTE(review): fps=12.5 and the 0.8 s window cap are hard-coded — confirm intent.
_, content_score = align_cached_match_by_content(
beat,
normalized_result.in_point_s,
cfg,
search_window_s=min(0.8, cfg.cv.deep_scan.content_align_window_seconds),
fps=12.5,
)
# Confirmed entries use the provisional content threshold; unconfirmed ones
# use the lower of that threshold and the vision content threshold.
content_gate = (
cfg.cv.deep_scan.provisional_content_threshold
if normalized_result.is_confirmed
else min(cfg.cv.deep_scan.provisional_content_threshold, cfg.vision.content_threshold)
)
if content_score < content_gate:
continue
# A confirmed entry whose content score is below match_threshold is demoted
# to unconfirmed and its score is capped at the content score.
if content_score < cfg.cv.deep_scan.match_threshold and normalized_result.is_confirmed:
normalized_result = replace(
normalized_result,
match_score=min(normalized_result.match_score, content_score),
is_confirmed=False,
)
except Exception:
# Best effort: if the content check cannot be imported or fails, the
# timing-normalized entry is kept without a content gate.
pass
normalized.append(normalized_result)
return normalized
# ---------------------------------------------------------------------------
# Command handlers
# ---------------------------------------------------------------------------
def _build_transcribe_callback(cfg):
    """Return a callback that transcribes a slice of a video using ``cfg``.

    The callback takes ``(path, start_s, end_s, offset_s)`` and forwards them
    to ``transcribe_video``, passing ``offset_s`` as ``time_offset_s``.
    """
    from src.audio.transcriber import transcribe_video

    return lambda path, start_s, end_s, offset_s: transcribe_video(
        path, cfg, start_s=start_s, end_s=end_s, time_offset_s=offset_s
    )
def _build_classify_callback(cfg):
    """Return a callback that classifies a list of beats using ``cfg``."""
    from src.llm.dramaturg import classify_beats

    return lambda beats: classify_beats(beats, cfg)
def cmd_analyze(args: argparse.Namespace, cfg) -> list:
    """Analyze the reference trailer and cache the detected beats as JSON.

    Args:
        args: Parsed CLI arguments; ``no_audio`` and ``no_llm`` disable the
            transcription and classification callbacks respectively.
        cfg: Application config.

    Returns:
        The beats returned by ``analyze_reference_trailer``.
    """
    from src.pipeline.trailer_analyzer import analyze_reference_trailer

    transcribe_cb = None if args.no_audio else _build_transcribe_callback(cfg)
    classify_cb = None if args.no_llm else _build_classify_callback(cfg)
    beats = analyze_reference_trailer(
        cfg,
        transcribe_callback=transcribe_cb,
        classify_callback=classify_cb,
    )

    def _beat_record(beat) -> dict:
        # Histogram bytes are stored as hex strings so they survive JSON.
        spoken = [
            {"start_s": line.start_s, "end_s": line.end_s, "text": line.text}
            for line in beat.dialogue
        ]
        return {
            "beat_id": beat.beat_id,
            "start_s": beat.start_s,
            "end_s": beat.end_s,
            "start_frame": beat.start_frame,
            "end_frame": beat.end_frame,
            "beat_type": beat.beat_type.name,
            "dialogue": spoken,
            "phash": beat.phash,
            "luma_hist": beat.luma_hist.hex() if beat.luma_hist else None,
            "sat_hist": beat.sat_hist.hex() if beat.sat_hist else None,
        }

    # Persist beats for downstream commands.
    beats_cache = cfg.paths.cache_dir / "trailer_beats.json"
    beats_cache.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps([_beat_record(b) for b in beats], indent=2, ensure_ascii=False)
    beats_cache.write_text(serialized, encoding="utf-8")
    print(f"\n\u2705 {len(beats)} beats analyzed \u2192 {beats_cache}")
    return beats
def _load_beats(cfg) -> list:
    """Rebuild ``TrailerBeat`` objects from the cached beats JSON.

    Raises:
        FileNotFoundError: If the beats cache has not been written yet.
    """
    from src.core.models import BeatType, DialogueLine, TrailerBeat

    beats_file = cfg.paths.cache_dir / "trailer_beats.json"
    if not beats_file.exists():
        raise FileNotFoundError(f"No cached beats at {beats_file}. Run 'analyze' first.")

    def _beat_from(entry: dict):
        lines = tuple(
            DialogueLine(start_s=item["start_s"], end_s=item["end_s"], text=item["text"])
            for item in entry.get("dialogue", [])
        )
        return TrailerBeat(
            beat_id=entry["beat_id"],
            # The trailer path comes from the current config, not from the cache.
            trailer_path=cfg.paths.reference_trailer,
            start_s=entry["start_s"],
            end_s=entry["end_s"],
            start_frame=entry["start_frame"],
            end_frame=entry["end_frame"],
            beat_type=BeatType[entry.get("beat_type", "UNKNOWN")],
            dialogue=lines,
            phash=entry.get("phash"),
            # Histograms are stored as hex strings; missing or empty values become None.
            luma_hist=bytes.fromhex(entry["luma_hist"]) if entry.get("luma_hist") else None,
            sat_hist=bytes.fromhex(entry["sat_hist"]) if entry.get("sat_hist") else None,
        )

    cached = json.loads(beats_file.read_text(encoding="utf-8"))
    return [_beat_from(entry) for entry in cached]
def _select_beats(beats: list, beat_id: int | None) -> list:
"""Return all beats or exactly one requested beat."""
if beat_id is None:
return beats
selected = [b for b in beats if b.beat_id == beat_id]
if not selected:
raise ValueError(f"Beat {beat_id} not found. Run 'analyze' first.")
return selected
def _select_results(results: list, beat_ids: set[int] | None) -> list:
"""Return all results or only results for the requested beats."""
if beat_ids is None:
return results
return [r for r in results if r.beat_id in beat_ids]
def _find_scene_for_in_point(cfg, in_point_s: float):
    """Return the indexed scene that contains *in_point_s*, or ``None``.

    A scene contains the time when ``start_s <= in_point_s < end_s``.  If the
    time lies within ``cfg.cv.deep_scan.scene_boundary_epsilon_s`` of that
    scene's end and a following scene exists, the following scene is returned
    instead.
    """
    from src.cv.scene_indexer import build_scene_index

    scenes = build_scene_index(cfg)
    for position, candidate in enumerate(scenes):
        if not (candidate.start_s <= in_point_s < candidate.end_s):
            continue
        remaining_s = candidate.end_s - in_point_s
        hugs_scene_end = remaining_s <= cfg.cv.deep_scan.scene_boundary_epsilon_s
        if hugs_scene_end and position + 1 < len(scenes):
            return scenes[position + 1]
        return candidate
    return None
def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]:
    """Find runs of scoreable frames ("islands") inside a trailer beat.

    The beat is sampled every ``step_s`` seconds.  Consecutive scoreable
    samples form an island; a run of non-scoreable samples longer than
    ``bridge_gap_s`` ends it.  Islands shorter than ``min_segment_s`` are
    dropped.

    Returns:
        ``(start, end)`` pairs in seconds relative to the beat start.
    """
    from src.cv.frame_extractor import grab_frame_at_path
    from src.cv.global_scan import _is_scoreable_reference_frame

    step_s = max(0.08, cfg.cv.deep_scan.span_sample_step_s)
    min_segment_s = max(0.32, step_s * 3.0)
    bridge_gap_s = max(0.18, step_s * 2.0)

    islands: list[tuple[float, float]] = []

    def _close_island(first_t, final_t):
        # The island extends one sample step past its last scoreable sample,
        # but never beyond the beat itself.
        stop_t = min(beat.duration_s, final_t + step_s)
        if stop_t - first_t >= min_segment_s:
            islands.append((first_t, stop_t))

    island_start: float | None = None
    last_hit: float | None = None
    offset_s = 0.0
    while offset_s <= beat.duration_s:
        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s)
        if frame is not None and _is_scoreable_reference_frame(frame, cfg):
            if island_start is None:
                island_start = offset_s
            last_hit = offset_s
        elif (
            island_start is not None
            and last_hit is not None
            and offset_s - last_hit > bridge_gap_s
        ):
            _close_island(island_start, last_hit)
            island_start = None
            last_hit = None
        # Rounding keeps repeated float addition from drifting.
        offset_s = round(offset_s + step_s, 6)

    if island_start is not None and last_hit is not None:
        _close_island(island_start, last_hit)
    return islands
def _attach_visual_segments(results: list, beats: list, cfg) -> list:
    """Attach ``MatchSegment`` tuples to each match result.

    For every result whose beat is known, the beat is split into scoreable
    islands.  With at most one island the whole match becomes a single
    segment.  With several islands the first one reuses the existing match and
    each further island is matched on its own via ``run_global_scan``;
    islands without a match are left out.

    Returns:
        A new list of results; results for unknown beats pass through as-is.
    """
    from dataclasses import replace
    from src.core.models import MatchResult, MatchSegment
    from src.cv.global_scan import run_global_scan

    def _make_segment(match, offset_s, length_s, out_s):
        return MatchSegment(
            trailer_offset_s=offset_s,
            duration_s=length_s,
            scene_id=match.scene_id,
            in_point_s=match.in_point_s,
            out_point_s=out_s,
            match_score=match.match_score,
            is_confirmed=match.is_confirmed,
        )

    beat_lookup = {b.beat_id: b for b in beats}
    expanded: list[MatchResult] = []
    for result in results:
        beat = beat_lookup.get(result.beat_id)
        if beat is None:
            expanded.append(result)
            continue

        islands = _reference_scoreable_segments(beat, cfg)
        whole_duration = max(0.0, result.duration_s)
        if len(islands) <= 1:
            only = _make_segment(result, 0.0, whole_duration, result.out_point_s)
            expanded.append(replace(result, segments=(only,)))
            continue

        (head_start, head_end), *tail_islands = islands
        head_duration = min(whole_duration, max(0.0, head_end - head_start))
        segments = [
            _make_segment(
                result, head_start, head_duration, result.in_point_s + head_duration
            )
        ]
        for island_start, island_end in tail_islands:
            # Match this island on its own by narrowing the beat to its window.
            sub_beat = replace(
                beat,
                start_s=beat.start_s + island_start,
                end_s=beat.start_s + island_end,
            )
            found = run_global_scan([sub_beat], cfg, seed_in_points=None)
            if found:
                hit = found[0]
                hit_duration = min(
                    max(0.0, island_end - island_start), max(0.0, hit.duration_s)
                )
                segments.append(
                    _make_segment(
                        hit, island_start, hit_duration, hit.in_point_s + hit_duration
                    )
                )
        expanded.append(replace(result, segments=tuple(segments)))
    return expanded
def cmd_match(args: argparse.Namespace, cfg) -> list:
    """Match cached trailer beats against the source movie and save the results.

    With ``--beat`` only that beat is matched: neighbouring cached matches
    provide continuity seeds, and the new result is merged into the existing
    results cache instead of replacing it.

    Args:
        args: Parsed CLI arguments (``force_reindex`` and, optionally,
            ``beat``, ``vision``, ``no_vision``).
        cfg: Application configuration.

    Returns:
        The match results produced by this run (not the merged cache).
    """
    from src.pipeline.matcher import run_matching
    from dataclasses import replace

    # CLI switches override [vision].enabled; --no-vision is applied last and
    # therefore wins if both are given.
    for flag_name, enabled in (("vision", True), ("no_vision", False)):
        if getattr(args, flag_name, False):
            cfg = replace(cfg, vision=replace(cfg.vision, enabled=enabled))

    target_id = getattr(args, "beat", None)
    all_beats = _load_beats(cfg)
    beats = _select_beats(all_beats, target_id)

    if _results_cache_path(cfg).exists():
        cached = _normalize_cached_results(all_beats, _load_results(cfg), cfg)
    else:
        cached = []

    seed_in_points = None
    if target_id is not None:
        seed_in_points = _continuity_seed_in_points(target_id, all_beats, cached, cfg)

    results = run_matching(
        cfg,
        beats,
        force_reindex=args.force_reindex,
        seed_in_points=seed_in_points,
    )
    results = _attach_visual_segments(results, beats, cfg)

    # A targeted one-beat match should improve the cache without deleting
    # automatic matches for other beats.
    if target_id is not None and _results_cache_path(cfg).exists():
        merged = [kept for kept in cached if kept.beat_id != target_id]
        for fresh in results:
            merged = _update_result(fresh, merged)
        results_to_save = merged
    else:
        results_to_save = results
    _save_results(results_to_save, cfg)

    print(f"\n{len(results)} / {len(beats)} beats matched.")
    for r in results:
        print(f" Beat {r.beat_id:03d} → scene {r.scene_id:04d} "
              f"in={r.in_point_s:>8.3f}s score={r.match_score:.3f}")
    return results
def _update_result(new_result, results: list) -> list:
"""Replace or insert a MatchResult in the list (by beat_id)."""
updated = [r for r in results if r.beat_id != new_result.beat_id]
updated.append(new_result)
return sorted(updated, key=lambda r: r.beat_id)
def _continuity_seed_in_points(beat_id: int, beats: list, results: list, cfg) -> dict[int, list[float | tuple[float, float]]]:
beats_by_id = {b.beat_id: b for b in beats}
results_by_id = {r.beat_id: r for r in results}
target = beats_by_id.get(beat_id)
if target is None:
return {}
seeds: list[tuple[float, float]] = []
base_score = max(cfg.cv.deep_scan.coarse_candidate_threshold + 0.08, 0.92)
prev_matches = [
(b, results_by_id[b.beat_id])
for b in beats
if b.beat_id < beat_id and b.beat_id in results_by_id
]
if prev_matches:
prev_beat, prev_result = max(prev_matches, key=lambda item: item[0].beat_id)
trailer_gap_s = max(0.0, target.start_s - prev_beat.end_s)
expected = prev_result.out_point_s + trailer_gap_s
for offset in cfg.cv.deep_scan.continuity_seed_offsets_s:
offset_score = max(
cfg.cv.deep_scan.coarse_candidate_threshold,
base_score - abs(offset) * 0.06,
)
seeds.append((expected + offset, offset_score))
next_matches = [
(b, results_by_id[b.beat_id])
for b in beats
if b.beat_id > beat_id and b.beat_id in results_by_id
]
if next_matches:
next_beat, next_result = min(next_matches, key=lambda item: item[0].beat_id)
trailer_gap_s = max(0.0, next_beat.start_s - target.end_s)
expected = next_result.in_point_s - trailer_gap_s - target.duration_s
for offset in cfg.cv.deep_scan.continuity_seed_offsets_s:
offset_score = max(
cfg.cv.deep_scan.coarse_candidate_threshold,
base_score - abs(offset) * 0.06,
)
seeds.append((expected - offset, offset_score))
unique: dict[float, float] = {}
for seed_t, seed_score in seeds:
rounded = round(max(0.0, seed_t), 3)
unique[rounded] = max(unique.get(rounded, 0.0), seed_score)
points = [(seed_t, score) for seed_t, score in sorted(unique.items())]
return {beat_id: points} if points else {}
def cmd_rematch(args: argparse.Namespace, cfg) -> None:
    """
    Re-run automatic matching for ONE beat and merge the result into the cache.

    Two modes:

    * ``--refine`` keeps the cached match but re-aligns its in-point with
      ``align_cached_match_by_content``.  The refined candidate is rejected
      when it covers less than ``cv.deep_scan.min_duration_coverage`` of the
      beat.
    * Otherwise ``run_global_scan`` is run again for the beat, seeded from
      neighbouring matches, optionally with ``--threshold`` overriding
      ``cv.deep_scan.match_threshold``.

        python cli.py rematch --beat 5                    # re-scan CV for beat 5
        python cli.py rematch --beat 5 --threshold 0.40   # relax threshold
        python cli.py rematch --beat 5 --refine           # re-align the cached match

    Problems are reported on stdout; nothing is raised for a missing beat,
    a missing cached match, or a failed scan.
    """
    beat_id = args.beat
    beats = _load_beats(cfg)
    results = _load_results(cfg) if _results_cache_path(cfg).exists() else []
    beat = next((b for b in beats if b.beat_id == beat_id), None)
    if beat is None:
        print(f"\u274c Beat {beat_id} not found. Run 'analyze' first.")
        return

    # ---- Refine an already acceptable cached match -------------------------
    if args.refine:
        current = next((r for r in results if r.beat_id == beat_id), None)
        if current is None:
            print(f"❌ Beat {beat_id} has no cached match to refine. Run 'match --beat {beat_id}' first.")
            return

        from src.cv.content_align import align_cached_match_by_content

        refined_in_s, sequence_score = align_cached_match_by_content(
            beat,
            current.in_point_s,
            cfg,
            search_window_s=args.refine_window,
        )
        # Clamp once, up front, so the in-point, out-point, scene lookup and
        # frame number are all derived from the same (non-negative) time.
        refined_in_s = max(0.0, refined_in_s)
        usable_duration_s = max(0.0, current.out_point_s - current.in_point_s)

        scene_data = _scene_for_time_light(_load_scene_cache_light(cfg), refined_in_s, cfg)
        out_point_s = refined_in_s + usable_duration_s
        if scene_data is not None:
            # Never let the refined span run past the end of its scene.
            out_point_s = min(out_point_s, float(scene_data["end_s"]))

        matchable_duration_s = beat.duration_s
        duration_coverage = (
            max(0.0, out_point_s - refined_in_s) / matchable_duration_s
            if matchable_duration_s > 0 else 0.0
        )
        if duration_coverage < cfg.cv.deep_scan.min_duration_coverage:
            print(
                f"❌ Beat {beat_id} refined candidate rejected: "
                f"duration coverage {duration_coverage:.0%} < "
                f"{cfg.cv.deep_scan.min_duration_coverage:.0%}"
            )
            return

        # Best effort: fall back to the configured EDL frame rate when the
        # source FPS cannot be probed (or is reported as 0).
        try:
            from src.cv.frame_extractor import get_video_info
            fps = float(get_video_info(cfg.paths.source_movie)["fps"]) or cfg.export.edl_frame_rate
        except Exception:
            fps = cfg.export.edl_frame_rate

        from src.core.models import MatchResult

        refined = MatchResult(
            beat_id=beat_id,
            scene_id=int(scene_data["scene_id"]) if scene_data is not None else current.scene_id,
            source_path=current.source_path,
            in_point_s=refined_in_s,
            out_point_s=out_point_s,
            in_point_frame=int(refined_in_s * fps),
            match_score=sequence_score,
            match_location=current.match_location,
            is_confirmed=sequence_score >= cfg.cv.deep_scan.match_threshold,
        )
        results = _update_result(refined, results)
        _save_results(results, cfg)
        print(
            f"✅ Beat {beat_id} refined → "
            f"in={refined.in_point_s:.3f}s, out={refined.out_point_s:.3f}s, "
            f"sequence_score={refined.match_score:.3f}"
        )
        return

    # ---- Re-run CV with optional threshold override ------------------------
    from dataclasses import replace as dc_replace

    run_cfg = cfg
    if args.threshold is not None:
        run_cfg = dc_replace(
            cfg,
            cv=dc_replace(
                cfg.cv,
                deep_scan=dc_replace(cfg.cv.deep_scan, match_threshold=args.threshold),
            ),
        )
        print(f"️ threshold overridden to {args.threshold} for beat {beat_id}")

    from src.cv.global_scan import run_global_scan

    seed_in_points = _continuity_seed_in_points(beat_id, beats, results, run_cfg)
    matches = run_global_scan([beat], run_cfg, seed_in_points=seed_in_points)
    if not matches:
        print(f"❌ Beat {beat_id}: no match. Try --threshold 0.40.")
        return

    match = matches[0]
    results = _update_result(match, results)
    _save_results(results, cfg)
    print(f"✅ Beat {beat_id} rematched → (in={match.in_point_s:.3f}s, score={match.match_score:.3f})")
def cmd_report(args: argparse.Namespace, cfg) -> None:
    """Generate the comparison report for all beats, or for one beat with ``--beat``.

    The report is generated even when the requested beat has no cached match;
    in that case a hint on how to create the match is printed as well.

    Args:
        args: Parsed CLI arguments; ``beat`` is optional.
        cfg: Application configuration.
    """
    from src.pipeline.reporter import generate_report

    beat_id = getattr(args, "beat", None)
    # Load the beats cache once and reuse it for selection and normalisation.
    all_beats = _load_beats(cfg)
    beats = _select_beats(all_beats, beat_id)
    beat_ids = {b.beat_id for b in beats} if beat_id is not None else None
    results = _select_results(
        _normalize_cached_results(all_beats, _load_results(cfg), cfg),
        beat_ids,
    )

    out = generate_report(beats, results, cfg)
    if beat_id is not None and not results:
        print(
            f"\n⚠️ Beat {beat_id} has no cached match yet. "
            f"Run: python cli.py match --beat {beat_id}"
        )
    print(f"\n\u2705 Report \u2192 {out}")
def cmd_export(args: argparse.Namespace, cfg) -> None:
    """Export the cached matches as FCPXML and/or EDL.

    The format comes from ``--format`` or, if absent, from
    ``cfg.export.output_format``.  With ``--beat`` only that beat is exported
    and the output file is named after the reference trailer and the beat id.

    Args:
        args: Parsed CLI arguments (``format`` and, optionally, ``beat``).
        cfg: Application configuration.
    """
    from src.export.edl_writer import write_edl
    from src.export.fcpxml_writer import write_fcpxml
    from src.pipeline.matcher import build_timeline

    beat_id = getattr(args, "beat", None)
    # Load the beats cache once and reuse it for selection and normalisation.
    all_beats = _load_beats(cfg)
    beats = _select_beats(all_beats, beat_id)
    beat_ids = {b.beat_id for b in beats} if beat_id is not None else None
    results = _select_results(
        _normalize_cached_results(all_beats, _load_results(cfg), cfg),
        beat_ids,
    )
    if beat_id is not None and not results:
        print(f"❌ Beat {beat_id} has no cached match. Run 'match --beat {beat_id}' first.")
        return

    fmt = args.format or cfg.export.output_format
    if fmt not in ("fcpxml", "edl", "both"):
        # --format is validated by argparse, so this can only come from the
        # config file; say so instead of silently exporting nothing.
        print(f"❌ Unsupported export format '{fmt}'. Use fcpxml, edl or both.")
        return

    timeline = build_timeline(beats, results, cfg)
    out_stem = (
        f"{cfg.paths.reference_trailer.stem}_beat_{beat_id:03d}"
        if beat_id is not None
        else timeline.title
    )
    if fmt in ("fcpxml", "both"):
        out = write_fcpxml(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.fcpxml")
        print(f"✅ FCPXML → {out}")
    if fmt in ("edl", "both"):
        out = write_edl(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.edl")
        print(f"✅ EDL → {out}")
def cmd_run(args: argparse.Namespace, cfg) -> None:
    """Full pipeline: analyze → match → report → export."""
    # Every stage receives the same parsed arguments and configuration and
    # communicates with the next one through the on-disk caches.
    for stage in (cmd_analyze, cmd_match, cmd_report, cmd_export):
        stage(args, cfg)
# ---------------------------------------------------------------------------
# Argument parser
# ---------------------------------------------------------------------------
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
prog="ai-trailer",
description="AI Trailer Generator v2 — Pure CV scene matching",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--config", type=Path, default=Path("config.toml"),
metavar="CONFIG", help="Path to config.toml (default: ./config.toml)",
)
parser.add_argument(
"--log-level", default="INFO",
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging verbosity (default: INFO)",
)
sub = parser.add_subparsers(dest="command", required=True)
# analyze
p_analyze = sub.add_parser("analyze", help="Detect trailer beats + fingerprint")
p_analyze.add_argument("--no-audio", action="store_true",
help="Skip Whisper (only affects beat labels, not matching)")
p_analyze.add_argument("--no-llm", action="store_true",
help="Skip LLM classification (only affects beat labels)")
# match
p_match = sub.add_parser("match", help="Run 2-phase CV matching")
p_match.add_argument("--force-reindex", action="store_true",
help="Ignore scene cache and re-run PySceneDetect")
p_match.add_argument("--beat", type=int,
help="Match only one beat and merge it into the cached results")
p_match.add_argument("--vision", action="store_true",
help="Enable cached vision descriptions for extra automatic search seeds")
p_match.add_argument("--no-vision", action="store_true",
help="Disable vision seeding even if [vision].enabled is true")
# rematch
p_rematch = sub.add_parser("rematch", help="Re-run or override matching for one beat")
p_rematch.add_argument("--beat", type=int, required=True, help="Beat ID to rematch")
p_rematch.add_argument("--threshold", type=float, default=None, help="Override match_threshold")
p_rematch.add_argument("--refine", action="store_true",
help="Refine the cached match by measuring a local image-content offset")
p_rematch.add_argument("--refine-window", type=float, default=None,
help="Seconds to search around the cached in-point when using --refine")
# report
p_report = sub.add_parser("report", help="Generate HTML visual comparison report")
p_report.add_argument("--beat", type=int, help="Report only one beat")
# export
p_export = sub.add_parser("export", help="Export timeline from cached results")
p_export.add_argument("--format", choices=["fcpxml", "edl", "both"],
help="Override [export] output_format from config")
p_export.add_argument("--beat", type=int, help="Export only one beat")
# run
p_run = sub.add_parser("run", help="Full pipeline: analyze → match → export")
p_run.add_argument("--no-audio", action="store_true")
p_run.add_argument("--no-llm", action="store_true")
p_run.add_argument("--force-reindex", action="store_true")
p_run.add_argument("--vision", action="store_true")
p_run.add_argument("--no-vision", action="store_true")
p_run.add_argument("--format", choices=["fcpxml", "edl", "both"])
p_run.add_argument("--beat", type=int,
help="Run match/report/export for only one cached beat")
return parser
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: parse arguments, load the config, run the chosen command."""
    _ensure_utf8_console()
    args = _build_parser().parse_args()
    _setup_logging(args.log_level)

    # Imported only after logging has been set up.
    from src.core.config import load_config

    cfg = load_config(args.config)

    # Sub-command name (args.command) -> handler taking (args, cfg).
    handlers = {
        "analyze": cmd_analyze,
        "match": cmd_match,
        "rematch": cmd_rematch,
        "report": cmd_report,
        "export": cmd_export,
        "run": cmd_run,
    }
    run_command = handlers[args.command]
    run_command(args, cfg)


if __name__ == "__main__":
    main()
+198
View File
@@ -0,0 +1,198 @@
# =============================================================================
# AI Trailer Generator v2 — Central Configuration
# =============================================================================
# All tunable parameters, thresholds, and file paths are defined here.
# NO hardcoded values are allowed in the Python source code.
# =============================================================================
[project]
name = "AI Trailer Generator v2"
version = "2.0.0"
log_level = "INFO" # DEBUG | INFO | WARNING | ERROR
# -----------------------------------------------------------------------------
# [paths] — External video sources (read-only access)
# -----------------------------------------------------------------------------
[paths]
source_movie = "B:/Proxy/BehindTheRedDoor_FTR_1080P_2398_Fixed.mp4"
reference_trailer = "F:/Encodings/BehindTheRedDoor_Trailer_REFERENCE.mp4"
# Output destinations (inside project sandbox)
output_dir = "output"
cache_dir = ".cache"
proxy_dir = "proxy"
# -----------------------------------------------------------------------------
# [video] — Decode / proxy settings
# -----------------------------------------------------------------------------
[video]
# Target FPS for internal frame extraction (0 = use source FPS)
extract_fps = 1.0
# Proxy resolution for template matching (width x height)
proxy_width = 640
proxy_height = 360
# -----------------------------------------------------------------------------
# [cv] — Computer Vision engine parameters
# Phase 1 — "Vibe Check" (histogram / perceptual hash scene-level filter)
# Phase 2 — "Deep Scan" (template matching frame-level precision)
# -----------------------------------------------------------------------------
[cv]
[cv.vibe_check]
# Number of top candidate scenes to forward to Deep Scan
top_k_candidates = 100
# Histogram comparison method:
# CORREL=0 | CHISQR=1 | INTERSECT=2 | BHATTACHARYYA=3
hist_compare_method = 0
# Histogram bins per channel (hue, saturation)
hist_bins_hue = 50
hist_bins_saturation = 60
# pHash similarity threshold (lower = stricter; 0-64 range)
# NOTE: 12 is for near-duplicate detection. Cross-video matching
# (trailer vs source movie with different grading/compression)
# needs 25-35. Start at 32 and tighten if you get false positives.
phash_max_distance = 32
# ---- Text-Safe Crop -------------------------------------------------------
# Fraction of frame height to EXCLUDE from the top (e.g. logos, title cards)
crop_top_fraction = 0.15
# Fraction of frame height to EXCLUDE from the bottom (e.g. letterbox, subs)
crop_bottom_fraction = 0.30
[cv.deep_scan]
# Step size in SECONDS between sampled frames during the coarse scan pass
coarse_step_seconds = 0.5
# Minimum template match score (0.0-1.0) to accept a candidate as a hit
match_threshold = 0.65
# Store/report lower-confidence automatic candidates for visual review instead
# of dropping them as "NO MATCH". Confirmed exports can still use match_threshold.
provisional_match_threshold = 0.45
# Lower gate for entering temporal multi-frame refinement. The final decision
# still uses sequence/span scoring; this only avoids rejecting real matches
# because one midpoint frame is weak.
coarse_candidate_threshold = 0.50
# Candidate ranking weights. Duration coverage matters when the same visual
# shot appears multiple times: prefer the occurrence that can cover the beat.
sequence_score_weight = 0.55
span_score_weight = 0.15
coarse_score_weight = 0.10
duration_score_weight = 0.20
duration_tie_break_score_delta = 0.03
min_duration_coverage = 0.65
continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0]
scene_seed_top_k = 30
scene_seed_points_per_scene = 6
content_rerank_candidate_count = 100
skip_coarse_scan_with_weighted_seeds = false
# cv2.matchTemplate method:
# TM_CCOEFF_NORMED=5 (recommended), TM_CCORR_NORMED=3
match_method = 5
# If a coarse hit is found, refine by scanning ± this many seconds
refine_window_seconds = 0.6
refine_step_seconds = 0.04 # ≈ 1 frame at 25 fps
content_align_window_seconds = 0.48
content_align_sample_step_s = 0.28
content_validation_weight = 0.35
provisional_content_threshold = 0.42
# When several adjacent frame offsets score almost the same, prefer the earlier
# one. This avoids matches that are visually correct but start a few frames late.
start_tie_break_score_delta = 0.015
start_preroll_frames = 0
# Automatic temporal verification after a coarse image hit.
# More candidates reduces false positives from visually similar shots.
sequence_candidate_count = 240
sequence_min_distance_s = 1.0
max_refine_candidates = 6
# Match-span detection: trim when the source starts drifting into a different shot.
span_sample_step_s = 0.08
trim_tail_frames = 4
# If a refined in-point lands this close to a detected scene end, treat it as
# the next scene. Scene detectors often place cuts a frame or two around the
# visible boundary.
scene_boundary_epsilon_s = 0.12
scoreable_luma_mean_min = 24.0
scoreable_luma_p90_min = 58.0
scoreable_contrast_min = 24.0
# -----------------------------------------------------------------------------
# [scene_detection] — PySceneDetect parameters (used to segment source movie)
# -----------------------------------------------------------------------------
[scene_detection]
# Threshold for ContentDetector (lower = more sensitive)
content_threshold = 27.0
# Minimum scene duration in seconds
min_scene_duration_s = 1.5
# -----------------------------------------------------------------------------
# [whisper] — Dialogue / audio analysis
# -----------------------------------------------------------------------------
[whisper]
model = "large-v3"
language = "ar"
device = "cuda" # cuda | cpu
compute_type = "float16" # float16 | int8 | float32
# -----------------------------------------------------------------------------
# [llm] — Used ONLY for thematic segmentation / dramaturgy
# -----------------------------------------------------------------------------
[llm]
provider = "openrouter"
base_url = "https://openrouter.ai/api/v1"
model = "google/gemma-4-31b-it"
timeout_seconds = 120
temperature = 0.3
max_tokens = 4096
# -----------------------------------------------------------------------------
# [vision] — Optional cached visual descriptions for ambiguous matching
# -----------------------------------------------------------------------------
[vision]
# Disabled by default to avoid surprise API cost. Enable when you want the
# matcher to ask a vision-capable model for cached 3-frame scene descriptions.
enabled = false
provider = "openrouter"
base_url = "https://openrouter.ai/api/v1"
model = "google/gemma-4-31b-it"
timeout_seconds = 90
temperature = 0.0
max_tokens = 350
# Cost controls: per beat, only the top scene-level candidates are described,
# and cached descriptions in .cache/vision_descriptions.json are reused.
scene_candidate_top_k = 8
max_new_descriptions_per_run = 12
max_seed_scenes = 3
seed_points_per_scene = 12
seed_score = 0.88
max_refine_candidates = 6
local_scan_step_s = 0.12
local_scan_max_points_per_scene = 180
local_scan_top_candidates = 18
local_scan_tie_break_score_delta = 0.08
multi_shot_cut_corr_threshold = 0.20
multi_shot_boundary_tolerance_s = 0.20
fullscan_fallback = false
content_threshold = 0.22
similarity_threshold = 0.18
# -----------------------------------------------------------------------------
# [export] — FCPXML / EDL export settings
# -----------------------------------------------------------------------------
[export]
fcpxml_version = "1.10"
edl_frame_rate = 23.976 # fps used in EDL timecode generation
output_format = "fcpxml" # fcpxml | edl | both
+68
View File
@@ -0,0 +1,68 @@
[build-system]
requires = ["setuptools>=69", "wheel"]
# setuptools' PEP 517 backend; "setuptools.backends.legacy:build" does not exist.
build-backend = "setuptools.build_meta"
[project]
name = "ai-trailer-2026"
version = "2.0.0"
description = "Frame-accurate trailer reconstruction via pure Computer Vision"
requires-python = ">=3.11"
dependencies = [
# Computer Vision
"opencv-python>=4.9",
"imagehash>=4.3",
"numpy>=1.26",
"Pillow>=10.0",
# Scene detection
"scenedetect[opencv]>=0.6",
# Audio / transcription
"faster-whisper>=1.0",
# Config / secrets
# tomllib — built-in stdlib (Python 3.11+), no install needed
"python-dotenv>=1.0", # loads .env into os.environ
# Export
"lxml>=5.0", # FCPXML generation
]
[project.optional-dependencies]
dev = [
"pytest>=8.0",
"pytest-cov",
"mypy>=1.9",
"ruff>=0.4",
]
[tool.setuptools.packages.find]
where = ["."]
include = ["src*"]
# ---------------------------------------------------------------------------
# Ruff (linter + formatter)
# ---------------------------------------------------------------------------
[tool.ruff]
line-length = 100
target-version = "py311"
[tool.ruff.lint]
select = ["E", "F", "I", "UP", "B", "C4", "ANN"]
ignore = ["ANN101", "ANN102"]
# ---------------------------------------------------------------------------
# Mypy
# ---------------------------------------------------------------------------
[tool.mypy]
python_version = "3.11"
strict = true
ignore_missing_imports = true
# ---------------------------------------------------------------------------
# Pytest
# ---------------------------------------------------------------------------
[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "-v --tb=short"
+37
View File
@@ -0,0 +1,37 @@
# AI Trailer Generator v2 — Python Dependencies
# Generated from: pip freeze (Python 3.11, Windows)
# Install with: pip install -r requirements.txt
#
# NOTE: faster-whisper and scenedetect may pull in torch/cuda extras
# depending on your platform. See README for CUDA setup.
# Computer Vision
opencv-python>=4.9
numpy>=1.26
Pillow>=10.0
ImageHash>=4.3
PyWavelets>=1.6 # required by ImageHash
# Video scene detection
scenedetect[opencv]>=0.6
# Audio transcription
# faster-whisper>=1.0 ← uncomment when ready to use Whisper
# (pulls in torch; large download)
# Config & secrets
python-dotenv>=1.0 # loads .env into os.environ
# tomllib — stdlib in Python 3.11+, no install needed
# XML export
# lxml>=5.0 ← optional: only needed for advanced FCPXML features
# stdlib xml.etree.ElementTree is used by default
# HTTP (LLM calls via urllib.request — no extra dep needed)
# requests ← not used; stdlib urllib is sufficient
# Dev / testing
pytest>=8.0
pytest-cov
# mypy>=1.9
# ruff>=0.4
+89
View File
@@ -0,0 +1,89 @@
# setup_venv.ps1 — AI Trailer Generator v2 — Virtual Environment Setup
# Run once: .\setup_venv.ps1
# -----------------------------------------------------------------------
# If blocked by ExecutionPolicy:
# Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
$ErrorActionPreference = "Stop"
$VENV_DIR = ".venv"
# Returns the path of the Python executable used to create the venv.
# Lookup order: `python` on PATH first, then a short list of well-known
# per-user install locations. Throws if none of them exists.
# NOTE: this only locates an interpreter; the version is checked by the caller.
function Resolve-ProjectPython {
# Prefer whatever `python` resolves to on PATH.
$cmd = Get-Command python -ErrorAction SilentlyContinue
if ($cmd) {
return $cmd.Source
}
# Fallback locations under the current user's local application data.
$candidates = @(
"$env:LOCALAPPDATA\Programs\Python\Python311\python.exe",
"$env:LOCALAPPDATA\Microsoft\WindowsApps\python.exe"
)
foreach ($candidate in $candidates) {
if ($candidate -and (Test-Path $candidate)) {
return $candidate
}
}
throw "Python 3.11+ not found. Install Python 3.11+ or add it to PATH."
}
Write-Host ""
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host " AI Trailer Generator v2 — venv Setup" -ForegroundColor Cyan
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host ""

# NOTE: $ErrorActionPreference = "Stop" does not stop the script when a native
# program (python, pip) exits with a non-zero code, so each such call below is
# followed by an explicit $LASTEXITCODE check.

# ---- 1. Check Python version ------------------------------------------------
$PROJECT_PYTHON = Resolve-ProjectPython
$pythonVersion = & $PROJECT_PYTHON --version 2>&1
Write-Host "Python: $pythonVersion"
if ($pythonVersion -notmatch "3\.(1[1-9]|[2-9]\d)") {
    Write-Error "Python 3.11+ required. Found: $pythonVersion"
    exit 1
}

# ---- 2. Create venv ---------------------------------------------------------
if (Test-Path $VENV_DIR) {
    Write-Host "Virtual environment already exists at '$VENV_DIR'. Skipping creation." -ForegroundColor Yellow
} else {
    Write-Host "Creating virtual environment in '$VENV_DIR' ..." -ForegroundColor Green
    & $PROJECT_PYTHON -m venv $VENV_DIR
    if ($LASTEXITCODE -ne 0) {
        throw "Creating the virtual environment failed (exit code $LASTEXITCODE)."
    }
    Write-Host "Done." -ForegroundColor Green
}

# ---- 3. Activate venv -------------------------------------------------------
$activate = Join-Path $VENV_DIR "Scripts\Activate.ps1"
Write-Host "Activating virtual environment ..."
. $activate
$VENV_PYTHON = Join-Path $VENV_DIR "Scripts\python.exe"

# ---- 4. Upgrade pip ---------------------------------------------------------
Write-Host "Upgrading pip ..." -ForegroundColor Green
& $VENV_PYTHON -m pip install --upgrade pip --quiet
if ($LASTEXITCODE -ne 0) {
    throw "Upgrading pip failed (exit code $LASTEXITCODE)."
}

# ---- 5. Install dependencies ------------------------------------------------
Write-Host "Installing dependencies from requirements.txt ..." -ForegroundColor Green
& $VENV_PYTHON -m pip install -r requirements.txt
if ($LASTEXITCODE -ne 0) {
    throw "Installing dependencies failed (exit code $LASTEXITCODE)."
}

# ---- 6. Copy .env if missing ------------------------------------------------
if (-not (Test-Path ".env")) {
    if (Test-Path ".env.example") {
        Copy-Item ".env.example" ".env"
        Write-Host ""
        Write-Host " .env created from .env.example." -ForegroundColor Yellow
        Write-Host " >>> Open .env and fill in your OPENROUTER_API_KEY! <<<" -ForegroundColor Red
    }
}

# ---- 7. Done ----------------------------------------------------------------
Write-Host ""
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host " Setup complete!" -ForegroundColor Green
Write-Host ""
Write-Host " Activate the venv with:"
Write-Host " .\.venv\Scripts\Activate.ps1" -ForegroundColor White
Write-Host ""
Write-Host " Then run the pipeline:"
Write-Host " python cli.py run --no-audio --no-llm" -ForegroundColor White
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host ""
+1
View File
@@ -0,0 +1 @@
# src package
+1
View File
@@ -0,0 +1 @@
# src.audio package — Whisper / dialogue analysis
+182
View File
@@ -0,0 +1,182 @@
"""
src/audio/transcriber.py — Whisper transcription via faster-whisper
Responsibility:
- Transcribe audio from a video file into a list of DialogueLine objects
- Optionally restrict to a time window [start_s, end_s] (for single beats)
- All model config (model name, device, compute_type) comes from AppConfig
The LLM is NOT used here. This is pure audio-to-text.
"""
from __future__ import annotations
import logging
import tempfile
from pathlib import Path
from typing import Sequence
from src.core.config import AppConfig
from src.core.models import DialogueLine
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Audio extraction helper (video → wav via ffmpeg)
# ---------------------------------------------------------------------------
def _extract_audio_segment(
    video_path: Path,
    start_s: float | None,
    end_s: float | None,
    out_wav: Path,
) -> None:
    """
    Render the audio of *video_path* to a mono 16 kHz WAV file through ffmpeg.

    Args:
        video_path: Video file to read.
        start_s:    Seek position in seconds; ``None`` adds no ``-ss`` option.
        end_s:      End position in seconds; ``None`` adds no end limit.
        out_wav:    WAV file to write (ffmpeg runs with ``-y`` and overwrites).

    Raises:
        RuntimeError: ffmpeg returned a non-zero exit code. The message holds
            the exit code and ffmpeg's decoded stderr.
    """
    import subprocess

    # The time-window options are emitted before "-i", i.e. as input options.
    window_args: list[str] = []
    if start_s is not None:
        window_args.extend(["-ss", str(start_s)])
        if end_s is not None:
            # With a seek in place the end is expressed as a duration.
            window_args.extend(["-t", str(end_s - start_s)])
    elif end_s is not None:
        window_args.extend(["-to", str(end_s)])

    command = [
        "ffmpeg", "-y", "-loglevel", "error",
        *window_args,
        "-i", str(video_path),
        "-vn",            # drop the video stream
        "-ac", "1",       # single channel
        "-ar", "16000",   # 16 kHz sample rate
        "-f", "wav",
        str(out_wav),
    ]

    completed = subprocess.run(command, capture_output=True)
    if completed.returncode == 0:
        return
    raise RuntimeError(
        f"ffmpeg failed (code {completed.returncode}):\n"
        f"{completed.stderr.decode(errors='replace')}"
    )
# ---------------------------------------------------------------------------
# Core transcription
# ---------------------------------------------------------------------------
def transcribe_video(
    video_path: Path,
    cfg: AppConfig,
    start_s: float | None = None,
    end_s: float | None = None,
    time_offset_s: float = 0.0,
) -> list[DialogueLine]:
    """
    Transcribe dialogue from *video_path* using faster-whisper.

    The audio is first extracted to a temporary mono 16 kHz WAV via
    ``_extract_audio_segment`` and then passed to ``WhisperModel.transcribe``
    with ``beam_size=5``.

    Args:
        video_path:    Path to source or trailer video.
        cfg:           Application configuration; only ``cfg.whisper`` is read
                       (model, device, compute_type, language).
        start_s:       Clip start in video-file seconds (None = beginning).
        end_s:         Clip end in video-file seconds (None = end of file).
        time_offset_s: Added to every segment's start/end timestamp, so the
                       caller can shift clip-relative times onto another
                       time base.

    Returns:
        List of DialogueLine in the order the model yields its segments.

    Raises:
        ImportError:  faster-whisper is not installed.
        RuntimeError: ffmpeg failed while extracting the audio.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as exc:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "faster-whisper not installed. Run: pip install faster-whisper"
        ) from exc

    w = cfg.whisper
    logger.info(
        "Transcribing %s [%.1f–%s] with %s on %s",
        video_path.name,
        start_s or 0.0,
        # Compare against None: an explicit end of 0.0 is not "end of file".
        f"{end_s:.1f}s" if end_s is not None else "end",
        w.model,
        w.device,
    )

    with tempfile.TemporaryDirectory() as tmp:
        wav = Path(tmp) / "audio.wav"
        _extract_audio_segment(video_path, start_s, end_s, wav)

        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
        segments, _ = model.transcribe(
            str(wav),
            # An empty language string means "not set"; pass None instead.
            language=w.language if w.language else None,
            beam_size=5,
        )

        # faster-whisper documents `segments` as a generator, so it is
        # consumed here while the temporary WAV still exists.
        lines: list[DialogueLine] = [
            DialogueLine(
                start_s=seg.start + time_offset_s,
                end_s=seg.end + time_offset_s,
                text=seg.text.strip(),
            )
            for seg in segments
        ]

    logger.info("Transcription done: %d segments.", len(lines))
    return lines
# ---------------------------------------------------------------------------
# Convenience: transcribe a whole file and return grouped by scene
# ---------------------------------------------------------------------------
def transcribe_full_movie(
    cfg: AppConfig,
) -> list[DialogueLine]:
    """
    Run the transcription over the whole source movie named in *cfg*.

    No time window and no timestamp offset are applied: this is
    ``transcribe_video`` called with ``cfg.paths.source_movie`` and defaults.
    Per the original note, the result is meant to enrich Scenes via a
    dialogue_callback passed to build_scene_index() (not visible here).
    """
    movie_path = cfg.paths.source_movie
    return transcribe_video(movie_path, cfg)
def assign_dialogue_to_scenes(
    all_dialogue: Sequence[DialogueLine],
    scenes: list["src.core.models.Scene"],  # type: ignore[name-defined]
) -> list["src.core.models.Scene"]:  # type: ignore[name-defined]
    """
    Distribute pre-transcribed DialogueLines into their respective Scenes.

    A line belongs to the scene whose half-open window
    ``[scene.start_s, scene.end_s)`` contains the line's midpoint. Lines whose
    midpoint falls into no scene are left unassigned.

    Args:
        all_dialogue: Transcript lines; each needs ``start_s`` and ``end_s``.
        scenes:       Scene dataclass instances; they are not modified.

    Returns:
        New list (same order as *scenes*) of copies made with
        ``dataclasses.replace`` whose ``dialogue`` tuple holds the matching
        lines in transcript order.
    """
    from dataclasses import replace

    # A line's midpoint does not depend on the scene, so compute it once
    # instead of once per scene. Materialising the pairs also keeps a
    # single-pass iterable from being exhausted after the first scene.
    timed_lines = [
        ((line.start_s + line.end_s) / 2.0, line) for line in all_dialogue
    ]

    enriched = [
        replace(
            scene,
            dialogue=tuple(
                line
                for midpoint, line in timed_lines
                if scene.start_s <= midpoint < scene.end_s
            ),
        )
        for scene in scenes
    ]

    total_assigned = sum(len(s.dialogue) for s in enriched)
    logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched))
    return enriched
+1
View File
@@ -0,0 +1 @@
# src.core package
+387
View File
@@ -0,0 +1,387 @@
"""
src/core/config.py — Configuration loader for AI Trailer Generator v2
Loads config.toml and exposes typed, nested dataclasses.
All CV thresholds, paths, and model settings are sourced exclusively here.
API keys are NEVER stored in config.toml; they are loaded from .env.
"""
from __future__ import annotations
import os
import tomllib
try:
from dotenv import load_dotenv as _load_dotenv
_HAS_DOTENV = True
except ImportError: # dotenv optional — falls back to existing env vars
_HAS_DOTENV = False
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal
# ---------------------------------------------------------------------------
# Leaf sections
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class PathsConfig:
    """Filesystem locations read from the ``[paths]`` table of config.toml.

    ``load_config`` keeps absolute entries as they are and resolves relative
    entries against the directory that contains the config file.
    """

    source_movie: Path        # source movie video file
    reference_trailer: Path   # reference trailer video file
    output_dir: Path          # directory for generated output
    cache_dir: Path           # cache directory (usage not visible in this module)
    proxy_dir: Path           # proxy directory (usage not visible in this module)
@dataclass(frozen=True)
class VideoConfig:
    """Values of the ``[video]`` table; all three keys are required by ``load_config``."""

    extract_fps: float   # coerced with float()
    proxy_width: int     # coerced with int(); presumably pixels — confirm against the proxy code
    proxy_height: int    # coerced with int(); presumably pixels — confirm against the proxy code
@dataclass(frozen=True)
class VibeCheckConfig:
    """Values of the ``[cv.vibe_check]`` table.

    ``load_config`` reads every key without a default, so all of them must be
    present in config.toml. How the values are applied is defined by the
    vibe-check code, which is not part of this module.
    """

    top_k_candidates: int
    hist_compare_method: int      # NOTE(review): presumably a cv2.HISTCMP_* constant — confirm
    hist_bins_hue: int
    hist_bins_saturation: int
    phash_max_distance: int
    crop_top_fraction: float
    crop_bottom_fraction: float
@dataclass(frozen=True)
class DeepScanConfig:
    """Values of the ``[cv.deep_scan]`` table.

    Only ``coarse_step_seconds``, ``match_threshold``, ``match_method`` and
    ``refine_step_seconds`` are mandatory in config.toml; ``load_config``
    supplies a default for every other field. The meaning of each value is
    defined by the matching code that consumes it, not by this module.
    """

    coarse_step_seconds: float            # required key
    match_threshold: float                # required key
    provisional_match_threshold: float
    coarse_candidate_threshold: float     # defaults to match_threshold when absent
    sequence_score_weight: float
    span_score_weight: float
    coarse_score_weight: float
    duration_score_weight: float
    duration_tie_break_score_delta: float
    min_duration_coverage: float
    continuity_seed_offsets_s: tuple[float, ...]   # stored as a tuple of floats
    scene_seed_top_k: int
    scene_seed_points_per_scene: int
    content_rerank_candidate_count: int
    skip_coarse_scan_with_weighted_seeds: bool
    max_refine_candidates: int
    match_method: int                     # required key; NOTE(review): presumably a cv2.TM_* constant — confirm
    refine_window_seconds: float
    refine_step_seconds: float            # required key
    content_align_window_seconds: float   # default search window of align_cached_match_by_content
    content_align_sample_step_s: float    # sample spacing used by align_cached_match_by_content
    content_validation_weight: float
    provisional_content_threshold: float
    start_tie_break_score_delta: float
    start_preroll_frames: int
    sequence_candidate_count: int
    sequence_min_distance_s: float
    span_sample_step_s: float
    trim_tail_frames: int
    scene_boundary_epsilon_s: float
    scoreable_luma_mean_min: float
    scoreable_luma_p90_min: float
    scoreable_contrast_min: float
@dataclass(frozen=True)
class CVConfig:
    """Container for the two sub-tables of ``[cv]``."""

    vibe_check: VibeCheckConfig   # from [cv.vibe_check]
    deep_scan: DeepScanConfig     # from [cv.deep_scan]
@dataclass(frozen=True)
class SceneDetectionConfig:
    """Values of the ``[scene_detection]`` table; both keys are required."""

    content_threshold: float      # coerced with float()
    min_scene_duration_s: float   # coerced with float(); seconds, per the field name
@dataclass(frozen=True)
class WhisperConfig:
    """Values of the ``[whisper]`` table, stored exactly as found in config.toml.

    ``load_config`` does not validate the values against the Literal types
    declared below.
    """

    model: str      # model name handed to WhisperModel by the transcriber
    language: str   # an empty string is passed on as None by the transcriber
    device: Literal["cuda", "cpu"]
    compute_type: Literal["float16", "int8", "float32"]
@dataclass(frozen=True)
class LLMConfig:
    """Values of the ``[llm]`` table plus the API key taken from the environment.

    ``load_config`` fills ``api_key`` from OPENROUTER_API_KEY (provider
    "openrouter") or OPENAI_API_KEY (provider "openai"), and falls back to
    LLM_API_KEY when that lookup yields an empty string.
    """

    provider: Literal["ollama", "openai", "openrouter"]
    base_url: str
    model: str
    timeout_seconds: int
    temperature: float
    max_tokens: int
    # Loaded from .env — NEVER committed to version control
    api_key: str = ""
@dataclass(frozen=True)
class VisionConfig:
    """Values of the optional ``[vision]`` table.

    The table may be missing entirely: ``load_config`` supplies a default for
    every field and ``enabled`` then becomes False. ``base_url``, ``model``
    and ``timeout_seconds`` default to the corresponding ``[llm]`` values.
    """

    enabled: bool
    provider: Literal["openai", "openrouter"]   # defaults to the LLM provider if that is one of these, else "openrouter"
    base_url: str
    model: str
    timeout_seconds: int
    temperature: float
    max_tokens: int
    scene_candidate_top_k: int
    max_new_descriptions_per_run: int
    max_seed_scenes: int
    seed_points_per_scene: int
    seed_score: float
    max_refine_candidates: int
    local_scan_step_s: float
    local_scan_max_points_per_scene: int
    local_scan_top_candidates: int
    local_scan_tie_break_score_delta: float
    multi_shot_cut_corr_threshold: float
    multi_shot_boundary_tolerance_s: float
    fullscan_fallback: bool
    content_threshold: float
    similarity_threshold: float
    # Resolved by load_config from the environment: the provider-specific key
    # first, then VISION_API_KEY, then LLM_API_KEY.
    api_key: str = ""
@dataclass(frozen=True)
class ExportConfig:
    """Values of the ``[export]`` table; all three keys are required."""

    fcpxml_version: str    # coerced with str()
    edl_frame_rate: float  # coerced with float()
    output_format: Literal["fcpxml", "edl", "both"]   # stored as found, not validated by load_config
# ---------------------------------------------------------------------------
# Root config — single object passed through the entire application
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class AppConfig:
    """Root configuration object assembled by ``load_config``.

    The three scalar fields come from the ``[project]`` table; every other
    field is the typed section object built from the table of the same name.
    """

    project_name: str   # [project].name
    version: str        # [project].version
    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"]   # [project].log_level, not validated on load
    paths: PathsConfig
    video: VideoConfig
    cv: CVConfig
    scene_detection: SceneDetectionConfig
    whisper: WhisperConfig
    llm: LLMConfig
    vision: VisionConfig
    export: ExportConfig
# ---------------------------------------------------------------------------
# Loader
# ---------------------------------------------------------------------------
_DEFAULT_CONFIG_PATH = Path(__file__).parents[2] / "config.toml"
_DEFAULT_ENV_PATH = Path(__file__).parents[2] / ".env"
def load_config(
    config_path: Path = _DEFAULT_CONFIG_PATH,
    env_path: Path = _DEFAULT_ENV_PATH,
) -> AppConfig:
    """
    Build the immutable AppConfig from config.toml plus environment secrets.

    API keys never come from the TOML file; they are read from the process
    environment, which is first populated from the .env file when
    python-dotenv is importable (existing variables are not overridden).

    Args:
        config_path: TOML file to parse. Defaults to <project_root>/config.toml.
        env_path:    .env file to load. Defaults to <project_root>/.env.

    Returns:
        The fully populated AppConfig.

    Raises:
        FileNotFoundError: The TOML file does not exist.
        KeyError / TypeError: A required key is missing or has the wrong type.
    """
    # The .env file goes first: the key lookups below read os.environ.
    if _HAS_DOTENV:
        _load_dotenv(dotenv_path=env_path, override=False)

    if not config_path.exists():
        raise FileNotFoundError(
            f"Config file not found: {config_path}\n"
            "Copy config.toml.example to config.toml and adjust your paths."
        )

    with config_path.open("rb") as fh:
        raw: dict = tomllib.load(fh)

    # Every table except [vision] is mandatory.
    project = raw["project"]
    paths_raw = raw["paths"]
    video_raw = raw["video"]
    cv_raw = raw["cv"]
    sd_raw = raw["scene_detection"]
    whisper_raw = raw["whisper"]
    llm_raw = raw["llm"]
    vision_raw = raw.get("vision", {})
    export_raw = raw["export"]

    # Relative entries are anchored at the config file's directory so the
    # project can be moved; absolute entries pass through untouched.
    base_dir = config_path.parent

    def _resolve(p: str) -> Path:
        candidate = Path(p)
        if candidate.is_absolute():
            return candidate
        return (base_dir / candidate).resolve()

    paths = PathsConfig(
        source_movie=_resolve(paths_raw["source_movie"]),
        reference_trailer=_resolve(paths_raw["reference_trailer"]),
        output_dir=_resolve(paths_raw["output_dir"]),
        cache_dir=_resolve(paths_raw["cache_dir"]),
        proxy_dir=_resolve(paths_raw["proxy_dir"]),
    )

    video = VideoConfig(
        extract_fps=float(video_raw["extract_fps"]),
        proxy_width=int(video_raw["proxy_width"]),
        proxy_height=int(video_raw["proxy_height"]),
    )

    vc = cv_raw["vibe_check"]
    vibe_check = VibeCheckConfig(
        top_k_candidates=int(vc["top_k_candidates"]),
        hist_compare_method=int(vc["hist_compare_method"]),
        hist_bins_hue=int(vc["hist_bins_hue"]),
        hist_bins_saturation=int(vc["hist_bins_saturation"]),
        phash_max_distance=int(vc["phash_max_distance"]),
        crop_top_fraction=float(vc["crop_top_fraction"]),
        crop_bottom_fraction=float(vc["crop_bottom_fraction"]),
    )

    ds = cv_raw["deep_scan"]

    def _ds_float(key: str, default: object) -> float:
        return float(ds.get(key, default))

    def _ds_int(key: str, default: object) -> int:
        return int(ds.get(key, default))

    deep_scan = DeepScanConfig(
        coarse_step_seconds=float(ds["coarse_step_seconds"]),
        match_threshold=float(ds["match_threshold"]),
        provisional_match_threshold=_ds_float("provisional_match_threshold", 0.45),
        coarse_candidate_threshold=_ds_float("coarse_candidate_threshold", ds["match_threshold"]),
        sequence_score_weight=_ds_float("sequence_score_weight", 0.55),
        span_score_weight=_ds_float("span_score_weight", 0.15),
        coarse_score_weight=_ds_float("coarse_score_weight", 0.10),
        duration_score_weight=_ds_float("duration_score_weight", 0.20),
        duration_tie_break_score_delta=_ds_float("duration_tie_break_score_delta", 0.03),
        min_duration_coverage=_ds_float("min_duration_coverage", 0.65),
        continuity_seed_offsets_s=tuple(
            map(
                float,
                ds.get(
                    "continuity_seed_offsets_s",
                    [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0],
                ),
            )
        ),
        scene_seed_top_k=_ds_int("scene_seed_top_k", 30),
        scene_seed_points_per_scene=_ds_int("scene_seed_points_per_scene", 6),
        content_rerank_candidate_count=_ds_int("content_rerank_candidate_count", 100),
        skip_coarse_scan_with_weighted_seeds=bool(ds.get("skip_coarse_scan_with_weighted_seeds", False)),
        max_refine_candidates=_ds_int("max_refine_candidates", 6),
        match_method=int(ds["match_method"]),
        refine_window_seconds=_ds_float("refine_window_seconds", 0.6),
        refine_step_seconds=float(ds["refine_step_seconds"]),
        content_align_window_seconds=_ds_float("content_align_window_seconds", 0.48),
        content_align_sample_step_s=_ds_float("content_align_sample_step_s", 0.28),
        content_validation_weight=_ds_float("content_validation_weight", 0.35),
        provisional_content_threshold=_ds_float("provisional_content_threshold", 0.42),
        start_tie_break_score_delta=_ds_float("start_tie_break_score_delta", 0.015),
        start_preroll_frames=_ds_int("start_preroll_frames", 0),
        sequence_candidate_count=_ds_int("sequence_candidate_count", 240),
        sequence_min_distance_s=_ds_float("sequence_min_distance_s", 1.0),
        span_sample_step_s=_ds_float("span_sample_step_s", 0.08),
        trim_tail_frames=_ds_int("trim_tail_frames", 2),
        scene_boundary_epsilon_s=_ds_float("scene_boundary_epsilon_s", 0.12),
        scoreable_luma_mean_min=_ds_float("scoreable_luma_mean_min", 24.0),
        scoreable_luma_p90_min=_ds_float("scoreable_luma_p90_min", 58.0),
        scoreable_contrast_min=_ds_float("scoreable_contrast_min", 24.0),
    )

    scene_detection = SceneDetectionConfig(
        content_threshold=float(sd_raw["content_threshold"]),
        min_scene_duration_s=float(sd_raw["min_scene_duration_s"]),
    )

    whisper = WhisperConfig(
        model=whisper_raw["model"],
        language=whisper_raw["language"],
        device=whisper_raw["device"],
        compute_type=whisper_raw["compute_type"],
    )

    # LLM key lookup, in priority order:
    #   OPENROUTER_API_KEY  when provider = openrouter
    #   OPENAI_API_KEY      when provider = openai
    #   LLM_API_KEY         universal fallback (also the only source for ollama)
    env = os.environ
    llm_provider = llm_raw["provider"]
    if llm_provider == "openrouter":
        provider_key = env.get("OPENROUTER_API_KEY", "")
    elif llm_provider == "openai":
        provider_key = env.get("OPENAI_API_KEY", "")
    else:
        provider_key = ""
    llm_api_key = provider_key or env.get("LLM_API_KEY", "")

    llm = LLMConfig(
        provider=llm_provider,
        base_url=llm_raw["base_url"],
        model=llm_raw["model"],
        timeout_seconds=int(llm_raw["timeout_seconds"]),
        temperature=float(llm_raw["temperature"]),
        max_tokens=int(llm_raw["max_tokens"]),
        api_key=llm_api_key,
    )

    # Vision inherits the LLM provider only when that provider is usable for
    # vision; otherwise it falls back to "openrouter".
    inherited_provider = (
        llm_provider if llm_provider in ("openai", "openrouter") else "openrouter"
    )
    vision_provider = vision_raw.get("provider", inherited_provider)
    vision_env_var = (
        "OPENROUTER_API_KEY" if vision_provider == "openrouter" else "OPENAI_API_KEY"
    )
    vision_api_key = (
        env.get(vision_env_var, "")
        or env.get("VISION_API_KEY", "")
        or env.get("LLM_API_KEY", "")
    )

    vget = vision_raw.get
    vision = VisionConfig(
        enabled=bool(vget("enabled", False)),
        provider=vision_provider,
        base_url=str(vget("base_url", llm.base_url)),
        model=str(vget("model", llm.model)),
        timeout_seconds=int(vget("timeout_seconds", llm.timeout_seconds)),
        temperature=float(vget("temperature", 0.0)),
        max_tokens=int(vget("max_tokens", 350)),
        scene_candidate_top_k=int(vget("scene_candidate_top_k", 8)),
        max_new_descriptions_per_run=int(vget("max_new_descriptions_per_run", 12)),
        max_seed_scenes=int(vget("max_seed_scenes", 3)),
        seed_points_per_scene=int(vget("seed_points_per_scene", 12)),
        seed_score=float(vget("seed_score", 0.88)),
        max_refine_candidates=int(vget("max_refine_candidates", 6)),
        local_scan_step_s=float(vget("local_scan_step_s", 0.12)),
        local_scan_max_points_per_scene=int(vget("local_scan_max_points_per_scene", 180)),
        local_scan_top_candidates=int(vget("local_scan_top_candidates", 18)),
        local_scan_tie_break_score_delta=float(vget("local_scan_tie_break_score_delta", 0.08)),
        multi_shot_cut_corr_threshold=float(vget("multi_shot_cut_corr_threshold", 0.20)),
        multi_shot_boundary_tolerance_s=float(vget("multi_shot_boundary_tolerance_s", 0.20)),
        fullscan_fallback=bool(vget("fullscan_fallback", False)),
        content_threshold=float(vget("content_threshold", 0.22)),
        similarity_threshold=float(vget("similarity_threshold", 0.18)),
        api_key=vision_api_key,
    )

    export = ExportConfig(
        fcpxml_version=str(export_raw["fcpxml_version"]),
        edl_frame_rate=float(export_raw["edl_frame_rate"]),
        output_format=export_raw["output_format"],
    )

    cv = CVConfig(vibe_check=vibe_check, deep_scan=deep_scan)
    return AppConfig(
        project_name=project["name"],
        version=project["version"],
        log_level=project["log_level"],
        paths=paths,
        video=video,
        cv=cv,
        scene_detection=scene_detection,
        whisper=whisper,
        llm=llm,
        vision=vision,
        export=export,
    )
+287
View File
@@ -0,0 +1,287 @@
"""
src/core/models.py — Canonical data models for AI Trailer Generator v2
Rules:
- Every model is a frozen dataclass (immutable after creation).
- All fields are strictly typed; no bare dicts or untyped lists.
- Seconds are always float; frame numbers are always int.
- Confidence scores live in [0.0, 1.0].
"""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum, auto
from pathlib import Path
from typing import Optional
# ===========================================================================
# Enumerations
# ===========================================================================
class MatchMethod(Enum):
    """CV template matching method (mirrors cv2.TM_* constants)."""

    # NOTE(review): the integer values are presumably the same numbers that
    # the `match_method` config key carries — confirm against the matcher.
    TM_SQDIFF = 0
    TM_SQDIFF_NORMED = 1
    TM_CCORR = 2
    TM_CCORR_NORMED = 3
    TM_CCOEFF = 4
    TM_CCOEFF_NORMED = 5
class BeatType(Enum):
    """Narrative role of a trailer beat (for dramaturgy / LLM use only)."""

    # Values come from auto(), so only the member names carry meaning.
    HOOK = auto()        # Opening attention grabber
    SETUP = auto()       # World / character introduction
    CONFLICT = auto()    # Inciting incident / rising tension
    CLIMAX = auto()      # Peak action / emotion
    RESOLUTION = auto()  # Cool-down / tagline
    UNKNOWN = auto()     # default of TrailerBeat.beat_type
class ExportFormat(Enum):
    """Export target selection.

    The member values are the same three strings that
    ``ExportConfig.output_format`` accepts.
    """

    FCPXML = "fcpxml"
    EDL = "edl"
    BOTH = "both"
# ===========================================================================
# Phase 0 — Source-movie scene index
# ===========================================================================
@dataclass(frozen=True)
class DialogueLine:
    """One timed utterance of a transcript (immutable)."""

    start_s: float                  # where the line begins, in seconds
    end_s: float                    # where the line ends, in seconds
    text: str                       # transcribed wording
    speaker: Optional[str] = None   # speaker label, when one is known

    @property
    def duration_s(self) -> float:
        """Length of the line in seconds: ``end_s`` minus ``start_s``."""
        length = self.end_s - self.start_s
        return length
@dataclass(frozen=True)
class Scene:
    """
    One detected scene in the source movie.

    Per the original note, scenes are produced by PySceneDetect and later
    enriched with Whisper dialogue and (optionally) perceptual hashes during
    the Vibe Check phase. Instances are immutable; enrichment creates copies.
    """

    scene_id: int        # zero-based index in source movie
    source_path: Path    # absolute path to the source video file
    start_s: float       # scene start in seconds
    end_s: float         # scene end in seconds
    start_frame: int     # first frame number
    end_frame: int       # last frame number

    # Populated after Vibe Check fingerprinting.
    # NOTE(review): the histograms are described as pickled arrays; only
    # unpickle data this application wrote itself.
    luma_hist: Optional[bytes] = None   # serialised np.ndarray (pickle)
    sat_hist: Optional[bytes] = None
    phash: Optional[str] = None         # 64-bit hex string

    # Populated after Whisper pass
    dialogue: tuple[DialogueLine, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Scene length in seconds (``end_s - start_s``)."""
        return self.end_s - self.start_s

    @property
    def midpoint_s(self) -> float:
        """Time halfway between ``start_s`` and ``end_s``, in seconds."""
        return self.start_s + self.duration_s / 2.0

    def __repr__(self) -> str:
        # Start and end are joined by an en dash; without a separator the two
        # numbers ran together ("1.00s3.50s").
        return (
            f"Scene(id={self.scene_id}, "
            f"{self.start_s:.2f}s–{self.end_s:.2f}s, "
            f"dur={self.duration_s:.2f}s)"
        )
# ===========================================================================
# Phase 1 — Reference-trailer beat
# ===========================================================================
@dataclass(frozen=True)
class TrailerBeat:
    """
    One cut / segment in the reference trailer.

    The 'beat' is the atomic unit of a trailer: it maps exactly to one
    clip that will later be sourced from the original movie.
    """

    beat_id: int
    trailer_path: Path   # video file the beat was cut from
    start_s: float       # beat start in trailer seconds
    end_s: float         # beat end in trailer seconds
    start_frame: int
    end_frame: int
    beat_type: BeatType = BeatType.UNKNOWN   # set by LLM dramaturgy pass

    # Visual fingerprints of the *middle* frame (populated by CV pipeline)
    luma_hist: Optional[bytes] = None
    sat_hist: Optional[bytes] = None
    phash: Optional[str] = None

    # Dialogue extracted from this beat
    dialogue: tuple[DialogueLine, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Beat length in seconds (``end_s - start_s``)."""
        return self.end_s - self.start_s

    @property
    def midpoint_s(self) -> float:
        """Time halfway between ``start_s`` and ``end_s``, in seconds."""
        return self.start_s + self.duration_s / 2.0

    def __repr__(self) -> str:
        # Start and end are joined by an en dash; without a separator the two
        # numbers ran together ("1.00s3.50s").
        return (
            f"TrailerBeat(id={self.beat_id}, "
            f"{self.beat_type.name}, "
            f"{self.start_s:.2f}s–{self.end_s:.2f}s)"
        )
# ===========================================================================
# Phase 2 — CV match result
# ===========================================================================
@dataclass(frozen=True)
class VibeHit:
    """
    Intermediate result from Phase 1 (Vibe Check histogram/pHash).
    Represents a *candidate* scene that passed the coarse filter.
    Not yet a confirmed match; forwarded to Deep Scan.
    """

    beat_id: int            # id of the trailer beat this candidate belongs to
    scene_id: int           # id of the candidate source scene
    hist_score: float       # histogram similarity [0.0, 1.0] (CORREL method)
    phash_distance: int     # Hamming distance [0, 64]; lower = more similar
    combined_score: float   # weighted aggregate used for ranking
@dataclass(frozen=True)
class MatchSegment:
    """
    One source-backed visual island inside a trailer beat.
    Some trailer beats contain multiple shots separated by fades/title frames.
    A single continuous source in/out cannot represent those beats accurately.
    """

    trailer_offset_s: float     # NOTE(review): presumably measured from the beat start — confirm with the producer
    duration_s: float           # segment length in seconds
    scene_id: int               # source scene the segment was matched to
    in_point_s: float           # source in-point in seconds
    out_point_s: float          # source out-point in seconds
    match_score: float          # score of this segment's match
    is_confirmed: bool = True   # confirmed unless the creator passes False
@dataclass(frozen=True)
class MatchResult:
    """
    Outcome of Phase 2 (Deep Scan template matching) for one trailer beat.

    Each TrailerBeat gets one MatchResult describing the best frame-accurate
    position found for it inside the source movie.
    """

    beat_id: int        # trailer beat that was matched
    scene_id: int       # source scene that holds the match
    source_path: Path   # absolute path to the source video

    # Position of the match in the SOURCE movie
    in_point_s: float     # onset of the matched frame, in source seconds
    out_point_s: float    # computed out-point (in_point + beat duration)
    in_point_frame: int   # frame number of the matched frame in the source

    # Quality of the match
    match_score: float    # cv2.matchTemplate peak value [0.0, 1.0]
    # (x, y) pixel position of the best match inside the source frame
    match_location: tuple[int, int] = field(default_factory=lambda: (0, 0))

    # Where the result came from
    vibe_hit: Optional[VibeHit] = None   # candidate that led to this match
    is_confirmed: bool = True
    segments: tuple[MatchSegment, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Matched source span in seconds (``out_point_s - in_point_s``)."""
        start, stop = self.in_point_s, self.out_point_s
        return stop - start

    def __repr__(self) -> str:
        link = f"beat={self.beat_id} → scene={self.scene_id}"
        timing = f"in={self.in_point_s:.3f}s"
        quality = f"score={self.match_score:.3f}"
        return f"MatchResult({link}, {timing}, {quality})"
# ===========================================================================
# Phase 3 — Edit timeline (pre-export)
# ===========================================================================
@dataclass(frozen=True)
class EditClip:
    """
    One clip on the final edit timeline, ready for FCPXML / EDL export.

    Combines beat dramaturgy + the CV-confirmed source in/out points.
    """

    clip_index: int      # position on the timeline (0-based)
    beat: TrailerBeat
    match: MatchResult

    # Timeline position (in the OUTPUT trailer)
    timeline_start_s: float
    timeline_end_s: float
    # Explicit source length; None means "use the timeline length"
    source_duration_s: float | None = None
    trailer_tail_s: float = 0.0

    # Optional audio override (e.g. VO or music)
    audio_path: Optional[Path] = None
    audio_offset_s: float = 0.0

    @property
    def timeline_duration_s(self) -> float:
        """Length of the clip on the output timeline, in seconds."""
        return self.timeline_end_s - self.timeline_start_s

    @property
    def source_timeline_duration_s(self) -> float:
        """
        Source length to place on the timeline, in seconds.

        Uses ``source_duration_s`` when it is set (negative values are clamped
        to 0.0), otherwise falls back to ``timeline_duration_s``.
        """
        if self.source_duration_s is not None:
            return max(0.0, self.source_duration_s)
        return self.timeline_duration_s

    def __repr__(self) -> str:
        # Start and end are joined by an en dash; without a separator the two
        # numbers ran together ("1.00s3.00s").
        return (
            f"EditClip(#{self.clip_index}, "
            f"tl={self.timeline_start_s:.2f}s–{self.timeline_end_s:.2f}s, "
            f"src={self.match.in_point_s:.3f}s)"
        )
@dataclass(frozen=True)
class EditTimeline:
    """
    Ordered collection of EditClips that makes up the finished trailer.

    This is the object handed to the export layer (FCPXML / EDL writer).
    """

    title: str
    frame_rate: float             # e.g. 23.976
    clips: tuple[EditClip, ...]   # ordered by clip_index

    @property
    def total_duration_s(self) -> float:
        """Latest ``timeline_end_s`` among all clips; 0.0 for an empty timeline."""
        return max((clip.timeline_end_s for clip in self.clips), default=0.0)

    @property
    def clip_count(self) -> int:
        """Number of clips on the timeline."""
        return len(self.clips)
+1
View File
@@ -0,0 +1 @@
# src.cv package — Computer Vision engine
+240
View File
@@ -0,0 +1,240 @@
from __future__ import annotations
import math
import shutil
import subprocess
from pathlib import Path
import numpy as np
from PIL import Image, ImageFilter, ImageOps
from src.core.config import AppConfig
from src.core.models import TrailerBeat
def _run(cmd: list[str]) -> None:
    """
    Run *cmd* as a subprocess and raise if it fails.

    Args:
        cmd: Program and arguments as a list (no shell is involved).

    Raises:
        RuntimeError: The process exited with a non-zero code. The message
            names the program and exit code, followed by the decoded stderr,
            so a failure that prints nothing is still identifiable.
    """
    result = subprocess.run(cmd, capture_output=True)
    if result.returncode != 0:
        raise RuntimeError(
            f"{cmd[0]} failed (code {result.returncode}):\n"
            f"{result.stderr.decode(errors='replace')}"
        )
def _extract_frames(
    video_path: Path,
    start_s: float,
    duration_s: float,
    fps: float,
    out_dir: Path,
    prefix: str,
) -> None:
    """
    Dump a stretch of *video_path* as numbered 640x360 PNG frames via ffmpeg.

    Frames are written to ``out_dir/<prefix>_NNNN.png`` at *fps* frames per
    second. The seek position is clamped to be non-negative and the duration
    to at least 0.04 seconds. *out_dir* is created when missing.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    seek_s = max(0.0, start_s)
    length_s = max(0.04, duration_s)
    frame_pattern = out_dir / f"{prefix}_%04d.png"

    command = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-ss", str(seek_s),
        "-i", str(video_path),
        "-t", str(length_s),
        "-vf", f"scale=640:360,fps={fps}",
        str(frame_pattern),
    ]
    _run(command)
def _cropped_image(path: Path, cfg: AppConfig) -> Image.Image:
    """
    Load *path* as greyscale, strip dark borders and cut 5 % off top and bottom.

    *cfg* is accepted for signature parity with the other feature helpers and
    is not read here.
    """
    grey = _trim_dark_borders(Image.open(path).convert("L"))
    width, height = grey.size
    # Only a thin band is removed on purpose: final validation should see the
    # whole composition. The wider text-safe crop of the coarse search can cut
    # away bodies, furniture and lower-frame cues that tell similar
    # face/window shots apart.
    upper = int(height * 0.05)
    lower = int(height * 0.95)
    return grey.crop((0, upper, width, lower))
def _trim_dark_borders(image: Image.Image) -> Image.Image:
    """
    Crop away black matte / pillarbox borders ahead of content scoring.

    A column or row counts as active when the 90th percentile of its luma
    exceeds 18. An axis is only trimmed when enough of it is active (at least
    8 lines and 35 % of the extent); the kept span gets a small margin
    (2 lines before, 3 after). When the resulting box would be narrower or
    shorter than 35 % of the input, the input image is returned unchanged.
    """
    luma = np.asarray(image.convert("L"), dtype=np.float32)
    if luma.size == 0:
        return image
    height, width = luma.shape[:2]

    def _active_span(signal: np.ndarray, extent: int) -> tuple[int, int]:
        # Returns the (start, stop) to keep along one axis.
        active = np.where(signal > 18.0)[0]
        if active.size < max(8, int(extent * 0.35)):
            return 0, extent
        return max(0, int(active[0]) - 2), min(extent, int(active[-1]) + 3)

    left, right = _active_span(np.percentile(luma, 90, axis=0), width)
    top, bottom = _active_span(np.percentile(luma, 90, axis=1), height)

    too_narrow = right - left < int(width * 0.35)
    too_short = bottom - top < int(height * 0.35)
    if too_narrow or too_short:
        return image
    return image.crop((left, top, right, bottom))
def _feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """
    Build the normalised edge map used to compare frames.

    The cropped greyscale frame loses a further 10 % on every side, is
    histogram-equalised, edge-filtered and resized to 160x62. The result is
    shifted to zero mean and scaled by its standard deviation.
    """
    base = _cropped_image(path, cfg)
    width, height = base.size
    inner_box = (
        int(width * 0.10),
        int(height * 0.10),
        int(width * 0.90),
        int(height * 0.90),
    )
    inner = base.crop(inner_box)
    equalised = ImageOps.equalize(inner)
    edges = equalised.filter(ImageFilter.FIND_EDGES)
    small = edges.resize((160, 62))
    values = np.asarray(small, dtype=np.float32)
    # The small epsilon avoids a division by zero on a flat image.
    return (values - values.mean()) / (values.std() + 1e-6)
def _luma_feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """
    Build the normalised brightness map used to compare frames.

    The cropped greyscale frame is histogram-equalised and resized to 160x80,
    then shifted to zero mean and scaled by its standard deviation.
    """
    cropped = _cropped_image(path, cfg)
    small = ImageOps.equalize(cropped).resize((160, 80))
    values = np.asarray(small, dtype=np.float32)
    # The small epsilon avoids a division by zero on a flat image.
    return (values - values.mean()) / (values.std() + 1e-6)
def _hist_feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """
    Build a global colour histogram: 32 bins for each RGB channel.

    The frame is border-trimmed, loses 5 % at top and bottom and is resized
    to 160x80. Each channel histogram is normalised to sum to one and the
    three are concatenated. *cfg* is not read here.
    """
    rgb = _trim_dark_borders(Image.open(path).convert("RGB"))
    width, height = rgb.size
    box = (0, int(height * 0.05), width, int(height * 0.95))
    pixels = np.asarray(rgb.crop(box).resize((160, 80)), dtype=np.float32)

    def _normalised_hist(channel: int) -> np.ndarray:
        counts, _ = np.histogram(pixels[:, :, channel], bins=32, range=(0, 255))
        counts = counts.astype(np.float32)
        return counts / (counts.sum() + 1e-6)

    return np.concatenate([_normalised_hist(channel) for channel in range(3)])
def _spatial_hist_feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """
    Build a colour histogram that keeps coarse layout: a 4x4 grid of tiles.

    The frame is prepared like the global histogram (border trim, 5 % off top
    and bottom, resize to 160x80). Every tile contributes one normalised
    16-bin histogram per RGB channel, concatenated row by row. *cfg* is not
    read here.
    """
    rgb = _trim_dark_borders(Image.open(path).convert("RGB"))
    width, height = rgb.size
    box = (0, int(height * 0.05), width, int(height * 0.95))
    pixels = np.asarray(rgb.crop(box).resize((160, 80)), dtype=np.float32)

    rows, cols = 4, 4
    tile_h = pixels.shape[0] // rows
    tile_w = pixels.shape[1] // cols

    histograms = []
    for row in range(rows):
        y_slice = slice(row * tile_h, (row + 1) * tile_h)
        for col in range(cols):
            tile = pixels[y_slice, col * tile_w:(col + 1) * tile_w, :]
            for channel in range(3):
                counts, _ = np.histogram(tile[:, :, channel], bins=16, range=(0, 255))
                counts = counts.astype(np.float32)
                histograms.append(counts / (counts.sum() + 1e-6))
    return np.concatenate(histograms)
def _is_dark(path: Path, cfg: AppConfig) -> bool:
    """
    Tell whether the frame at *path* is too dark to carry usable content.

    After border trimming and removing 5 % at top and bottom, a frame is dark
    when its mean luma is below 28 and its 90th-percentile luma is below 58.
    *cfg* is not read here.
    """
    grey = _trim_dark_borders(Image.open(path).convert("L"))
    width, height = grey.size
    box = (0, int(height * 0.05), width, int(height * 0.95))
    luma = np.asarray(grey.crop(box), dtype=np.float32)

    if not (float(luma.mean()) < 28.0):
        return False
    return float(np.percentile(luma, 90)) < 58.0
def _corr(a: np.ndarray, b: np.ndarray) -> float:
    """Return the mean of the element-wise product of *a* and *b* as a float."""
    product = a * b
    return float(np.mean(product))
def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float:
    """
    Return the overlap ratio of two histograms.

    The ratio is the sum of element-wise minima divided by the sum of
    element-wise maxima; a small epsilon keeps the division defined when both
    inputs are all zero.
    """
    overlap = np.minimum(a, b).sum()
    union = np.maximum(a, b).sum()
    return float(overlap / (union + 1e-6))
def _frame_features(
    path: Path,
    cfg: AppConfig,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Compute the four descriptors of one frame: edge, luma, histogram, spatial histogram."""
    return (
        _feature(path, cfg),
        _luma_feature(path, cfg),
        _hist_feature(path, cfg),
        _spatial_hist_feature(path, cfg),
    )


def _score_feature_sets(
    ref: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
    src: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
) -> float:
    """Blend the four descriptor similarities of two frames into one score."""
    edge_score = _corr(ref[0], src[0])
    luma_score = _corr(ref[1], src[1])
    hist_score = _hist_intersection(ref[2], src[2])
    spatial_score = _hist_intersection(ref[3], src[3])
    return (
        edge_score * 0.24
        + luma_score * 0.24
        + hist_score * 0.14
        + spatial_score * 0.38
    )


def _paired_frame_score(ref_path: Path, src_path: Path, cfg: AppConfig) -> float:
    """
    Score how well the frame at *src_path* matches the frame at *ref_path*.

    The score is a weighted blend: edge correlation 0.24, luma correlation
    0.24, global histogram overlap 0.14, spatial histogram overlap 0.38.
    """
    return _score_feature_sets(
        _frame_features(ref_path, cfg),
        _frame_features(src_path, cfg),
    )


def align_cached_match_by_content(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
    fps: float = 25.0,
) -> tuple[float, float]:
    """
    Measure the local source offset directly from rendered frame content.

    This is intentionally independent from the global OpenCV matcher: it only
    needs FFmpeg, Pillow and numpy, and it scans a small window around an
    already plausible candidate.

    Frames of the beat and of the surrounding source stretch are rendered to
    a temporary directory below ``cfg.paths.output_dir`` (removed again on
    exit). Sampled trailer frames are then compared against every possible
    source start frame.

    Args:
        beat:                 Trailer beat to align.
        estimated_in_point_s: Candidate source in-point in seconds.
        cfg:                  Application configuration.
        search_window_s:      Seconds scanned on each side of the estimate;
                              ``None`` uses
                              ``cfg.cv.deep_scan.content_align_window_seconds``.
        fps:                  Frame rate used for rendering and indexing.

    Returns:
        ``(in_point_s, score)``. When no frames could be rendered the result
        is the unchanged estimate with score 0.0. When no candidate could be
        scored, the score is 0.0 and the in-point is the estimate rounded to
        the frame grid of the rendered source stretch.

    Raises:
        RuntimeError: ffmpeg failed while rendering frames.
    """
    window_s = (
        search_window_s
        if search_window_s is not None
        else cfg.cv.deep_scan.content_align_window_seconds
    )
    sample_step_s = max(1.0 / fps, cfg.cv.deep_scan.content_align_sample_step_s)
    source_start_s = max(0.0, estimated_in_point_s - window_s)
    source_duration_s = beat.duration_s + (2.0 * window_s) + 0.5

    tmp = cfg.paths.output_dir / "align_tmp" / f"beat_{beat.beat_id:03d}"
    shutil.rmtree(tmp, ignore_errors=True)
    tmp.mkdir(parents=True, exist_ok=True)
    try:
        ref_dir = tmp / "ref"
        src_dir = tmp / "src"
        _extract_frames(beat.trailer_path, beat.start_s, beat.duration_s, fps, ref_dir, "ref")
        _extract_frames(cfg.paths.source_movie, source_start_s, source_duration_s, fps, src_dir, "src")
        ref_frames = sorted(ref_dir.glob("ref_*.png"))
        src_frames = sorted(src_dir.glob("src_*.png"))
        if not ref_frames or not src_frames:
            return estimated_in_point_s, 0.0

        sample_frame_step = max(1, int(round(sample_step_s * fps)))
        # The last ~0.24 s of the beat is never sampled.
        min_matchable_frames = max(1, len(ref_frames) - int(round(0.24 * fps)))

        # Prefer non-dark sample frames; fall back to every sample position
        # when fewer than three usable ones remain.
        all_offsets = list(range(0, min_matchable_frames, sample_frame_step))
        template_offsets = [
            idx for idx in all_offsets if not _is_dark(ref_frames[idx], cfg)
        ]
        if len(template_offsets) < 3:
            template_offsets = all_offsets

        # Each frame takes part in many candidate/offset pairs. Its
        # descriptors depend only on the file, so they are computed once and
        # reused instead of re-opening the PNG for every pair.
        feature_cache: dict[Path, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]] = {}

        def _cached_features(path: Path) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
            features = feature_cache.get(path)
            if features is None:
                features = _frame_features(path, cfg)
                feature_cache[path] = features
            return features

        # NOTE(review): with fewer than three sample offsets (very short
        # beats) this minimum cannot be reached and the function reports
        # score 0.0 — confirm this is intended.
        required_scores = max(3, math.ceil(len(template_offsets) * 0.65))

        search_start_frame = 0
        search_end_frame = max(0, len(src_frames) - min_matchable_frames)
        estimated_frame = int(round((estimated_in_point_s - source_start_s) * fps))
        best_frame = estimated_frame
        best_score = -1.0
        for candidate_frame in range(search_start_frame, search_end_frame + 1):
            scores: list[float] = []
            for offset_frame in template_offsets:
                src_idx = candidate_frame + offset_frame
                if src_idx < 0 or src_idx >= len(src_frames):
                    break
                scores.append(
                    _score_feature_sets(
                        _cached_features(ref_frames[offset_frame]),
                        _cached_features(src_frames[src_idx]),
                    )
                )
            if len(scores) < required_scores:
                continue
            avg_score = sum(scores) / len(scores)
            min_score = min(scores)
            # The worst sample is weighted in so one bad frame pulls the
            # candidate down.
            score = (avg_score * 0.68) + (min_score * 0.32)
            if score > best_score + 0.003:
                best_score = score
                best_frame = candidate_frame
            elif score >= best_score - 0.003 and abs(candidate_frame - estimated_frame) < abs(best_frame - estimated_frame):
                # Near-tie: prefer the candidate closer to the estimate and
                # keep the recorded best score.
                best_frame = candidate_frame

        return source_start_s + (best_frame / fps), max(0.0, best_score)
    finally:
        shutil.rmtree(tmp, ignore_errors=True)
+253
View File
@@ -0,0 +1,253 @@
"""
src/cv/deep_scan.py Phase 2: Frame-accurate template matching (Deep Scan)
Responsibility:
Given a TrailerBeat and a ranked list of VibeHit candidates, open the
source video and scan each candidate scene in two passes:
1. Coarse pass: step through at coarse_step_seconds intervals,
comparing via cv2.matchTemplate.
2. Refine pass: if coarse score > threshold, zoom in ± refine_window_seconds
at refine_step_seconds resolution to pin the exact in-point.
Returns a MatchResult if a confident hit is found, otherwise None.
"""
from __future__ import annotations
import logging
from pathlib import Path
from typing import Sequence
import cv2
import numpy as np
from src.core.config import AppConfig
from src.core.models import MatchResult, Scene, TrailerBeat, VibeHit
from src.cv.fingerprinting import text_safe_crop
from src.cv.frame_extractor import (
grab_frame_at,
grab_frame_at_path,
iter_frames_stepped,
open_video,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Template preparation
# ---------------------------------------------------------------------------
def _prepare_template(
    trailer_beat: TrailerBeat,
    cfg: AppConfig,
    proxy_w: int,
    proxy_h: int,
) -> np.ndarray | None:
    """
    Build the cv2.matchTemplate "needle" for one trailer beat.

    The beat's midpoint frame is decoded, text-safe cropped, resized to the
    proxy resolution and finally trimmed by 10% on every side.

    Args:
        trailer_beat: Beat providing ``trailer_path``, ``midpoint_s`` and
            ``beat_id``.
        cfg: Application configuration; only the vibe-check crop fractions
            are read here.
        proxy_w: Proxy frame width in pixels.
        proxy_h: Proxy frame height in pixels.

    Returns:
        The trimmed template image, or None when the midpoint frame cannot
        be decoded.
    """
    vc = cfg.cv.vibe_check

    beat_frame = grab_frame_at_path(
        trailer_beat.trailer_path,
        trailer_beat.midpoint_s,
    )
    if beat_frame is None:
        logger.warning("Beat %d: cannot decode midpoint frame.", trailer_beat.beat_id)
        return None

    cropped = text_safe_crop(beat_frame, vc.crop_top_fraction, vc.crop_bottom_fraction)
    resized = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA)

    # Trim 10% on all sides so the template can slide inside the full proxy
    # frame (translation tolerance for slight pan/scan shifts in the source).
    margin_y = int(proxy_h * 0.10)
    margin_x = int(proxy_w * 0.10)
    return resized[margin_y : proxy_h - margin_y, margin_x : proxy_w - margin_x]
# ---------------------------------------------------------------------------
# Single-frame match
# ---------------------------------------------------------------------------
def _match_frame(
    source_frame: np.ndarray,
    template: np.ndarray,
    method: int,
    proxy_w: int,
    proxy_h: int,
    crop_top: float,
    crop_bottom: float,
) -> tuple[float, tuple[int, int]]:
    """
    Run cv2.matchTemplate between *source_frame* and *template*.

    The source frame is text-safe cropped and resized to the proxy
    resolution before the (slightly smaller) template is slid across it.

    Args:
        source_frame: Frame from the source movie.
        template: Needle image, smaller than the proxy frame.
        method: cv2.TM_* matching method.
        proxy_w: Proxy frame width in pixels.
        proxy_h: Proxy frame height in pixels.
        crop_top: Fraction of height removed from the top.
        crop_bottom: Fraction of height removed from the bottom.

    Returns:
        ``(score, (x, y))`` where a higher score always means a better match.
        For TM_CCOEFF_NORMED the score lies in [-1, 1]. For the
        squared-difference methods the best match is the minimum of the
        response map, so the location of the minimum is returned and the
        score is ``1 - min`` (TM_SQDIFF_NORMED) or ``-min`` (TM_SQDIFF).
        NOTE(review): the unnormalised TM_SQDIFF score is not comparable to
        a [0, 1] threshold; prefer a normalised method.
    """
    cropped = text_safe_crop(source_frame, crop_top, crop_bottom)
    haystack = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA)

    # Match the slightly smaller template inside the full proxy frame.
    result = cv2.matchTemplate(haystack, template, method)
    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)

    if method == cv2.TM_SQDIFF_NORMED:
        score, best_loc = 1.0 - float(min_val), min_loc
    elif method == cv2.TM_SQDIFF:
        score, best_loc = -float(min_val), min_loc
    else:
        score, best_loc = float(max_val), max_loc

    return score, (int(best_loc[0]), int(best_loc[1]))
# ---------------------------------------------------------------------------
# Deep Scan core
# ---------------------------------------------------------------------------
def scan_scene(
    beat: TrailerBeat,
    scene: Scene,
    template: np.ndarray,
    cfg: AppConfig,
) -> tuple[float, float, tuple[int, int]] | None:
    """
    Scan one source scene in two passes (coarse, then refine).

    Args:
        beat: Trailer beat being matched (used for logging only).
        scene: Source scene to scan.
        template: Needle image from ``_prepare_template``.
        cfg: Application configuration.

    Returns:
        ``(best_timestamp_s, best_score, best_location)``, or None when the
        coarse pass stays below ``match_threshold``.
    """
    deep = cfg.cv.deep_scan
    vibe = cfg.cv.vibe_check
    width = cfg.video.proxy_width
    height = cfg.video.proxy_height

    def _sweep(cap, start_s, end_s, step_s, top_t, top_score, top_loc):
        """Step through [start_s, end_s] and keep the strictly best hit."""
        for t, frame in iter_frames_stepped(cap, start_s, end_s, step_s):
            score, loc = _match_frame(
                frame, template, deep.match_method,
                width, height, vibe.crop_top_fraction, vibe.crop_bottom_fraction,
            )
            if score > top_score:
                top_t, top_score, top_loc = t, score, loc
        return top_t, top_score, top_loc

    with open_video(scene.source_path) as cap:
        # Coarse pass over the whole scene.
        best_t, best_score, best_loc = _sweep(
            cap, scene.start_s, scene.end_s, deep.coarse_step_seconds,
            scene.start_s, 0.0, (0, 0),
        )
        if best_score < deep.match_threshold:
            return None  # scene doesn't contain a match worth refining

        # Refine pass around the coarse winner, clamped to the scene bounds
        # and seeded with the coarse result.
        refine_start = max(scene.start_s, best_t - deep.refine_window_seconds)
        refine_end = min(scene.end_s, best_t + deep.refine_window_seconds)
        refined_t, refined_score, refined_loc = _sweep(
            cap, refine_start, refine_end, deep.refine_step_seconds,
            best_t, best_score, best_loc,
        )

    logger.debug(
        "Beat %d → Scene %d: coarse=%.3f refined=%.3f @%.3fs",
        beat.beat_id, scene.scene_id, best_score, refined_score, refined_t,
    )
    return refined_t, refined_score, refined_loc
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def run_deep_scan(
    beat: TrailerBeat,
    candidates: Sequence[VibeHit],
    scenes_by_id: dict[int, Scene],
    cfg: AppConfig,
) -> MatchResult | None:
    """
    Phase 2 Deep Scan: iterate over Vibe Check candidates and template-match.

    Args:
        beat: The trailer beat to source.
        candidates: Ranked VibeHit list from Phase 1 (best first).
        scenes_by_id: Lookup dict: scene_id -> Scene.
        cfg: Application configuration.

    Returns:
        The best-scoring MatchResult among the scanned candidates, or None
        if the template could not be built or no scene produced a hit.
    """
    # Imported once per call instead of once per hit inside the loop.
    from src.cv.frame_extractor import get_video_info

    proxy_w = cfg.video.proxy_width
    proxy_h = cfg.video.proxy_height

    template = _prepare_template(beat, cfg, proxy_w, proxy_h)
    if template is None:
        return None

    # FPS per source file, so each video is probed at most once.
    fps_by_source: dict[Path, float] = {}
    best_result: MatchResult | None = None

    for vibe_hit in candidates:
        scene = scenes_by_id.get(vibe_hit.scene_id)
        if scene is None:
            logger.warning("VibeHit references unknown scene_id=%d", vibe_hit.scene_id)
            continue

        hit = scan_scene(beat, scene, template, cfg)
        if hit is None:
            continue

        in_point_s, match_score, match_loc = hit

        fps = fps_by_source.get(scene.source_path)
        if fps is None:
            info = get_video_info(scene.source_path)
            fps = float(info["fps"]) or 24.0  # fall back when FPS is reported as 0
            fps_by_source[scene.source_path] = fps

        # Frame number: approximate via FPS (refined later if needed).
        in_point_frame = int(in_point_s * fps)

        candidate_result = MatchResult(
            beat_id=beat.beat_id,
            scene_id=scene.scene_id,
            source_path=scene.source_path,
            in_point_s=in_point_s,
            out_point_s=in_point_s + beat.duration_s,
            in_point_frame=in_point_frame,
            match_score=match_score,
            match_location=match_loc,
            vibe_hit=vibe_hit,
        )

        if best_result is None or match_score > best_result.match_score:
            best_result = candidate_result

        # Early exit: if score is very high, no need to check other candidates.
        if match_score >= 0.90:
            logger.info(
                "Beat %d: early-exit match (score=%.3f) in scene %d @%.3fs",
                beat.beat_id, match_score, scene.scene_id, in_point_s,
            )
            break

    if best_result is not None:
        logger.info("Beat %d → MATCH scene=%d score=%.3f in=%.3fs",
                    beat.beat_id, best_result.scene_id,
                    best_result.match_score, best_result.in_point_s)
    else:
        logger.warning("Beat %d → NO MATCH found in %d candidates.",
                       beat.beat_id, len(candidates))

    return best_result
+228
View File
@@ -0,0 +1,228 @@
"""
src/cv/fingerprinting.py Image fingerprinting for the Vibe Check phase
Responsibilities (Single Responsibility Principle):
- Text-Safe Crop: strip top/bottom fractions to hide logos & letterbox
- Luma + Saturation histogram extraction (scale-invariant)
- Perceptual hash (pHash) via imagehash
This module is PURELY functional — no file I/O, no video decoding,
no search logic. It takes numpy arrays and returns numeric descriptors.
"""
from __future__ import annotations
import pickle
from typing import TYPE_CHECKING
import cv2
import numpy as np
try:
import imagehash
from PIL import Image as PilImage
_HAS_IMAGEHASH = True
except ImportError:
_HAS_IMAGEHASH = False
if TYPE_CHECKING:
from src.core.config import VibeCheckConfig
# ---------------------------------------------------------------------------
# Text-Safe Crop
# ---------------------------------------------------------------------------
def text_safe_crop(
    frame: np.ndarray,
    crop_top: float,
    crop_bottom: float,
) -> np.ndarray:
    """
    Slice away the top and bottom fractions of a frame.

    Used ahead of colour analysis so that title cards / logos (top) and
    letterbox / subtitles (bottom) do not influence the fingerprint.

    Args:
        frame: BGR or greyscale frame as (H, W[, C]) ndarray.
        crop_top: Fraction [0, 1) of height to remove from the top.
        crop_bottom: Fraction [0, 1) of height to remove from the bottom.

    Returns:
        A row slice of *frame* (basic slicing, so no pixel data is copied).

    Raises:
        ValueError: If a fraction is outside [0, 1) or both together
            cover the whole frame.
    """
    top_in_range = 0.0 <= crop_top < 1.0
    if not top_in_range:
        raise ValueError(f"crop_top must be in [0, 1); got {crop_top}")
    bottom_in_range = 0.0 <= crop_bottom < 1.0
    if not bottom_in_range:
        raise ValueError(f"crop_bottom must be in [0, 1); got {crop_bottom}")
    if crop_top + crop_bottom >= 1.0:
        raise ValueError(
            f"crop_top ({crop_top}) + crop_bottom ({crop_bottom}) must be < 1.0"
        )

    height = frame.shape[0]
    rows = slice(int(height * crop_top), int(height * (1.0 - crop_bottom)))
    return frame[rows]
# ---------------------------------------------------------------------------
# Histogram extraction
# ---------------------------------------------------------------------------
def extract_hs_histograms(
    frame_bgr: np.ndarray,
    bins_luma: int | None = None,
    bins_sat: int | None = None,
    *,
    bins_hue: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute normalised luma and saturation histograms from a BGR frame.

    Luma is taken from the grayscale conversion (not HSV Value, which would
    make saturated red and blue look identical); saturation is HSV channel 1.
    Hue is ignored because it is highly sensitive to colour-grading
    differences between trailer and source movie.

    Args:
        frame_bgr: BGR frame (H, W, 3) uint8.
        bins_luma: Number of bins for the luma histogram over [0, 256).
        bins_sat: Number of bins for the saturation histogram over [0, 256).
        bins_hue: Backwards-compatible alias for *bins_luma*.

    Returns:
        ``(luma_hist, sat_hist)``, each a 1-D float32 ndarray, L2-normalised.

    Raises:
        ValueError: If *bins_luma* and *bins_hue* are both given and differ.
        TypeError: If the luma bin count or *bins_sat* is missing.
    """
    if bins_luma is not None:
        if bins_hue is not None and bins_hue != bins_luma:
            raise ValueError("bins_hue is an alias for bins_luma; pass only one value")
    else:
        bins_luma = bins_hue
    if bins_luma is None or bins_sat is None:
        raise TypeError("bins_luma/bins_hue and bins_sat are required")

    def _unit_hist(image: np.ndarray, channel: int, bins: int) -> np.ndarray:
        """Histogram of one channel, flattened to float32 and L2-normalised."""
        hist = cv2.calcHist(
            [image], [channel], None, [bins], [0, 256]
        ).flatten().astype(np.float32)
        # L2-normalise so frame size doesn't affect scores.
        cv2.normalize(hist, hist, alpha=1.0, norm_type=cv2.NORM_L2)
        return hist

    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    hsv = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV)
    return _unit_hist(gray, 0, bins_luma), _unit_hist(hsv, 1, bins_sat)
def compare_histograms(
    hist_a: np.ndarray,
    hist_b: np.ndarray,
    method: int,
) -> float:
    """
    Compare two histograms with cv2.compareHist and return a plain float.

    Args:
        hist_a, hist_b: 1-D float32 ndarrays of identical shape.
        method: cv2.HISTCMP_* constant (e.g. cv2.HISTCMP_CORREL = 0).

    Returns:
        The raw cv2.compareHist score; its range and direction depend on
        *method*. CORREL: [-1, 1], higher = more similar.
        BHATTACHARYYA: [0, 1], lower = more similar.
    """
    raw_score = cv2.compareHist(hist_a, hist_b, method)
    return float(raw_score)
# ---------------------------------------------------------------------------
# Perceptual Hash
# ---------------------------------------------------------------------------
def compute_phash(frame_bgr: np.ndarray, hash_size: int = 8) -> str:
    """
    Compute a perceptual hash (pHash) of a BGR frame.

    The frame is converted BGR -> RGB, wrapped in a PIL image and hashed
    with ``imagehash.phash``. The hash is meant to catch visual similarity
    even when resolution differs between trailer proxy and source movie.

    Args:
        frame_bgr: BGR frame (H, W, 3) uint8.
        hash_size: Hash edge length; 8 gives a 64-bit hash (default).

    Returns:
        Hex string representation of the hash (e.g. "f8e0e0e0...").

    Raises:
        RuntimeError: If imagehash is not installed.
    """
    if not _HAS_IMAGEHASH:
        raise RuntimeError(
            "imagehash is not installed. Run: pip install imagehash"
        )

    pil_image = PilImage.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
    return str(imagehash.phash(pil_image, hash_size=hash_size))
def phash_distance(hash_a: str, hash_b: str) -> int:
    """
    Return the Hamming distance between two pHash hex strings.

    Args:
        hash_a, hash_b: Hex strings as returned by compute_phash().

    Returns:
        Integer Hamming distance; 0 means identical hashes. For the default
        64-bit hashes the range is [0, 64].

    Raises:
        RuntimeError: If imagehash is not installed.
    """
    if not _HAS_IMAGEHASH:
        raise RuntimeError("imagehash is not installed.")

    left = imagehash.hex_to_hash(hash_a)
    right = imagehash.hex_to_hash(hash_b)
    # Subtracting two ImageHash objects yields their Hamming distance.
    return int(left - right)
# ---------------------------------------------------------------------------
# Serialisation helpers (histograms ↔ bytes for caching)
# ---------------------------------------------------------------------------
def hist_to_bytes(hist: np.ndarray) -> bytes:
    """Pickle a histogram array so it can be stored on a Scene/Beat model."""
    newest_protocol = pickle.HIGHEST_PROTOCOL
    return pickle.dumps(hist, protocol=newest_protocol)
def bytes_to_hist(data: bytes) -> np.ndarray:
    """
    Unpickle a histogram array previously produced by hist_to_bytes().

    WARNING: this uses ``pickle.loads``, which can execute arbitrary code.
    Only feed it bytes from the project's own cache, never external input.
    """
    restored = pickle.loads(data)  # noqa: S301 (trusted internal cache only)
    return restored
# ---------------------------------------------------------------------------
# High-level convenience: fingerprint one frame using config
# ---------------------------------------------------------------------------
def fingerprint_frame(
    frame_bgr: np.ndarray,
    cfg: "VibeCheckConfig",
) -> tuple[bytes, bytes, str]:
    """
    Fingerprint one frame: Text-Safe Crop, histograms and pHash in one call.

    Args:
        frame_bgr: Full BGR frame (H, W, 3) uint8.
        cfg: VibeCheckConfig carrying crop fractions and bin counts.
            ``cfg.hist_bins_hue`` is used as the luma bin count.

    Returns:
        ``(luma_hist_bytes, sat_hist_bytes, phash_hex)``
    """
    safe_area = text_safe_crop(
        frame_bgr, cfg.crop_top_fraction, cfg.crop_bottom_fraction
    )
    histograms = extract_hs_histograms(
        safe_area, cfg.hist_bins_hue, cfg.hist_bins_saturation
    )
    luma_hist, sat_hist = histograms
    phash_hex = compute_phash(safe_area)
    luma_blob = hist_to_bytes(luma_hist)
    sat_blob = hist_to_bytes(sat_hist)
    return luma_blob, sat_blob, phash_hex
+172
View File
@@ -0,0 +1,172 @@
"""
src/cv/frame_extractor.py Low-level video frame access
Responsibility:
Provide a thin, testable wrapper around cv2.VideoCapture for:
- seeking to an exact timestamp and returning one BGR frame
- iterating frames with a configurable step size
- extracting the "representative" middle frame of a Scene / TrailerBeat
No fingerprinting, no matching — only raw frame delivery.
"""
from __future__ import annotations
import logging
from contextlib import contextmanager
from pathlib import Path
from typing import Generator, Iterator
import cv2
import numpy as np
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Context-managed VideoCapture
# ---------------------------------------------------------------------------
@contextmanager
def open_video(path: Path) -> Generator[cv2.VideoCapture, None, None]:
    """
    Context manager that opens a VideoCapture and guarantees release.

    Args:
        path: Path to the video file (a plain string is accepted as well).

    Yields:
        The opened cv2.VideoCapture.

    Raises:
        FileNotFoundError: If the file does not exist.
        RuntimeError: If OpenCV cannot open the file.
    """
    path = Path(path)  # tolerate str callers; a Path is returned unchanged
    if not path.exists():
        raise FileNotFoundError(f"Video not found: {path}")

    cap = cv2.VideoCapture(str(path))
    if not cap.isOpened():
        # Release explicitly so a failed open does not keep a native handle
        # alive until garbage collection.
        cap.release()
        raise RuntimeError(f"OpenCV could not open video: {path}")

    try:
        yield cap
    finally:
        cap.release()
# ---------------------------------------------------------------------------
# Video metadata
# ---------------------------------------------------------------------------
def get_video_info(path: Path) -> dict[str, float | int]:
    """
    Read basic metadata from a video; the file is closed again on return.

    Returns:
        dict with keys: fps, frame_count, duration_s, width, height.
        ``duration_s`` is ``frame_count / fps``, or 0.0 when the reported
        FPS is not positive.
    """
    with open_video(path) as cap:
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    if fps > 0:
        duration_s = frame_count / fps
    else:
        duration_s = 0.0

    return dict(
        fps=fps,
        frame_count=frame_count,
        duration_s=duration_s,
        width=width,
        height=height,
    )
# ---------------------------------------------------------------------------
# Single frame extraction
# ---------------------------------------------------------------------------
def grab_frame_at(cap: cv2.VideoCapture, timestamp_s: float) -> np.ndarray | None:
    """
    Seek to *timestamp_s* and read the next frame from *cap*.

    The seek is done in milliseconds via CAP_PROP_POS_MSEC.
    NOTE(review): how precisely the seek lands presumably depends on the
    OpenCV backend and codec — confirm before relying on exact frames.

    Args:
        cap: An already-open VideoCapture.
        timestamp_s: Target time in seconds.

    Returns:
        BGR ndarray (H, W, 3) or None if seeking / decoding failed.
    """
    position_ms = timestamp_s * 1000.0
    cap.set(cv2.CAP_PROP_POS_MSEC, position_ms)
    ok, frame = cap.read()
    if ok and frame is not None:
        return frame
    logger.debug("grab_frame_at: failed at %.3fs", timestamp_s)
    return None
def grab_frame_at_path(path: Path, timestamp_s: float) -> np.ndarray | None:
    """
    One-shot convenience: open, seek, grab, release.

    Prefer open_video() when grabbing multiple frames from the same file,
    since this helper reopens the file on every call.
    """
    with open_video(path) as cap:
        frame = grab_frame_at(cap, timestamp_s)
    return frame
# ---------------------------------------------------------------------------
# Middle-frame extraction (representative frame for fingerprinting)
# ---------------------------------------------------------------------------
def grab_midpoint_frame(
    cap: cv2.VideoCapture,
    start_s: float,
    end_s: float,
) -> np.ndarray | None:
    """
    Grab the frame at the midpoint of the [start_s, end_s] interval.

    Args:
        cap: Open VideoCapture for the source video.
        start_s: Interval start in seconds.
        end_s: Interval end in seconds.

    Returns:
        BGR frame or None if decoding failed.
    """
    half_span = (end_s - start_s) / 2.0
    return grab_frame_at(cap, start_s + half_span)
# ---------------------------------------------------------------------------
# Stepped-frame iterator (used by Deep Scan coarse pass)
# ---------------------------------------------------------------------------
def iter_frames_stepped(
    cap: cv2.VideoCapture,
    start_s: float,
    end_s: float,
    step_s: float,
) -> Iterator[tuple[float, np.ndarray]]:
    """
    Yield (timestamp_s, frame) for every *step_s* increment in [start_s, end_s].

    Frames that fail to decode are silently skipped.

    Args:
        cap: Open VideoCapture.
        start_s: Scan window start in seconds.
        end_s: Scan window end in seconds.
        step_s: Step between samples in seconds.

    Yields:
        (timestamp_s, bgr_frame)

    Raises:
        ValueError: If *step_s* is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if step_s <= 0:
        raise ValueError(f"step_s must be > 0; got {step_s}")

    step_index = 0
    t = start_s
    while t <= end_s:
        frame = grab_frame_at(cap, t)
        if frame is not None:
            yield t, frame
        # Derive each timestamp from the step index rather than from the
        # previous (rounded) timestamp. Rounding a running sum drifts for
        # steps that are not exact at 6 decimals (e.g. 1/24 s) and can stall
        # for steps below the rounding resolution.
        step_index += 1
        t = round(start_s + step_index * step_s, 6)
File diff suppressed because it is too large Load Diff
+229
View File
@@ -0,0 +1,229 @@
"""
src/cv/scene_indexer.py Source-movie scene segmentation + fingerprinting
Responsibility:
1. Run PySceneDetect on the source movie list of raw scene boundaries
2. For each scene, extract the midpoint frame and fingerprint it
3. Optionally run Whisper dialogue on each scene (injected as dependency)
4. Persist results to .cache/ as JSON for fast re-runs
Returns: list[Scene] with luma_hist, sat_hist, phash populated.
"""
from __future__ import annotations
import json
import logging
import pickle
from pathlib import Path
from typing import Callable, Sequence
import numpy as np
from src.core.config import AppConfig
from src.core.models import Scene
from src.cv.fingerprinting import fingerprint_frame
from src.cv.frame_extractor import grab_midpoint_frame, open_video
logger = logging.getLogger(__name__)
# Type alias for an optional dialogue-injection callback
DialogueCallback = Callable[[Scene], Scene]
# ---------------------------------------------------------------------------
# Cache helpers
# ---------------------------------------------------------------------------
def _cache_path(cfg: AppConfig) -> Path:
p = cfg.paths.cache_dir / "scene_index.json"
p.parent.mkdir(parents=True, exist_ok=True)
return p
def _scene_to_dict(s: Scene) -> dict:
return {
"scene_id": s.scene_id,
"source_path": str(s.source_path),
"start_s": s.start_s,
"end_s": s.end_s,
"start_frame": s.start_frame,
"end_frame": s.end_frame,
# histograms serialised as hex so JSON can hold them
"luma_hist": s.luma_hist.hex() if s.luma_hist else None,
"sat_hist": s.sat_hist.hex() if s.sat_hist else None,
"phash": s.phash,
}
def _scene_from_dict(d: dict) -> Scene:
    """
    Rebuild a Scene from the dict layout written by _scene_to_dict().

    Hex-encoded histograms are decoded back to bytes; absent or empty
    entries become None.
    """
    def _decode_hex(key: str):
        return bytes.fromhex(d[key]) if d.get(key) else None

    return Scene(
        scene_id=d["scene_id"],
        source_path=Path(d["source_path"]),
        start_s=d["start_s"],
        end_s=d["end_s"],
        start_frame=d["start_frame"],
        end_frame=d["end_frame"],
        luma_hist=_decode_hex("luma_hist"),
        sat_hist=_decode_hex("sat_hist"),
        phash=d.get("phash"),
    )
def _save_cache(scenes: list[Scene], cfg: AppConfig) -> None:
    """Write the scene index to the JSON cache file and log where it went."""
    payload = json.dumps([_scene_to_dict(scene) for scene in scenes], indent=2)
    target = _cache_path(cfg)
    target.write_text(payload, encoding="utf-8")
    logger.info("Scene index cached → %s (%d scenes)", target, len(scenes))
def _load_cache(cfg: AppConfig) -> list[Scene] | None:
    """
    Load the cached scene index.

    Returns:
        The cached scenes, or None when the cache file is missing or cannot
        be parsed (the caller is then expected to re-index).
    """
    cache_file = _cache_path(cfg)
    if not cache_file.exists():
        return None

    try:
        entries = json.loads(cache_file.read_text(encoding="utf-8"))
        scenes = [_scene_from_dict(entry) for entry in entries]
        logger.info("Loaded %d scenes from cache (%s)", len(scenes), cache_file)
    except Exception as exc:
        # Best effort: any unreadable cache is treated as absent.
        logger.warning("Cache corrupt, re-indexing: %s", exc)
        return None
    return scenes
# ---------------------------------------------------------------------------
# PySceneDetect integration
# ---------------------------------------------------------------------------
def _detect_scenes_pyscenedetect(cfg: AppConfig) -> list[tuple[float, float, int, int]]:
    """
    Run PySceneDetect ContentDetector on the source movie.

    Args:
        cfg: Application configuration; reads ``paths.source_movie`` and the
            ``scene_detection`` threshold / minimum scene duration.

    Returns:
        List of (start_s, end_s, start_frame, end_frame) tuples.

    Raises:
        ImportError: If scenedetect is not installed (chained to the
            original import failure).
    """
    try:
        from scenedetect import open_video as sd_open_video, SceneManager
        from scenedetect.detectors import ContentDetector
    except ImportError as exc:
        # Chain the cause so the real import failure stays visible.
        raise ImportError(
            "scenedetect is not installed. Run: pip install scenedetect[opencv]"
        ) from exc

    video = sd_open_video(str(cfg.paths.source_movie))
    manager = SceneManager()
    manager.add_detector(
        ContentDetector(
            threshold=cfg.scene_detection.content_threshold,
            # Minimum scene length is configured in seconds but expected in frames.
            min_scene_len=int(
                cfg.scene_detection.min_scene_duration_s
                * video.frame_rate
            ),
        )
    )

    logger.info("Detecting scenes in %s", cfg.paths.source_movie.name)
    manager.detect_scenes(video=video, show_progress=True)

    result: list[tuple[float, float, int, int]] = [
        (
            start_tc.get_seconds(),
            end_tc.get_seconds(),
            start_tc.get_frames(),
            end_tc.get_frames(),
        )
        for start_tc, end_tc in manager.get_scene_list()
    ]

    logger.info("PySceneDetect found %d scenes.", len(result))
    return result
# ---------------------------------------------------------------------------
# Fingerprint enrichment
# ---------------------------------------------------------------------------
def _fingerprint_scenes(
    raw_scenes: list[tuple[float, float, int, int]],
    cfg: AppConfig,
) -> list[Scene]:
    """
    Turn raw scene boundaries into Scene objects with fingerprints.

    For each boundary the midpoint frame is decoded and fingerprinted. A
    scene whose midpoint frame cannot be decoded is still emitted, just
    without fingerprint fields.
    """
    vibe_cfg = cfg.cv.vibe_check
    total = len(raw_scenes)
    indexed: list[Scene] = []

    logger.info("Fingerprinting %d scenes …", total)

    with open_video(cfg.paths.source_movie) as cap:
        for idx, (start_s, end_s, start_frame, end_frame) in enumerate(raw_scenes):
            # Fields shared by fingerprinted and non-fingerprinted scenes.
            common = dict(
                scene_id=idx,
                source_path=cfg.paths.source_movie,
                start_s=start_s,
                end_s=end_s,
                start_frame=start_frame,
                end_frame=end_frame,
            )

            frame = grab_midpoint_frame(cap, start_s, end_s)
            if frame is None:
                logger.warning("Scene %d: midpoint frame decode failed, skipping fingerprint.", idx)
                indexed.append(Scene(**common))
                continue

            luma_bytes, sat_bytes, phash_hex = fingerprint_frame(frame, vibe_cfg)
            indexed.append(Scene(
                **common,
                luma_hist=luma_bytes,
                sat_hist=sat_bytes,
                phash=phash_hex,
            ))

            # Progress is reported every 50 scenes (only on fingerprinted ones).
            if (idx + 1) % 50 == 0:
                logger.info("%d / %d scenes fingerprinted", idx + 1, total)

    return indexed
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def build_scene_index(
    cfg: AppConfig,
    force_reindex: bool = False,
    dialogue_callback: DialogueCallback | None = None,
) -> list[Scene]:
    """
    Build (or load from cache) the full scene index for the source movie.

    Steps:
        1. Load from the cache file if available and force_reindex=False.
           A cached index that references a different source movie than
           ``cfg.paths.source_movie`` is discarded, because its scenes would
           point at the wrong file.
        2. Otherwise: detect scenes via PySceneDetect, fingerprint, cache.
        3. Optionally enrich each scene with dialogue via dialogue_callback.

    Args:
        cfg: Application configuration.
        force_reindex: Ignore cache and re-run detection + fingerprinting.
        dialogue_callback: Optional function Scene -> Scene that adds dialogue.
            Injected here so this module stays audio-free. Its results are
            not written to the cache.

    Returns:
        List of Scene objects (fingerprints populated where decoding worked).
    """
    scenes: list[Scene] | None = None

    if not force_reindex:
        scenes = _load_cache(cfg)
        if scenes is not None:
            expected_source = Path(cfg.paths.source_movie)
            if any(Path(s.source_path) != expected_source for s in scenes):
                logger.warning(
                    "Cached scene index belongs to a different source movie; "
                    "re-indexing %s",
                    expected_source,
                )
                scenes = None

    if scenes is None:
        raw = _detect_scenes_pyscenedetect(cfg)
        scenes = _fingerprint_scenes(raw, cfg)
        _save_cache(scenes, cfg)

    if dialogue_callback:
        scenes = [dialogue_callback(s) for s in scenes]

    return scenes
+190
View File
@@ -0,0 +1,190 @@
"""
src/cv/vibe_check.py Phase 1: Scene-level histogram / pHash filter
Responsibility:
Given ONE TrailerBeat (with pre-computed fingerprints) and a list of
source Scenes (also fingerprinted), return the Top-K candidates ranked
by a combined histogram + pHash score.
This module contains ZERO file I/O and ZERO frame decoding — those live
in the pipeline layer. Input = model objects, output = sorted VibeHit list.
"""
from __future__ import annotations
import logging
from dataclasses import replace
from typing import Sequence
import cv2
import numpy as np
from src.core.models import Scene, TrailerBeat, VibeHit
from src.cv.fingerprinting import bytes_to_hist, phash_distance
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Scoring
# ---------------------------------------------------------------------------
# Weights applied to the histogram score and the pHash score in the combined
# metric (they sum to 1.0).
# pHash gets less weight because it's sensitive to text overlays on source.
_HIST_WEIGHT = 0.70
_PHASH_WEIGHT = 0.30
# Maximum possible Hamming distance; used to scale distance into a [0, 1]
# similarity. NOTE(review): 64 presumably matches 64-bit hashes
# (hash_size=8) — confirm if another hash size is ever used.
_PHASH_MAX_BITS = 64
def _hist_combined_score(
    beat: TrailerBeat,
    scene: Scene,
    hist_method: int,
) -> float:
    """
    Mean of the luma and saturation histogram comparison scores.

    Returns 0.0 when any of the four histograms is missing. For
    BHATTACHARYYA the distance is inverted (``1 - d``) so that higher means
    more similar; for every other method the raw cv2.compareHist value is
    averaged as-is (e.g. [-1, 1] for CORREL).
    """
    histograms = (beat.luma_hist, scene.luma_hist, beat.sat_hist, scene.sat_hist)
    if any(blob is None for blob in histograms):
        return 0.0

    invert = hist_method == cv2.HISTCMP_BHATTACHARYYA

    def _similarity(blob_a: bytes, blob_b: bytes):
        raw = cv2.compareHist(
            bytes_to_hist(blob_a),
            bytes_to_hist(blob_b),
            hist_method,
        )
        # Normalise BHATTACHARYYA to a [0, 1] similarity (invert distance).
        return 1.0 - float(raw) if invert else raw

    luma_score = _similarity(beat.luma_hist, scene.luma_hist)
    sat_score = _similarity(beat.sat_hist, scene.sat_hist)
    return float((luma_score + sat_score) / 2.0)
def _phash_score(beat: TrailerBeat, scene: Scene) -> float:
    """
    Convert the pHash Hamming distance into a [0, 1] similarity score.

    Distance 0  -> 1.0 (identical)
    Distance 64 -> 0.0 (completely different)

    Returns 0.0 when either side has no pHash.
    """
    if None in (beat.phash, scene.phash):
        return 0.0

    hamming = phash_distance(beat.phash, scene.phash)
    dissimilarity = hamming / _PHASH_MAX_BITS
    return 1.0 - dissimilarity
def _combined_score(
    beat: TrailerBeat,
    scene: Scene,
    hist_method: int,
) -> float:
    """Weighted sum of histogram similarity and pHash similarity."""
    hist_part = _HIST_WEIGHT * _hist_combined_score(beat, scene, hist_method)
    phash_part = _PHASH_WEIGHT * _phash_score(beat, scene)
    return hist_part + phash_part
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def run_vibe_check(
    beat: TrailerBeat,
    scenes: Sequence[Scene],
    top_k: int,
    hist_method: int,
    phash_max_distance: int,
) -> list[VibeHit]:
    """
    Phase 1: Score all source scenes against one trailer beat and return
    the top-K candidates for Deep Scan.

    Args:
        beat: The trailer beat to match (must have fingerprints).
        scenes: All detected scenes from the source movie.
        top_k: Maximum number of candidates to return; values <= 0 yield
            an empty list.
        hist_method: cv2.HISTCMP_* constant (e.g. 0 = CORREL).
        phash_max_distance: Scenes with pHash Hamming distance > this value
            are excluded before ranking (hard filter).

    Returns:
        List of VibeHit, sorted by combined_score descending, length <= top_k.
        Empty list if beat has no fingerprints or no scenes pass the filter.
    """
    if beat.luma_hist is None and beat.phash is None:
        logger.warning(
            "Beat %d has no fingerprints — skipping Vibe Check.", beat.beat_id
        )
        return []

    candidates: list[VibeHit] = []

    for scene in scenes:
        # The Hamming distance is computed once per scene and reused for the
        # hard filter, the similarity score and the VibeHit record.
        if beat.phash and scene.phash:
            dist = phash_distance(beat.phash, scene.phash)
            if dist > phash_max_distance:
                continue  # fast rejection — avoids full histogram compare
            phash = 1.0 - (dist / _PHASH_MAX_BITS)
        else:
            # No pHash on one side: report the maximum distance, zero similarity.
            dist = _PHASH_MAX_BITS
            phash = 0.0

        hist = _hist_combined_score(beat, scene, hist_method)
        combined = _HIST_WEIGHT * hist + _PHASH_WEIGHT * phash

        candidates.append(VibeHit(
            beat_id=beat.beat_id,
            scene_id=scene.scene_id,
            hist_score=round(hist, 4),
            phash_distance=dist,
            combined_score=round(combined, 4),
        ))

    # Sort by combined score, descending; return top-K. Clamp so a negative
    # top_k cannot slice from the end of the list.
    candidates.sort(key=lambda h: h.combined_score, reverse=True)
    top = candidates[:max(0, top_k)]

    logger.info(
        "Vibe Check beat=%d: %d scenes scored, %d candidates forwarded to Deep Scan. "
        "Best score: %.3f (scene %s)",
        beat.beat_id,
        len(candidates),
        len(top),
        top[0].combined_score if top else 0.0,
        top[0].scene_id if top else "",
    )
    return top
def batch_vibe_check(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    top_k: int,
    hist_method: int,
    phash_max_distance: int,
) -> dict[int, list[VibeHit]]:
    """
    Run Vibe Check for every beat and return a mapping beat_id -> [VibeHit].

    Convenience wrapper for the pipeline layer. If two beats share a
    beat_id, the later one wins.
    """
    hits_by_beat: dict[int, list[VibeHit]] = {}
    for beat in beats:
        hits_by_beat[beat.beat_id] = run_vibe_check(
            beat, scenes, top_k, hist_method, phash_max_distance
        )
    return hits_by_beat
+1
View File
@@ -0,0 +1 @@
# src.export package — FCPXML / EDL export
+114
View File
@@ -0,0 +1,114 @@
"""
src/export/edl_writer.py EditTimeline CMX 3600 EDL
Generates a standard CMX 3600 Edit Decision List compatible with
Avid, DaVinci Resolve, Premiere Pro, and most NLEs.
CMX 3600 format reference:
https://en.wikipedia.org/wiki/Edit_decision_list#CMX_3600
"""
from __future__ import annotations
import logging
from pathlib import Path
from src.core.config import AppConfig
from src.core.models import EditClip, EditTimeline
from src.export.timecode import seconds_to_smpte
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# EDL line builders
# ---------------------------------------------------------------------------
def _edl_header(title: str) -> str:
return f"TITLE: {title}\nFCM: NON-DROP FRAME\n"
def _edl_event(
    event_num: int,
    clip: EditClip,
    fps: float,
) -> str:
    """
    Render the event block for one matched clip.

    Layout of the returned text (each line newline-terminated):
        NNN AX V C <SRC_IN> <SRC_OUT> <REC_IN> <REC_OUT>
        * FROM CLIP NAME: <source file name>
        * BEAT <id> | <beat type> | score=<match score>

    Source and record ranges both span clip.source_timeline_duration_s, i.e.
    only the part of the beat that is covered by source footage.
    """
    match = clip.match
    span_s = clip.source_timeline_duration_s

    # Same conversion order as the columns: source in/out, then record in/out.
    src_in, src_out, rec_in, rec_out = (
        seconds_to_smpte(t_s, fps)
        for t_s in (
            match.in_point_s,
            match.in_point_s + span_s,
            clip.timeline_start_s,
            clip.timeline_start_s + span_s,
        )
    )

    block = [
        f"{event_num:03d} AX V C {src_in} {src_out} {rec_in} {rec_out}",
        f"* FROM CLIP NAME: {match.source_path.name}",
        (
            f"* BEAT {clip.beat.beat_id:03d} | {clip.beat.beat_type.name} | "
            f"score={match.match_score:.3f}"
        ),
        "",  # makes the joined text end with a newline
    ]
    return "\n".join(block)
def _edl_black_tail_event(event_num: int, clip: EditClip, fps: float) -> str:
    """
    Render a black ("BL") event covering the trailer-only tail of a clip.

    The record range runs from the end of the matched source part to
    clip.timeline_end_s; source timecodes are fixed at 00:00:00:00.
    """
    tail_start_s = clip.timeline_start_s + clip.source_timeline_duration_s
    rec_in = seconds_to_smpte(tail_start_s, fps)
    rec_out = seconds_to_smpte(clip.timeline_end_s, fps)

    block = [
        f"{event_num:03d} BL V C 00:00:00:00 00:00:00:00 {rec_in} {rec_out}",
        "* FROM CLIP NAME: BLACK",
        (
            f"* BEAT {clip.beat.beat_id:03d} TRAILER-ONLY TAIL | "
            "add fade/dissolve to black"
        ),
        "",  # trailing newline after join
    ]
    return "\n".join(block)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def write_edl(
    timeline: EditTimeline,
    cfg: AppConfig,
    output_path: Path | None = None,
) -> Path:
    """
    Write the EditTimeline as a CMX 3600 EDL file.

    Clips are emitted in clip_index order. A clip whose trailer_tail_s is
    positive is followed by an extra black event for the uncovered tail, so
    the number of EDL events can exceed the number of clips.

    Args:
        timeline:    EditTimeline from build_timeline().
        cfg:         Application configuration.
        output_path: Override destination. Defaults to
                     <output_dir>/<timeline.title>.edl.

    Returns:
        Path to the written .edl file.
    """
    if output_path is None:
        output_path = cfg.paths.output_dir / f"{timeline.title}.edl"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    fps = timeline.frame_rate
    lines = [_edl_header(timeline.title), "\n"]
    event_num = 1
    for clip in sorted(timeline.clips, key=lambda c: c.clip_index):
        lines.append(_edl_event(event_num, clip, fps))
        event_num += 1
        if clip.trailer_tail_s > 0:
            lines.append("\n")
            lines.append(_edl_black_tail_event(event_num, clip, fps))
            event_num += 1
        lines.append("\n")

    edl_text = "\n".join(lines)
    output_path.write_text(edl_text, encoding="utf-8")

    # event_num is the *next* free number, so the events written are
    # event_num - 1. This differs from timeline.clip_count whenever black
    # tail events were inserted.
    logger.info("EDL written → %s (%d events)", output_path, event_num - 1)
    return output_path
+222
View File
@@ -0,0 +1,222 @@
"""
src/export/fcpxml_writer.py — EditTimeline → Final Cut Pro XML (FCPXML 1.10)
Generates a standards-compliant FCPXML file that can be imported directly
into Final Cut Pro X, DaVinci Resolve, or Premiere Pro (via FCPXML plugin).
Spec reference: https://developer.apple.com/documentation/professional_video_applications/fcpxml_reference
"""
from __future__ import annotations
import logging
from pathlib import Path
from urllib.parse import quote
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element, SubElement
from src.core.config import AppConfig
from src.core.models import EditClip, EditTimeline
from src.export.timecode import (
fcpxml_format_name,
fcpxml_frame_duration,
seconds_to_fcpxml,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Asset registry — one <asset> per unique source file
# ---------------------------------------------------------------------------
class _AssetRegistry:
def __init__(self) -> None:
self._assets: dict[Path, str] = {} # path → asset id
self._counter = 2 # r1 reserved for format
def get_or_create(self, path: Path) -> str:
if path not in self._assets:
rid = f"r{self._counter}"
self._assets[path] = rid
self._counter += 1
return self._assets[path]
@property
def items(self) -> dict[Path, str]:
return dict(self._assets)
# ---------------------------------------------------------------------------
# Builder
# ---------------------------------------------------------------------------
def _path_to_url(path: Path) -> str:
"""Convert an absolute Path to a file:// URL as required by FCPXML."""
posix = path.as_posix()
if not posix.startswith("/"):
# Windows drive letter: C:/foo → /C:/foo
posix = "/" + posix
return "file://" + quote(posix, safe="/:@")
def build_fcpxml(
    timeline: EditTimeline,
    cfg: AppConfig,
    source_duration_s: float = 7200.0,  # 2-hour fallback if not probed
) -> ET.ElementTree:
    """
    Build a complete FCPXML ElementTree from an EditTimeline.

    Document layout: one <format> resource (id "r1"), one <asset> resource
    per distinct clip.match.source_path, then library / event / project /
    sequence / spine. Every timeline clip becomes a <clip> on the spine; a
    clip with a positive trailer_tail_s is followed by a <gap> carrying a
    marker that asks the editor to add a fade to black.

    Args:
        timeline:          Ordered sequence of EditClips.
        cfg:               Application configuration (export.fcpxml_version
                           is written as the root "version" attribute).
        source_duration_s: Fallback asset duration in seconds, used for any
                           source file whose real duration cannot be probed.

    Returns:
        xml.etree.ElementTree.ElementTree — call .write() to serialise.
    """
    fps = timeline.frame_rate

    # ---- root ---------------------------------------------------------------
    root = Element("fcpxml", version=cfg.export.fcpxml_version)
    root.set("xmlns", "http://www.apple.com/dt/FCPXML/1_10")

    # ---- resources ----------------------------------------------------------
    resources = SubElement(root, "resources")
    format_id = "r1"
    # NOTE(review): frame size and colour space are hard-coded to 1080p Rec. 709.
    SubElement(
        resources,
        "format",
        id=format_id,
        name=fcpxml_format_name(fps),
        frameDuration=fcpxml_frame_duration(fps),
        width="1920",
        height="1080",
        colorSpace="1-1-1 (Rec. 709)",
    )

    registry = _AssetRegistry()
    # Pre-register all unique source paths so every <asset> element is
    # emitted inside <resources>, ahead of the <library> block.
    for clip in timeline.clips:
        registry.get_or_create(clip.match.source_path)

    # Probing is best-effort: a missing prober or an unreadable file must not
    # abort the export, but the fallback is logged so that a wrong asset
    # duration in the output can be traced back.
    try:
        from src.cv.frame_extractor import get_video_info
    except Exception as exc:
        get_video_info = None
        logger.warning(
            "Video prober unavailable (%s); using %.1fs fallback duration for all assets.",
            exc,
            source_duration_s,
        )

    durations: dict[Path, float] = {}
    for path in registry.items:
        duration_s = source_duration_s
        if get_video_info is not None:
            try:
                duration_s = float(get_video_info(path)["duration_s"])
            except Exception as exc:
                logger.warning(
                    "Could not probe duration of %s (%s); using %.1fs fallback.",
                    path,
                    exc,
                    source_duration_s,
                )
        durations[path] = duration_s

    for path, rid in registry.items.items():
        SubElement(
            resources,
            "asset",
            id=rid,
            name=path.stem,
            src=_path_to_url(path),
            start="0s",
            duration=seconds_to_fcpxml(durations.get(path, source_duration_s), fps),
            hasVideo="1",
            hasAudio="1",
            format=format_id,
        )

    # ---- library / event / project ------------------------------------------
    library = SubElement(root, "library")
    event = SubElement(library, "event", name=timeline.title)
    project = SubElement(event, "project", name=timeline.title)
    sequence = SubElement(
        project,
        "sequence",
        duration=seconds_to_fcpxml(timeline.total_duration_s, fps),
        format=format_id,
        tcStart="0s",
        tcFormat="NDF",
        audioLayout="stereo",
        audioRate="48k",
    )
    spine = SubElement(sequence, "spine")

    # ---- clips --------------------------------------------------------------
    for clip in sorted(timeline.clips, key=lambda c: c.clip_index):
        asset_id = registry.get_or_create(clip.match.source_path)
        # Kept under its own name so the source_duration_s parameter (the
        # asset-duration fallback) is never rebound.
        clip_duration_s = clip.source_timeline_duration_s
        clip_elem = SubElement(
            spine,
            "clip",
            name=f"Beat_{clip.beat.beat_id:03d}_{clip.beat.beat_type.name}",
            ref=asset_id,
            # offset = position on the timeline
            offset=seconds_to_fcpxml(clip.timeline_start_s, fps),
            # duration = matched source part only; trailer-only tails become gaps.
            duration=seconds_to_fcpxml(clip_duration_s, fps),
            # start = in-point inside the source asset
            start=seconds_to_fcpxml(clip.match.in_point_s, fps),
        )
        # Inline audio role
        SubElement(
            clip_elem,
            "audio",
            role="dialogue",
            srcCh="1, 2",
            outCh="L, R",
        )
        if clip.trailer_tail_s > 0:
            gap = SubElement(
                spine,
                "gap",
                name=f"Beat_{clip.beat.beat_id:03d}_TRAILER_TAIL_BLACK_FADE",
                offset=seconds_to_fcpxml(clip.timeline_start_s + clip_duration_s, fps),
                duration=seconds_to_fcpxml(clip.trailer_tail_s, fps),
                start="0s",
            )
            SubElement(
                gap,
                "marker",
                start="0s",
                value="Trailer-only tail: add fade/dissolve to black here",
                completed="0",
            )

    return ET.ElementTree(root)
# ---------------------------------------------------------------------------
# Writer
# ---------------------------------------------------------------------------
def write_fcpxml(
    timeline: EditTimeline,
    cfg: AppConfig,
    output_path: Path | None = None,
) -> Path:
    """
    Build the FCPXML document for *timeline* and save it as UTF-8 text.

    Args:
        timeline:    EditTimeline from build_timeline().
        cfg:         Application configuration.
        output_path: Override destination. When omitted the file goes to
                     <output_dir>/<timeline.title>.fcpxml.

    Returns:
        Path to the written .fcpxml file.
    """
    destination = (
        output_path
        if output_path is not None
        else cfg.paths.output_dir / f"{timeline.title}.fcpxml"
    )
    destination.parent.mkdir(parents=True, exist_ok=True)

    root = build_fcpxml(timeline, cfg).getroot()
    # encoding="unicode" yields a str; the declaration and DOCTYPE are
    # prepended by hand because ElementTree cannot emit a DOCTYPE itself.
    xml_text = ET.tostring(root, encoding="unicode", xml_declaration=False)
    document = "".join(
        [
            '<?xml version="1.0" encoding="UTF-8"?>\n',
            '<!DOCTYPE fcpxml>\n',
            xml_text,
        ]
    )
    destination.write_text(document, encoding="utf-8")

    logger.info("FCPXML written → %s (%d clips)", destination, timeline.clip_count)
    return destination
+146
View File
@@ -0,0 +1,146 @@
"""
src/export/timecode.py — Timecode / rational-time conversion helpers
FCPXML uses rational fractions ("1001/24000s") for all time values.
EDL uses SMPTE timecode strings ("HH:MM:SS:FF").
All conversion functions are pure — no I/O, no state.
"""
from __future__ import annotations
import math
from fractions import Fraction
# ---------------------------------------------------------------------------
# Common frame-rate denominators
# ---------------------------------------------------------------------------
_FPS_RATIONAL: dict[float, tuple[int, int]] = {
23.976: (24000, 1001),
24.0: (24, 1),
25.0: (25, 1),
29.97: (30000, 1001),
30.0: (30, 1),
50.0: (50, 1),
59.94: (60000, 1001),
60.0: (60, 1),
}
_TOLERANCE = 0.01 # fps match tolerance
def _fps_to_rational(fps: float) -> tuple[int, int]:
"""Return (numerator, denominator) for common fps values."""
for ref_fps, rational in _FPS_RATIONAL.items():
if abs(fps - ref_fps) < _TOLERANCE:
return rational
# Fallback: convert float to exact fraction
f = Fraction(fps).limit_denominator(1001)
return f.numerator, f.denominator
# ---------------------------------------------------------------------------
# Seconds → FCPXML rational string
# ---------------------------------------------------------------------------
def seconds_to_fcpxml(seconds: float, fps: float) -> str:
    """
    Convert *seconds* to an FCPXML rational time string on the frame grid.

    The value is rounded to the nearest whole frame at *fps* and written as
    a fully reduced fraction of seconds, so repeated conversions cannot
    accumulate float drift.

    Examples at 23.976 fps (= 24000/1001):
        10.0 s  -> 240 frames -> "1001/100s"
        1 frame -> "1001/24000s"
    Whole seconds keep an explicit denominator, e.g. 10.0 s @ 24 fps -> "10/1s".

    Args:
        seconds: Time in seconds (float).
        fps:     Project frame rate.

    Returns:
        FCPXML time string such as "1001/100s", or "0s" when the value is
        zero or rounds to zero frames.
    """
    if seconds == 0.0:
        return "0s"

    num, den = _fps_to_rational(fps)  # frames per second = num/den
    # seconds × (num/den) = frames (float); round to nearest frame
    frames = round(seconds * num / den)
    if frames == 0:
        # Less than half a frame: same canonical spelling as exact zero
        # instead of the odd-looking "0/1s".
        return "0s"

    # frames ÷ (num/den) = frames × den/num seconds; Fraction reduces it.
    duration = Fraction(frames * den, num)
    return f"{duration.numerator}/{duration.denominator}s"
def seconds_to_frame_count(seconds: float, fps: float) -> int:
    """Return the whole-frame count nearest to *seconds* at *fps* (built-in round(): exact halves go to the even frame)."""
    exact_frames = seconds * fps
    return round(exact_frames)
# ---------------------------------------------------------------------------
# Seconds → SMPTE timecode (for EDL)
# ---------------------------------------------------------------------------
def seconds_to_smpte(seconds: float, fps: float, drop_frame: bool = False) -> str:
    """
    Format *seconds* as a non-drop-frame SMPTE timecode "HH:MM:SS:FF".

    The time is first turned into a frame count at the real *fps*; that
    count is then split using the rounded nominal rate (23.976 counts as 24).

    Args:
        seconds:    Time in float seconds.
        fps:        Frame rate (23.976, 24, 25, etc.).
        drop_frame: Ignored; drop-frame output is not implemented.

    Returns:
        "HH:MM:SS:FF" string.
    """
    frame_total = seconds_to_frame_count(seconds, fps)
    frames_per_tc_second = round(fps)  # e.g. 23.976 → 24

    whole_seconds, ff = divmod(frame_total, frames_per_tc_second)
    whole_minutes, ss = divmod(whole_seconds, 60)
    hh, mm = divmod(whole_minutes, 60)
    return f"{hh:02d}:{mm:02d}:{ss:02d}:{ff:02d}"
# ---------------------------------------------------------------------------
# FCPXML format ID helpers
# ---------------------------------------------------------------------------
def fcpxml_format_name(fps: float, width: int = 1920, height: int = 1080) -> str:
    """
    Return an FCPXML format name string for a given frame rate and resolution.

    Known frame rates are matched with a 0.01 fps tolerance (the same
    tolerance _fps_to_rational uses), so a probed value such as
    24000/1001 = 23.976023... still maps to the "2398" tag. Rates outside
    the table fall back to int(fps * 100).

    Example: fps=23.976, 1080p -> "FFVideoFormat1080p2398"

    Args:
        fps:    Project frame rate.
        width:  Accepted for interface symmetry; not part of the name.
        height: Frame height in pixels; rendered as "<height>p".

    Returns:
        Name of the form "FFVideoFormat<height>p<fps tag>".
    """
    known_tags = (
        (23.976, "2398"),
        (24.0, "24"),
        (25.0, "25"),
        (29.97, "2997"),
        (30.0, "30"),
        (50.0, "50"),
        (59.94, "5994"),
        (60.0, "60"),
    )
    tolerance = 0.01  # nearest table entries differ by 0.024 fps, so no overlap

    fps_tag = next(
        (tag for ref_fps, tag in known_tags if abs(fps - ref_fps) < tolerance),
        None,
    )
    if fps_tag is None:
        fps_tag = str(int(fps * 100))
    return f"FFVideoFormat{height}p{fps_tag}"
def fcpxml_frame_duration(fps: float) -> str:
    """
    Return the FCPXML frameDuration attribute value for *fps*.

    One frame lasts 1/fps seconds. With fps written as num/den that is
    den/num seconds, e.g. 23.976 fps = 24000/1001 gives "1001/24000s".
    """
    fps_num, fps_den = _fps_to_rational(fps)  # fps = fps_num / fps_den
    divisor = math.gcd(fps_den, fps_num)
    seconds_num = fps_den // divisor
    seconds_den = fps_num // divisor
    return f"{seconds_num}/{seconds_den}s"
+1
View File
@@ -0,0 +1 @@
# src.llm package — Thematic segmentation / dramaturgy (NO vision matching)
+202
View File
@@ -0,0 +1,202 @@
"""
src/llm/dramaturg.py — LLM-based thematic beat classification (OpenRouter)
Responsibility:
- Receive a list of TrailerBeat objects (with dialogue lines attached)
- Send a single structured prompt to the LLM
- Parse the JSON response to assign BeatType to each beat
IMPORTANT: This module does ZERO visual analysis.
It classifies narrative dramaturgy from dialogue text only.
Visual matching is handled exclusively by the CV engine.
"""
from __future__ import annotations
import json
import logging
from dataclasses import replace
from typing import Sequence
from src.core.config import AppConfig
from src.core.models import BeatType, TrailerBeat
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Prompt builder
# ---------------------------------------------------------------------------
# System message sent with every classification request (see _call_llm).
# It lists the dramatic roles the model may answer with and asks for a bare
# JSON array of {"beat_id", "beat_type"} objects, which _parse_response reads.
_SYSTEM_PROMPT = """You are a film trailer editor and narrative analyst.
Your task is to classify each beat of a trailer into one of these dramatic roles:
HOOK - Opening attention grabber (first impression, shocking image, logo)
SETUP - World/character introduction
CONFLICT - Inciting incident, rising tension, threat revealed
CLIMAX - Peak action/emotion, highest stakes
RESOLUTION - Cool-down, tagline, final title card
You will receive a JSON array of beats with their index and dialogue text.
Respond ONLY with a valid JSON array, one object per beat, with keys:
"beat_id" (int) and "beat_type" (one of the strings above).
Do NOT include any explanation or markdown fences."""
# User message template; classify_beats fills {n} with the number of beats
# and {beats_json} with the payload from _build_beats_payload.
_USER_TEMPLATE = """Classify the following {n} trailer beats:
{beats_json}"""
def _build_beats_payload(beats: Sequence[TrailerBeat]) -> str:
payload = []
for b in beats:
dialogue_text = " / ".join(line.text for line in b.dialogue) or "(no dialogue)"
payload.append({
"beat_id": b.beat_id,
"duration": round(b.duration_s, 2),
"dialogue": dialogue_text,
})
return json.dumps(payload, ensure_ascii=False, indent=2)
# ---------------------------------------------------------------------------
# OpenRouter / OpenAI-compatible HTTP client
# ---------------------------------------------------------------------------
def _call_llm(prompt_user: str, cfg: AppConfig) -> str:
"""
Send a chat completion request to the configured LLM provider.
Supports: openrouter, openai, ollama (all use the OpenAI-compatible API).
Returns:
The raw text content of the first assistant message.
Raises:
RuntimeError: On HTTP errors or missing API key.
"""
import urllib.request
import urllib.error
llm = cfg.llm
if llm.provider in ("openrouter", "openai") and not llm.api_key:
raise RuntimeError(
f"LLM provider is '{llm.provider}' but no API key found. "
"Set OPENROUTER_API_KEY (or OPENAI_API_KEY) in your .env file."
)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {llm.api_key}",
}
if llm.provider == "openrouter":
headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
headers["X-Title"] = "AI Trailer Generator v2"
body = json.dumps({
"model": llm.model,
"messages": [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": prompt_user},
],
"temperature": llm.temperature,
"max_tokens": llm.max_tokens,
}).encode("utf-8")
url = f"{llm.base_url.rstrip('/')}/chat/completions"
req = urllib.request.Request(url, data=body, headers=headers, method="POST")
try:
with urllib.request.urlopen(req, timeout=llm.timeout_seconds) as resp:
data = json.loads(resp.read().decode("utf-8"))
return data["choices"][0]["message"]["content"]
except urllib.error.HTTPError as exc:
body_text = exc.read().decode(errors="replace")
raise RuntimeError(
f"LLM HTTP {exc.code} from {url}:\n{body_text}"
) from exc
# ---------------------------------------------------------------------------
# Response parser
# ---------------------------------------------------------------------------
# BeatType member name -> member; _parse_response upper-cases the label
# returned by the LLM before looking it up here.
_BEAT_TYPE_MAP: dict[str, BeatType] = {bt.name: bt for bt in BeatType}
def _parse_response(raw: str, beats: Sequence[TrailerBeat]) -> dict[int, BeatType]:
    """
    Parse the LLM JSON array response into a beat_id -> BeatType mapping.

    Every beat in *beats* starts out as BeatType.UNKNOWN. Each well-formed
    array item overrides the entry for its beat_id; unrecognised type names
    map to UNKNOWN. Parsing never raises: an unparseable response leaves all
    beats UNKNOWN, and a malformed item is skipped without discarding the
    items around it.

    Args:
        raw:   Raw assistant message text (markdown fences are tolerated).
        beats: The beats that were sent for classification.

    Returns:
        Mapping beat_id -> BeatType containing at least every input beat.
    """
    # Strip accidental markdown fences
    clean = raw.strip()
    if clean.startswith("```"):
        clean = "\n".join(clean.split("\n")[1:])
    if clean.endswith("```"):
        clean = clean[: clean.rfind("```")]
    clean = clean.strip()

    result: dict[int, BeatType] = {b.beat_id: BeatType.UNKNOWN for b in beats}

    try:
        parsed = json.loads(clean)
        if not isinstance(parsed, list):
            raise ValueError("Expected JSON array at top level.")
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        logger.warning(
            "LLM response parse error (%s) — all beats → UNKNOWN.\nRaw: %s",
            exc,
            raw[:300],
        )
        return result

    for item in parsed:
        try:
            bid = int(item["beat_id"])
            name = str(item.get("beat_type", "UNKNOWN")).upper()
        except (AttributeError, KeyError, TypeError, ValueError) as exc:
            # Non-object items, a missing or null beat_id, or a non-numeric
            # id: drop this entry only and keep everything else.
            logger.warning("Skipping malformed LLM beat entry %r (%s).", item, exc)
            continue
        result[bid] = _BEAT_TYPE_MAP.get(name, BeatType.UNKNOWN)
    return result
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def classify_beats(
    beats: Sequence[TrailerBeat],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """
    Ask the LLM for a BeatType per beat and return updated copies.

    Args:
        beats: TrailerBeat list (dialogue should be populated for best results).
        cfg:   Application configuration (llm section + api key).

    Returns:
        New list of TrailerBeat objects with beat_type set. If the LLM call
        fails, the error is logged and the input beats are returned
        unchanged (no exception is raised for that case).
    """
    if not beats:
        return list(beats)

    logger.info(
        "Classifying %d beats via %s / %s",
        len(beats), cfg.llm.provider, cfg.llm.model,
    )
    prompt = _USER_TEMPLATE.format(
        n=len(beats),
        beats_json=_build_beats_payload(beats),
    )

    try:
        raw_response = _call_llm(prompt, cfg)
    except Exception as exc:
        logger.error("LLM classification failed: %s — keeping BeatType.UNKNOWN.", exc)
        return list(beats)

    type_map = _parse_response(raw_response, beats)
    enriched: list[TrailerBeat] = []
    for beat in beats:
        beat_type = type_map.get(beat.beat_id, BeatType.UNKNOWN)
        enriched.append(replace(beat, beat_type=beat_type))

    classified = len([b for b in enriched if b.beat_type != BeatType.UNKNOWN])
    logger.info("Beat classification done: %d / %d classified.", classified, len(beats))
    return enriched
+316
View File
@@ -0,0 +1,316 @@
"""
Cached vision descriptions for ambiguous trailer/source matching.
This module is deliberately conservative: it never writes a final match and it
does not replace CV. It describes a small number of 3-frame beat/scene samples,
caches those descriptions, and returns extra source in-point seeds for the CV
scanner to verify.
"""
from __future__ import annotations
import base64
import json
import logging
import re
import urllib.error
import urllib.request
from dataclasses import asdict
from pathlib import Path
from typing import Sequence
import cv2
from src.core.config import AppConfig
from src.core.models import Scene, TrailerBeat
logger = logging.getLogger(__name__)
# Stored in the cache file; _load_cache discards a file whose "version"
# differs from this value.
_CACHE_VERSION = 1
# Words removed by _terms() before two descriptions are compared.
_STOPWORDS = {
    "the", "and", "with", "from", "that", "this", "there", "their", "into",
    "scene", "frame", "image", "shot", "video", "visible", "looks", "appears",
    "eine", "einer", "einem", "einen", "und", "oder", "mit", "der", "die", "das",
}
# System message sent by _call_vision_model with every description request.
_SYSTEM_PROMPT = """You describe film shots for automatic matching.
Return only compact JSON with these keys:
subject, setting, composition, action_phase, distinctive_objects, lighting_color, negatives.
Focus on stable visual facts and spatial layout. Ignore timecode overlays, subtitles, logos, compression, aspect ratio, and color grading differences."""
def _cache_path(cfg: AppConfig) -> Path:
return cfg.paths.cache_dir / "vision_descriptions.json"
def _load_cache(cfg: AppConfig) -> dict:
    """
    Load the vision description cache, or start an empty one.

    An empty cache {"version": _CACHE_VERSION, "items": {}} is returned when
    the file is missing, is not valid UTF-8 JSON, does not hold a JSON
    object, has a different version, or has an "items" value that is not a
    dict.

    Args:
        cfg: Application configuration (only the cache location is used).

    Returns:
        Dict with the keys "version" and "items".
    """

    def _empty() -> dict:
        # A fresh dict per call: the caller mutates "items" in place.
        return {"version": _CACHE_VERSION, "items": {}}

    path = _cache_path(cfg)
    if not path.exists():
        return _empty()

    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError):
        logger.warning("Vision cache is unreadable; rebuilding: %s", path)
        return _empty()

    # The top level must be an object; a list or scalar has no .get().
    if not isinstance(data, dict):
        logger.warning("Vision cache is unreadable; rebuilding: %s", path)
        return _empty()
    if data.get("version") != _CACHE_VERSION or not isinstance(data.get("items"), dict):
        return _empty()
    return data
def _save_cache(cfg: AppConfig, cache: dict) -> None:
    """Write *cache* as indented UTF-8 JSON to the cache file, creating its directory if needed."""
    target = _cache_path(cfg)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(cache, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
def _sample_times(start_s: float, end_s: float) -> list[float]:
duration_s = max(0.04, end_s - start_s)
return [
start_s + min(duration_s * 0.12, max(0.0, duration_s - 0.04)),
start_s + duration_s * 0.50,
start_s + max(0.0, duration_s - min(duration_s * 0.12, 0.20)),
]
def _frame_data_url(video_path: Path, t_s: float) -> str | None:
    """
    Grab one frame at *t_s* seconds and return it as a JPEG data URL.

    Frames wider than 640 px are scaled down to 640 px width before being
    encoded at JPEG quality 72. Returns None if the video cannot be opened,
    no frame can be read at that position, or encoding fails. The capture
    handle is released in every case.
    """
    capture = cv2.VideoCapture(str(video_path))
    try:
        if not capture.isOpened():
            return None

        seek_ms = max(0.0, t_s) * 1000.0  # negative times are clamped to 0
        capture.set(cv2.CAP_PROP_POS_MSEC, seek_ms)
        grabbed, image = capture.read()
        if not grabbed or image is None:
            return None

        height, width = image.shape[:2]
        if width > 640:
            scaled_height = int(height * (640 / width))
            image = cv2.resize(image, (640, scaled_height), interpolation=cv2.INTER_AREA)

        encoded_ok, jpeg = cv2.imencode(".jpg", image, [int(cv2.IMWRITE_JPEG_QUALITY), 72])
        if not encoded_ok:
            return None

        b64 = base64.b64encode(jpeg.tobytes()).decode("ascii")
        return f"data:image/jpeg;base64,{b64}"
    finally:
        capture.release()
def _call_vision_model(label: str, image_urls: list[str], cfg: AppConfig) -> str:
vision = cfg.vision
if vision.provider in ("openai", "openrouter") and not vision.api_key:
raise RuntimeError(
"Vision is enabled but no API key is available. Set VISION_API_KEY, "
"OPENROUTER_API_KEY, OPENAI_API_KEY, or LLM_API_KEY."
)
content: list[dict] = [{
"type": "text",
"text": (
f"Describe this 3-frame sample for matching. Label: {label}. "
"The frames are start, middle, and end of the same beat/scene."
),
}]
content.extend({
"type": "image_url",
"image_url": {"url": url, "detail": "low"},
} for url in image_urls)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {vision.api_key}",
}
if vision.provider == "openrouter":
headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
headers["X-Title"] = "AI Trailer Generator v2"
body = json.dumps({
"model": vision.model,
"messages": [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": content},
],
"temperature": vision.temperature,
"max_tokens": vision.max_tokens,
}).encode("utf-8")
url = f"{vision.base_url.rstrip('/')}/chat/completions"
req = urllib.request.Request(url, data=body, headers=headers, method="POST")
try:
with urllib.request.urlopen(req, timeout=vision.timeout_seconds) as resp:
data = json.loads(resp.read().decode("utf-8"))
return str(data["choices"][0]["message"]["content"]).strip()
except urllib.error.HTTPError as exc:
body_text = exc.read().decode(errors="replace")
raise RuntimeError(f"Vision HTTP {exc.code} from {url}:\n{body_text}") from exc
def _description_key(kind: str, item_id: int, start_s: float, end_s: float, cfg: AppConfig) -> str:
path = cfg.paths.reference_trailer if kind == "beat" else cfg.paths.source_movie
try:
stamp = int(path.stat().st_mtime)
except OSError:
stamp = 0
return (
f"{kind}:{item_id}:"
f"{start_s:.3f}:{end_s:.3f}:"
f"{cfg.vision.provider}:{cfg.vision.model}:{stamp}"
)
def _describe_sample(
    *,
    kind: str,
    item_id: int,
    label: str,
    video_path: Path,
    start_s: float,
    end_s: float,
    cfg: AppConfig,
    cache: dict,
    budget: list[int],
) -> str | None:
    """
    Return a vision description for one beat/scene sample, using the cache first.

    A cached entry is returned without touching the budget. Otherwise up to
    three frames are sampled from *video_path*, sent to the vision model,
    and the answer is stored in *cache* (in memory only).

    Args:
        kind, item_id, start_s, end_s: Identify the sample; they form the cache key.
        label:      Text tag passed on to the vision model.
        video_path: Video the frames are read from.
        cfg:        Application configuration.
        cache:      Cache dict with an "items" mapping; updated in place.
        budget:     One-element list holding the number of new descriptions
                    still allowed; decremented after each successful call.

    Returns:
        The description, or None when the budget is used up or fewer than
        two frames could be extracted.
    """
    cache_key = _description_key(kind, item_id, start_s, end_s, cfg)
    entry = cache["items"].get(cache_key)
    if entry:
        return str(entry.get("description", ""))

    if budget[0] <= 0:
        return None

    frames: list[str] = []
    for t_s in _sample_times(start_s, end_s):
        data_url = _frame_data_url(video_path, t_s)
        if data_url is not None:
            frames.append(data_url)
    if len(frames) < 2:
        return None

    description = _call_vision_model(label, frames, cfg)
    cache["items"][cache_key] = dict(
        kind=kind,
        item_id=item_id,
        start_s=start_s,
        end_s=end_s,
        label=label,
        description=description,
    )
    budget[0] -= 1
    return description
def _terms(text: str) -> set[str]:
    """Return the distinct lower-cased word tokens of *text* (3+ characters, starting with a letter) that are not stopwords."""
    tokens = re.findall(r"[a-zA-Z][a-zA-Z0-9_'-]{2,}", text.lower())
    return set(tokens) - _STOPWORDS
def _text_similarity(a: str, b: str) -> float:
    """
    Score the term overlap of two descriptions.

    The score is the number of shared terms divided by the size of the
    smaller term set, with the divisor never below 8 so that very short
    descriptions cannot reach a high score. Returns 0.0 when either text
    yields no terms.
    """
    terms_a, terms_b = _terms(a), _terms(b)
    if not (terms_a and terms_b):
        return 0.0
    shared = len(terms_a & terms_b)
    divisor = max(8, min(len(terms_a), len(terms_b)))
    return float(shared / divisor)
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
if max_points <= 1 or scene.duration_s <= 0:
return [scene.start_s]
usable_end = max(scene.start_s, scene.end_s - 0.2)
if usable_end <= scene.start_s:
return [scene.start_s]
step = (usable_end - scene.start_s) / max(1, max_points - 1)
return [scene.start_s + step * idx for idx in range(max_points)]
def build_vision_seed_in_points(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    cfg: AppConfig,
) -> dict[int, list[tuple[float, float]]]:
    """
    Return extra in-point seeds from cached vision descriptions.

    For each beat the beat itself is described once, the Vibe Check picks
    scene candidates, and each candidate scene is described. Scenes whose
    description is similar enough to the beat's contribute evenly spaced
    in-points. New descriptions are limited by
    cfg.vision.max_new_descriptions_per_run; cached ones cost nothing.

    The description cache is written back even if a vision request raises
    part-way through, so descriptions obtained before the failure are kept
    for the next run. The exception itself still propagates.

    Args:
        beats:  Trailer beats to find seeds for.
        scenes: Source scenes to search.
        cfg:    Application configuration (vision, cv and paths sections).

    Returns:
        Mapping beat_id -> list of (in_point_s, score) sorted by in-point.
        Beats without any seed are omitted. Empty when vision is disabled
        or either input sequence is empty.
    """
    if not cfg.vision.enabled:
        return {}
    if not beats or not scenes:
        return {}

    from src.cv.vibe_check import run_vibe_check

    cache = _load_cache(cfg)
    budget = [cfg.vision.max_new_descriptions_per_run]
    scenes_by_id = {scene.scene_id: scene for scene in scenes}
    seeds: dict[int, list[tuple[float, float]]] = {}

    try:
        for beat in beats:
            beat_desc = _describe_sample(
                kind="beat",
                item_id=beat.beat_id,
                label=f"trailer beat {beat.beat_id}",
                video_path=beat.trailer_path,
                start_s=beat.start_s,
                end_s=beat.end_s,
                cfg=cfg,
                cache=cache,
                budget=budget,
            )
            if not beat_desc:
                continue

            hits = run_vibe_check(
                beat,
                scenes,
                top_k=cfg.vision.scene_candidate_top_k,
                hist_method=cfg.cv.vibe_check.hist_compare_method,
                phash_max_distance=64,
            )

            ranked: list[tuple[float, Scene]] = []
            for hit in hits:
                scene = scenes_by_id.get(hit.scene_id)
                if scene is None:
                    continue
                scene_desc = _describe_sample(
                    kind="scene",
                    item_id=scene.scene_id,
                    label=f"source scene {scene.scene_id}",
                    video_path=scene.source_path,
                    start_s=scene.start_s,
                    end_s=scene.end_s,
                    cfg=cfg,
                    cache=cache,
                    budget=budget,
                )
                if not scene_desc:
                    continue
                score = _text_similarity(beat_desc, scene_desc)
                if score >= cfg.vision.similarity_threshold:
                    ranked.append((score, scene))
            ranked.sort(key=lambda item: item[0], reverse=True)

            points: list[tuple[float, float]] = []
            for score, scene in ranked[:cfg.vision.max_seed_scenes]:
                logger.info(
                    "Beat %d: vision seed scene=%d score=%.3f",
                    beat.beat_id,
                    scene.scene_id,
                    score,
                )
                # Scale the configured seed score by text similarity, capped
                # at 0.98 and never below the coarse candidate threshold.
                weighted_score = max(
                    cfg.cv.deep_scan.coarse_candidate_threshold,
                    min(0.98, cfg.vision.seed_score * (0.75 + min(1.0, score) * 0.25)),
                )
                points.extend(
                    (point, weighted_score)
                    for point in _scene_seed_points(scene, cfg.vision.seed_points_per_scene)
                )

            if points:
                # Collapse duplicates at millisecond resolution, keeping the best score.
                merged: dict[float, float] = {}
                for point, weighted_score in points:
                    key = round(max(0.0, point), 3)
                    merged[key] = max(weighted_score, merged.get(key, 0.0))
                seeds[beat.beat_id] = sorted((point, score) for point, score in merged.items())
    finally:
        # Persist whatever was described so far, even on failure.
        _save_cache(cfg, cache)
    return seeds
+3
View File
@@ -0,0 +1,3 @@
"""
src/pipeline/__init__.py Orchestration layer
"""
+291
View File
@@ -0,0 +1,291 @@
"""
src/pipeline/matcher.py — Top-level CV matching orchestrator
This is the single entry point for the full 2-phase CV pipeline:
Phase 0: Load / build scene index (PySceneDetect + fingerprinting)
Phase 1: Vibe Check — histogram + pHash filter → Top-K candidates per beat
Phase 2: Deep Scan — template matching → frame-accurate MatchResult per beat
Usage:
from src.core.config import load_config
from src.pipeline.matcher import run_matching
cfg = load_config()
beats = [...] # list[TrailerBeat] from trailer analysis
results = run_matching(cfg, beats)
"""
from __future__ import annotations
import logging
from typing import Sequence
from src.core.config import AppConfig
from src.core.models import MatchResult, Scene, TrailerBeat
logger = logging.getLogger(__name__)
# A seed is either a bare source time in seconds or a (time_s, score) pair;
# _merge_seed_in_points accepts and emits both shapes.
SeedPoint = float | tuple[float, float]
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
if max_points <= 1 or scene.duration_s <= 0:
return [scene.start_s]
usable_end = max(scene.start_s, scene.end_s - 0.2)
if usable_end <= scene.start_s:
return [scene.start_s]
step = (usable_end - scene.start_s) / max(1, max_points - 1)
return [scene.start_s + step * idx for idx in range(max_points)]
def _build_scene_seed_in_points(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    cfg: AppConfig,
) -> dict[int, list[float]]:
    """
    Derive scene-level in-point seeds for every beat from the Vibe Check.

    The best-scoring scenes of each beat (cfg.cv.deep_scan.scene_seed_top_k
    of them) each contribute evenly spaced in-points. The points are clamped
    to >= 0, rounded to milliseconds, de-duplicated and sorted.

    Returns:
        Mapping beat_id -> sorted in-points; beats without any point are omitted.
    """
    from src.cv.vibe_check import run_vibe_check

    scenes_by_id = {scene.scene_id: scene for scene in scenes}
    seeds: dict[int, list[float]] = {}

    for beat in beats:
        hits = run_vibe_check(
            beat,
            scenes,
            top_k=cfg.cv.deep_scan.scene_seed_top_k,
            hist_method=cfg.cv.vibe_check.hist_compare_method,
            phash_max_distance=64,
        )

        unique_points: set[float] = set()
        for hit in hits:
            scene = scenes_by_id.get(hit.scene_id)
            if scene is None:
                continue
            for raw_point in _scene_seed_points(
                scene, cfg.cv.deep_scan.scene_seed_points_per_scene
            ):
                unique_points.add(round(max(0.0, raw_point), 3))

        if not unique_points:
            continue

        seeds[beat.beat_id] = sorted(unique_points)
        logger.info(
            "Beat %d: added %d scene-level seed candidates from %d source scenes.",
            beat.beat_id,
            len(seeds[beat.beat_id]),
            len(hits),
        )
    return seeds
def _merge_seed_in_points(
*seed_maps: dict[int, Sequence[SeedPoint]] | None,
) -> dict[int, list[SeedPoint]]:
merged: dict[int, dict[float, float | None]] = {}
for seed_map in seed_maps:
if not seed_map:
continue
for beat_id, points in seed_map.items():
beat_points = merged.setdefault(beat_id, {})
for point in points:
if isinstance(point, tuple):
t_sec = round(max(0.0, float(point[0])), 3)
score = float(point[1])
else:
t_sec = round(max(0.0, float(point)), 3)
score = None
old_score = beat_points.get(t_sec)
if old_score is None:
beat_points[t_sec] = score
elif score is not None:
beat_points[t_sec] = max(old_score, score)
result: dict[int, list[SeedPoint]] = {}
for beat_id, points in merged.items():
result[beat_id] = [
(t_sec, score) if score is not None else t_sec
for t_sec, score in sorted(points.items())
]
return result
# ---------------------------------------------------------------------------
# Beat fingerprinting
# ---------------------------------------------------------------------------
def fingerprint_beats(
    beats: Sequence[TrailerBeat],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """
    Attach a visual fingerprint (histograms + pHash) to every TrailerBeat.

    For each beat the frame at beat.midpoint_s is read from
    beat.trailer_path and fingerprinted with the cfg.cv.vibe_check
    settings. A beat whose frame cannot be decoded is passed through
    unchanged and a warning is logged.

    Args:
        beats: TrailerBeat list to enrich.
        cfg:   Application configuration.

    Returns:
        New list, in input order, of TrailerBeat objects with luma_hist,
        sat_hist and phash set wherever a frame was available.
    """
    from dataclasses import replace
    from src.cv.fingerprinting import fingerprint_frame
    from src.cv.frame_extractor import grab_frame_at_path

    vc_cfg = cfg.cv.vibe_check
    enriched: list[TrailerBeat] = []
    for beat in beats:
        frame = grab_frame_at_path(beat.trailer_path, beat.midpoint_s)
        if frame is not None:
            luma_hist, sat_hist, phash = fingerprint_frame(frame, vc_cfg)
            enriched.append(
                replace(beat, luma_hist=luma_hist, sat_hist=sat_hist, phash=phash)
            )
        else:
            logger.warning("Beat %d: cannot decode midpoint frame, leaving unfingerpinted.", beat.beat_id)
            enriched.append(beat)

    fingerprinted = sum(1 for b in enriched if b.phash)
    logger.info("Fingerprinted %d / %d beats.", fingerprinted, len(beats))
    return enriched
# ---------------------------------------------------------------------------
# Main pipeline entry point
# ---------------------------------------------------------------------------
def run_matching(
    cfg: AppConfig,
    beats: Sequence[TrailerBeat],
    force_reindex: bool = False,
    seed_in_points: dict[int, Sequence[SeedPoint]] | None = None,
) -> list[MatchResult]:
    """
    Execute the full CV matching pipeline.

    Steps: build the scene index, fingerprint the beats, collect seed
    in-points (caller-supplied, scene-level and optionally vision-derived)
    and hand everything to the FFmpeg global scan.

    Args:
        cfg:            Application configuration (loaded from config.toml).
        beats:          All trailer beats to source (must have trailer_path set).
        force_reindex:  If True, ignore the scene cache and re-run PySceneDetect.
        seed_in_points: Optional caller-supplied seeds per beat_id. Each seed
                        is a timestamp or a (timestamp, score) tuple; they are
                        merged with the scene-level and vision seeds before
                        the scan.

    Returns:
        List of MatchResult, one per beat (unmatched beats are omitted).
        Results are in the same order as the input beats.
    """
    from src.cv.scene_indexer import build_scene_index

    logger.info("=" * 60)
    logger.info("AI Trailer Generator v2 — CV Matching Pipeline")
    logger.info("Source : %s", cfg.paths.source_movie.name)
    logger.info("Trailer: %s", cfg.paths.reference_trailer.name)
    logger.info("Beats  : %d", len(beats))
    logger.info("=" * 60)

    # ------------------------------------------------------------------
    # Phase 0: Scene index
    # ------------------------------------------------------------------
    logger.info("[Phase 0] Building scene index …")
    scenes: list[Scene] = build_scene_index(cfg, force_reindex=force_reindex)
    logger.info("[Phase 0] %d scenes indexed.", len(scenes))

    # ------------------------------------------------------------------
    # Phase 0b: Fingerprint the beats
    # ------------------------------------------------------------------
    logger.info("[Phase 0b] Fingerprinting %d trailer beats …", len(beats))
    beats = fingerprint_beats(beats, cfg)

    # ------------------------------------------------------------------
    # Phase 1 & 2: Global Scan, seeded from the scene index and, when
    # enabled, from the vision cache.
    # ------------------------------------------------------------------
    logger.info("[Phase 1 & 2] Running FFmpeg Global Scan for %d beats ...", len(beats))
    from src.cv.global_scan import run_global_scan

    scene_seed_in_points = _build_scene_seed_in_points(beats, scenes, cfg)

    vision_seed_in_points: dict[int, Sequence[SeedPoint]] = {}
    if cfg.vision.enabled:
        # Vision seeding is best-effort: any failure degrades to CV-only
        # seeds instead of aborting the whole matching run.
        try:
            from src.llm.vision_cache import build_vision_seed_in_points

            vision_seed_in_points = build_vision_seed_in_points(beats, scenes, cfg)
        except Exception as exc:
            logger.error("Vision seeding failed: %s — continuing with CV-only seeds.", exc)

    results = run_global_scan(
        beats,
        cfg,
        scenes=scenes,
        seed_in_points=_merge_seed_in_points(
            seed_in_points,
            scene_seed_in_points,
            vision_seed_in_points,
        ),
    )

    logger.info("[Phase 1 & 2] Done. %d / %d beats matched.", len(results), len(beats))
    logger.info("=" * 60)
    return results
# ---------------------------------------------------------------------------
# Convenience: build an EditTimeline from match results
# ---------------------------------------------------------------------------
def build_timeline(
    beats: Sequence[TrailerBeat],
    results: Sequence[MatchResult],
    cfg: AppConfig,
) -> "src.core.models.EditTimeline":  # type: ignore[name-defined]
    """
    Combine beats + match results into an ordered EditTimeline.

    The timeline cursor advances by each beat's full trailer duration, so
    every clip sits at the position its beat occupies in the trailer.
    Unmatched beats produce no clip and leave a gap of their duration.
    When the matched source is shorter than the beat, the remainder is
    recorded on the clip as ``trailer_tail_s``.

    Args:
        beats:   All trailer beats (defines order + durations).
        results: MatchResult list from run_matching().
        cfg:     Application configuration.

    Returns:
        EditTimeline ready for FCPXML / EDL export.
    """
    from src.core.models import EditClip, EditTimeline

    match_for_beat: dict[int, MatchResult] = {}
    for result in results:
        match_for_beat[result.beat_id] = result

    clips: list[EditClip] = []
    cursor = 0.0

    for beat in beats:
        match = match_for_beat.get(beat.beat_id)
        if match is not None:
            usable_match_s = max(0.0, match.duration_s)
            if usable_match_s > 0:
                source_duration = min(beat.duration_s, usable_match_s)
            else:
                # A zero-length match is treated as covering the whole beat.
                source_duration = beat.duration_s
            trailer_tail_s = max(0.0, beat.duration_s - source_duration)

            if trailer_tail_s > 0:
                logger.warning(
                    "Beat %d uses %.2fs source + %.2fs generated trailer tail.",
                    beat.beat_id,
                    source_duration,
                    trailer_tail_s,
                )

            clips.append(
                EditClip(
                    clip_index=len(clips),
                    beat=beat,
                    match=match,
                    timeline_start_s=cursor,
                    timeline_end_s=cursor + beat.duration_s,
                    source_duration_s=source_duration,
                    trailer_tail_s=trailer_tail_s,
                )
            )
        else:
            logger.warning("Beat %d has no match — gap in timeline.", beat.beat_id)

        # Matched or not, the beat consumes its full slot on the timeline.
        cursor += beat.duration_s

    timeline = EditTimeline(
        title=cfg.paths.reference_trailer.stem,
        frame_rate=cfg.export.edl_frame_rate,
        clips=tuple(clips),
    )
    logger.info(
        "Timeline built: %d clips, total duration %.2fs",
        timeline.clip_count, timeline.total_duration_s,
    )
    return timeline
+427
View File
@@ -0,0 +1,427 @@
"""
src/pipeline/reporter.py — Visual Match Report Generator
Generates an HTML file containing side-by-side video clips of:
Left: The original beat from the reference trailer
Right: The matched scene from the source movie
This allows instant visual verification of the CV pipeline's results.
"""
from __future__ import annotations
import logging
import subprocess
from pathlib import Path
from src.core.config import AppConfig
logger = logging.getLogger(__name__)
def _extract_clip(video_path: Path, start_s: float, duration_s: float, out_path: Path) -> None:
    """Use ffmpeg to extract a silent, low-res preview clip.

    The clip starts at ``start_s`` in ``video_path``, lasts ``duration_s``
    seconds, is scaled to 640 px width and written to ``out_path`` (parent
    directories are created). A failing ffmpeg run is logged, not raised.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Two-stage seek: a fast input seek to (at most) two seconds before the
    # target, then an accurate output seek for the remainder. A plain
    # "-ss before -i" can land on a nearby keyframe and make the report look
    # several frames out of sync.
    if start_s >= 2.0:
        coarse_seek_s = max(0.0, start_s - 2.0)
    else:
        coarse_seek_s = max(0.0, start_s)
    fine_seek_s = start_s - coarse_seek_s

    seek_args = [
        "-ss", str(coarse_seek_s),
        "-i", str(video_path),
        "-ss", str(fine_seek_s),
        "-t", str(duration_s),
    ]
    encode_args = [
        "-map", "0:v:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-vf", "scale=640:-2",  # scale down for lightweight report
        "-an",  # no audio
        "-movflags", "+faststart",
    ]
    command = ["ffmpeg", "-y", "-loglevel", "error", *seek_args, *encode_args, str(out_path)]

    completed = subprocess.run(command, capture_output=True)
    if completed.returncode == 0:
        return
    logger.error(
        "ffmpeg clip extraction failed for %s:\n%s",
        out_path.name, completed.stderr.decode(errors="replace")
    )
def _extract_clip_with_black_tail(
    video_path: Path,
    start_s: float,
    source_duration_s: float,
    total_duration_s: float,
    out_path: Path,
) -> None:
    """Extract a source preview and append black frames for trailer-only tails.

    Args:
        video_path:        Video to cut the preview from.
        start_s:           In-point inside ``video_path``, in seconds.
        source_duration_s: Length of the real footage to extract.
        total_duration_s:  Desired length of the finished preview; the
                           difference to ``source_duration_s`` is filled
                           with black.
        out_path:          Destination .mp4 (parent directories are created).

    Failures of any ffmpeg step are logged, not raised. The two intermediate
    files next to ``out_path`` are removed on every exit path.
    """
    tail_s = max(0.0, total_duration_s - source_duration_s)
    if tail_s <= 0.02:
        # Tail of at most 20 ms: a plain extract is enough.
        _extract_clip(video_path, start_s, source_duration_s, out_path)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)
    source_tmp = out_path.with_name(f"{out_path.stem}_source_tmp.mp4")
    tail_tmp = out_path.with_name(f"{out_path.stem}_tail_tmp.mp4")

    preroll_s = 2.0 if start_s >= 2.0 else 0.0
    input_seek_s = max(0.0, start_s - preroll_s)
    accurate_seek_s = start_s - input_seek_s

    # First render the matched source portion with the same accurate seek path
    # as _extract_clip(). Using trim=start=... after an input seek is brittle
    # because FFmpeg may preserve non-zero packet timestamps around keyframes.
    source_cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-ss", str(input_seek_s),
        "-i", str(video_path),
        "-ss", str(accurate_seek_s),
        "-t", str(source_duration_s),
        "-map", "0:v:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
        "-an",
        "-movflags", "+faststart",
        str(source_tmp),
    ]
    # Black filler with the same geometry and frame rate as the source part,
    # so the concat filter accepts both inputs.
    tail_cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-f", "lavfi",
        "-i", f"color=c=black:s=640x360:r=25:d={tail_s}",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-an",
        "-movflags", "+faststart",
        str(tail_tmp),
    ]
    concat_cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", str(source_tmp),
        "-i", str(tail_tmp),
        "-filter_complex", "[0:v][1:v]concat=n=2:v=1:a=0[v]",
        "-map", "[v]",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-an",
        "-movflags", "+faststart",
        str(out_path),
    ]
    steps = (
        (source_cmd, "ffmpeg source preview extraction failed for %s:\n%s"),
        (tail_cmd, "ffmpeg black tail render failed for %s:\n%s"),
        (concat_cmd, "ffmpeg tailed preview concat failed for %s:\n%s"),
    )

    try:
        for cmd, failure_msg in steps:
            result = subprocess.run(cmd, capture_output=True)
            if result.returncode != 0:
                logger.error(
                    failure_msg,
                    out_path.name,
                    result.stderr.decode(errors="replace"),
                )
                # Later steps depend on this one; stop here.
                return
    finally:
        # Runs on success, on an aborted step and on exceptions alike, so
        # intermediate files never pile up in the report directory.
        for tmp in (source_tmp, tail_tmp):
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup: a stuck temp file must not fail the report.
                pass
def _extract_segmented_clip(
    video_path: Path,
    segments: list,
    total_duration_s: float,
    out_path: Path,
) -> None:
    """Render a beat-length source preview from multiple matched source islands.

    Segments are laid out in order of their ``trailer_offset_s``; gaps
    between them, and after the last one up to ``total_duration_s``, are
    filled with black. Each segment must expose ``trailer_offset_s``,
    ``duration_s`` and ``in_point_s``.

    Args:
        video_path:       Source video the segments are cut from.
        segments:         Matched segments; an empty list yields an all-black
                          placeholder via _extract_clip_with_black_tail().
        total_duration_s: Target length of the finished preview in seconds.
        out_path:         Destination .mp4 (parent directories are created).

    ffmpeg failures are logged, not raised. Every intermediate part file,
    including those of failed renders, is removed before returning.
    """
    if not segments:
        _extract_clip_with_black_tail(video_path, 0.0, 0.0, total_duration_s, out_path)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_paths: list[Path] = []      # successfully rendered parts, playback order
    scratch_paths: list[Path] = []  # every part file ffmpeg may have touched
    cursor = 0.0

    def render_part(tmp: Path, cmd: list, failure_msg: str) -> None:
        """Run one ffmpeg part render and register its output file."""
        scratch_paths.append(tmp)
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode == 0 and tmp.exists():
            tmp_paths.append(tmp)
        else:
            logger.error(failure_msg, result.stderr.decode(errors="replace"))

    def add_black(duration_s: float) -> None:
        """Append a black filler part; spans of 20 ms or less are skipped."""
        if duration_s <= 0.02:
            return
        tmp = out_path.with_name(f"{out_path.stem}_part_{len(tmp_paths):03d}_black.mp4")
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-f", "lavfi",
            "-i", f"color=c=black:s=640x360:r=25:d={duration_s}",
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-an", "-movflags", "+faststart",
            str(tmp),
        ]
        render_part(tmp, cmd, "ffmpeg black segment render failed:\n%s")

    def add_source(start_s: float, duration_s: float) -> None:
        """Append real footage from the source; spans of 20 ms or less are skipped."""
        if duration_s <= 0.02:
            return
        tmp = out_path.with_name(f"{out_path.stem}_part_{len(tmp_paths):03d}_src.mp4")
        # Same two-stage seek as _extract_clip(): coarse input seek, then an
        # accurate output seek for the remainder.
        preroll_s = 2.0 if start_s >= 2.0 else 0.0
        input_seek_s = max(0.0, start_s - preroll_s)
        accurate_seek_s = start_s - input_seek_s
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", str(input_seek_s),
            "-i", str(video_path),
            "-ss", str(accurate_seek_s),
            "-t", str(duration_s),
            "-map", "0:v:0",
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
            "-an", "-movflags", "+faststart",
            str(tmp),
        ]
        render_part(tmp, cmd, "ffmpeg source segment render failed:\n%s")

    try:
        for segment in sorted(segments, key=lambda s: s.trailer_offset_s):
            offset_s = max(0.0, float(segment.trailer_offset_s))
            duration_s = max(0.0, float(segment.duration_s))
            add_black(offset_s - cursor)  # gap before this segment, if any
            add_source(float(segment.in_point_s), duration_s)
            cursor = max(cursor, offset_s + duration_s)
        add_black(total_duration_s - cursor)  # unmatched tail

        if not tmp_paths:
            # Nothing rendered; an ffmpeg concat with zero inputs would only fail.
            logger.error("No preview parts could be rendered for %s.", out_path.name)
            return

        if len(tmp_paths) == 1:
            # A single part already is the finished preview.
            tmp_paths[0].replace(out_path)
            return

        inputs: list[str] = []
        labels: list[str] = []
        for idx, tmp in enumerate(tmp_paths):
            inputs.extend(["-i", str(tmp)])
            labels.append(f"[{idx}:v]")
        filter_complex = "".join(labels) + f"concat=n={len(tmp_paths)}:v=1:a=0[v]"

        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            *inputs,
            "-filter_complex", filter_complex,
            "-map", "[v]",
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-an", "-movflags", "+faststart",
            str(out_path),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            logger.error(
                "ffmpeg segmented preview concat failed:\n%s",
                result.stderr.decode(errors="replace"),
            )
    finally:
        # Remove every part file, including partial output of failed renders.
        for tmp in scratch_paths:
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup: a stuck temp file must not fail the report.
                pass
def _build_frame_locked_compare(ref_path: Path, src_path: Path, out_path: Path) -> None:
    """Render reference and source into one side-by-side video stream.

    Both inputs are normalised to 25 fps and letterboxed into 640x360 before
    being stacked horizontally (reference left, source right). The result is
    written to ``out_path``; an ffmpeg failure is logged, not raised.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Identical normalisation chain for both inputs so hstack sees matching
    # frame sizes and timestamps starting at zero.
    normalize = ",".join([
        "fps=25",
        "scale=640:360:force_original_aspect_ratio=decrease",
        "pad=640:360:(ow-iw)/2:(oh-ih)/2",
        "setsar=1",
        "setpts=PTS-STARTPTS",
    ])
    filter_graph = ";".join([
        f"[0:v]{normalize}[ref]",
        f"[1:v]{normalize}[src]",
        "[ref][src]hstack=inputs=2[v]",
    ])

    command = ["ffmpeg", "-y", "-loglevel", "error"]
    command += ["-i", str(ref_path), "-i", str(src_path)]
    command += ["-filter_complex", filter_graph, "-map", "[v]"]
    command += ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28"]
    command += ["-an", "-movflags", "+faststart", str(out_path)]

    completed = subprocess.run(command, capture_output=True)
    if completed.returncode == 0:
        return
    logger.error(
        "ffmpeg compare render failed for %s:\n%s",
        out_path.name,
        completed.stderr.decode(errors="replace"),
    )
def generate_report(beats: list, results: list, cfg: AppConfig) -> Path:
    """
    Generate an HTML side-by-side report.

    For every beat a reference clip is cut from the trailer. Matched beats
    additionally get a source preview (black-filled where nothing matched)
    and a combined side-by-side compare clip, which is what the page embeds.
    Unmatched beats show the reference clip next to a "No Match" placeholder.
    All clips and the page are written to ``<output_dir>/report``.

    Returns the path to the .html file.
    """
    report_dir = cfg.paths.output_dir / "report"
    report_dir.mkdir(parents=True, exist_ok=True)
    html_path = report_dir / "match_report.html"

    results_by_beat = {r.beat_id: r for r in results}

    logger.info("Generating report clips in %s (this might take a moment) ...", report_dir)

    css_rules = [
        "body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: #0f0f0f; color: #e0e0e0; margin: 40px; }",
        "h1 { color: #fff; border-bottom: 1px solid #333; padding-bottom: 10px; }",
        ".stats { font-size: 1.2em; margin-bottom: 30px; color: #aaa; }",
        ".beat-row { display: flex; margin-bottom: 30px; background: #1a1a1a; padding: 20px; border-radius: 12px; border: 1px solid #333; }",
        ".info { width: 250px; padding-right: 20px; flex-shrink: 0; }",
        ".info h3 { margin-top: 0; color: #fff; }",
        ".video-container { display: flex; gap: 20px; flex-grow: 1; }",
        ".videos { flex-grow: 1; }",
        ".compare { margin-bottom: 18px; }",
        ".video-col { flex: 1; }",
        ".video-col p { margin-top: 0; font-weight: bold; color: #888; }",
        "video { width: 100%; border-radius: 6px; box-shadow: 0 4px 6px rgba(0,0,0,0.5); background: #000; }",
        ".status-match { color: #4ade80; font-weight: bold; font-size: 1.1em; }",
        ".status-miss { color: #f87171; font-weight: bold; font-size: 1.1em; }",
        ".score { font-family: monospace; font-size: 1.1em; color: #60a5fa; }",
        ".code-hint { background: #000; padding: 10px; border-radius: 4px; font-family: monospace; font-size: 0.9em; margin-top: 15px; color: #a3e635; }",
    ]

    # Keeps a second <video> in a row aligned with the first one; rows with
    # fewer than two videos are left alone by the early return.
    sync_script = [
        "function syncBeat(row) {",
        " const vids = row.querySelectorAll('video');",
        " if (vids.length < 2) return;",
        " const ref = vids[0];",
        " const src = vids[1];",
        " let syncing = false;",
        " function align() {",
        " if (syncing) return;",
        " syncing = true;",
        " const target = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02));",
        " if (Math.abs(src.currentTime - target) > 0.035) src.currentTime = target;",
        " if (ref.paused && !src.paused) src.pause();",
        " if (!ref.paused && src.paused) src.play().catch(() => {});",
        " syncing = false;",
        " }",
        " ref.addEventListener('play', () => { src.currentTime = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02)); src.play().catch(() => {}); });",
        " ref.addEventListener('pause', () => src.pause());",
        " ref.addEventListener('seeked', () => { src.currentTime = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02)); });",
        " ref.addEventListener('timeupdate', align);",
        "}",
        "document.addEventListener('DOMContentLoaded', () => document.querySelectorAll('.beat-row').forEach(syncBeat));",
    ]

    page = [
        "<!DOCTYPE html>",
        "<html><head><meta charset='utf-8'><title>AI Trailer Match Report</title>",
        "<style>",
        *css_rules,
        "</style></head><body>",
        "<h1>AI Trailer Generator — Match Report</h1>",
        f"<div class='stats'>Total Beats: {len(beats)} | Matched: {len(results)}</div>",
        "<script>",
        *sync_script,
        "</script>",
    ]

    for beat in beats:
        match = results_by_beat.get(beat.beat_id)
        clip_stem = f"beat_{beat.beat_id:03d}"

        # Reference clip straight from the trailer.
        ref_mp4 = report_dir / f"{clip_stem}_ref.mp4"
        _extract_clip(beat.trailer_path, beat.start_s, beat.duration_s, ref_mp4)

        # Row start + info panel header.
        page.extend([
            "<div class='beat-row'>",
            "<div class='info'>",
            f"<h3>Beat {beat.beat_id:03d}</h3>",
            f"<p><b>Type:</b> {beat.beat_type.name}</p>",
            f"<p><b>Trailer:</b> {beat.start_s:.2f}s &rarr; {beat.end_s:.2f}s</p>",
        ])

        compare_mp4 = None
        if match:
            segments = list(getattr(match, "segments", ()) or [])
            if segments:
                source_duration = sum(max(0.0, float(s.duration_s)) for s in segments)
            else:
                source_duration = max(0.0, match.out_point_s - match.in_point_s)

            if source_duration > 0:
                preview_duration = min(beat.duration_s, source_duration)
            else:
                preview_duration = beat.duration_s

            # Where the matched material ends inside the beat; anything after
            # that is an unmatched tail.
            last_segment_end = max(
                (float(s.trailer_offset_s) + float(s.duration_s) for s in segments),
                default=preview_duration,
            )
            trailer_tail_s = max(0.0, beat.duration_s - last_segment_end)
            has_tail = trailer_tail_s > 0

            if getattr(match, "is_confirmed", True):
                page.append("<p class='status-match'>MATCHED</p>")
            else:
                page.append("<p style='color: #fbbf24; font-weight: bold; font-size: 1.1em;'>PROVISIONAL MATCH</p>")

            page.append(f"<p><b>Scene ID:</b> {match.scene_id}</p>")
            page.append(f"<p><b>Movie In:</b> {match.in_point_s:.2f}s</p>")
            page.append(f"<p><b>Source Dur:</b> {source_duration:.2f}s</p>")
            if len(segments) > 1:
                page.append(f"<p><b>Segments:</b> {len(segments)} matched visual islands</p>")
            if has_tail:
                page.append(f"<p><b>Unmatched Tail:</b> {trailer_tail_s:.2f}s placeholder</p>")
            page.append(f"<p><b>Score:</b> <span class='score'>{match.match_score:.3f}</span></p>")
            if has_tail:
                page.append("<p style='color: #fbbf24; font-size: 0.9em;'>Some trailer frames are still unmatched; report fills only those gaps with placeholder black.</p>")

            # Warn if score is low
            if match.match_score < 0.80:
                page.append("<p style='color: #fbbf24; font-size: 0.9em;'>⚠️ Score below 0.80. Verify visually.</p>")

            # Source preview, then the combined compare clip.
            src_mp4 = report_dir / f"{clip_stem}_src.mp4"
            compare_mp4 = report_dir / f"{clip_stem}_compare.mp4"
            if segments:
                _extract_segmented_clip(match.source_path, segments, beat.duration_s, src_mp4)
            else:
                _extract_clip_with_black_tail(
                    match.source_path,
                    match.in_point_s,
                    preview_duration,
                    beat.duration_s,
                    src_mp4,
                )
            _build_frame_locked_compare(ref_mp4, src_mp4, compare_mp4)
        else:
            page.append("<p class='status-miss'>NO MATCH</p>")
            page.append(f"<div class='code-hint'>python cli.py rematch --beat {beat.beat_id}</div>")

        page.append("</div>")  # /info

        # Video panel
        page.append("<div class='videos'>")
        if compare_mp4 is not None:
            page.append(f"<div class='compare'><p>Frame-Locked Compare</p><video src='{compare_mp4.name}' controls loop muted autoplay></video></div>")
        else:
            page.extend([
                "<div class='video-container'>",
                f"<div class='video-col'><p>Reference Trailer</p><video src='{ref_mp4.name}' controls loop muted autoplay></video></div>",
                "<div class='video-col'><p>Matched Source</p><div style='width: 100%; aspect-ratio: 16/9; background: #222; display: flex; align-items: center; justify-content: center; border-radius: 6px; color: #555;'>No Match</div></div>",
                "</div>",  # /video-container
            ])
        page.append("</div>")  # /videos
        page.append("</div>")  # /beat-row

    page.append("</body></html>")
    html_path.write_text("\n".join(page), encoding="utf-8")
    return html_path
+175
View File
@@ -0,0 +1,175 @@
"""
src/pipeline/trailer_analyzer.py — Reference trailer → list[TrailerBeat]
Responsibility:
1. Run PySceneDetect on the REFERENCE TRAILER (not the source movie)
to detect cut boundaries → raw beat intervals
2. Fingerprint the midpoint frame of each beat (for Vibe Check)
3. Transcribe dialogue per beat via Whisper (optional, injected)
4. Optionally classify BeatType via the LLM dramaturg (injected)
Returns: list[TrailerBeat] ready to feed into run_matching().
"""
from __future__ import annotations
import logging
from dataclasses import replace
from pathlib import Path
from typing import Callable, Sequence
from src.core.config import AppConfig
from src.core.models import BeatType, DialogueLine, TrailerBeat
from src.cv.fingerprinting import fingerprint_frame
from src.cv.frame_extractor import grab_midpoint_frame, open_video
logger = logging.getLogger(__name__)
# Injection type aliases — keeps this module free of hard audio/LLM imports
TranscribeCallback = Callable[[Path, float, float, float], list[DialogueLine]]
ClassifyCallback = Callable[[list[TrailerBeat]], list[TrailerBeat]]
# ---------------------------------------------------------------------------
# Step 1: Scene detection on the reference trailer
# ---------------------------------------------------------------------------
def _detect_trailer_beats(cfg: AppConfig) -> list[tuple[float, float, int, int]]:
    """
    Detect cut boundaries in the reference trailer with PySceneDetect.

    The ContentDetector is configured from ``cfg.scene_detection``
    (``content_threshold`` and ``min_scene_duration_s``, the latter
    converted to frames with the trailer's frame rate).

    Returns:
        One ``(start_s, end_s, start_frame, end_frame)`` tuple per detected
        scene, in the order PySceneDetect reports them.

    Raises:
        ImportError: if the ``scenedetect`` package is not installed.
    """
    try:
        from scenedetect import open_video as sd_open_video, SceneManager
        from scenedetect.detectors import ContentDetector
    except ImportError:
        raise ImportError("pip install scenedetect[opencv]")

    trailer_path = cfg.paths.reference_trailer
    video = sd_open_video(str(trailer_path))

    threshold = cfg.scene_detection.content_threshold
    min_len_frames = int(cfg.scene_detection.min_scene_duration_s * video.frame_rate)
    detector = ContentDetector(threshold=threshold, min_scene_len=min_len_frames)

    manager = SceneManager()
    manager.add_detector(detector)

    logger.info("Detecting beats in reference trailer: %s", trailer_path.name)
    manager.detect_scenes(video=video, show_progress=False)

    intervals: list[tuple[float, float, int, int]] = []
    for cut_in, cut_out in manager.get_scene_list():
        intervals.append(
            (cut_in.get_seconds(), cut_out.get_seconds(), cut_in.get_frames(), cut_out.get_frames())
        )

    logger.info("Detected %d beats in reference trailer.", len(intervals))
    return intervals
# ---------------------------------------------------------------------------
# Step 2: Fingerprint beats
# ---------------------------------------------------------------------------
def _fingerprint_beats(
    raw_beats: list[tuple[float, float, int, int]],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """Turn raw beat intervals into TrailerBeat objects with fingerprints.

    The trailer is opened once; for every interval the midpoint frame is
    grabbed and fingerprinted. Intervals whose midpoint frame cannot be
    decoded still produce a TrailerBeat, just without luma_hist, sat_hist
    and phash. ``beat_id`` is the interval's position in ``raw_beats``.
    """
    vc_cfg = cfg.cv.vibe_check
    trailer_path = cfg.paths.reference_trailer
    fingerprinted: list[TrailerBeat] = []

    with open_video(trailer_path) as capture:
        for beat_id, interval in enumerate(raw_beats):
            start_s, end_s, start_frame, end_frame = interval
            beat_fields = {
                "beat_id": beat_id,
                "trailer_path": trailer_path,
                "start_s": start_s,
                "end_s": end_s,
                "start_frame": start_frame,
                "end_frame": end_frame,
            }

            frame = grab_midpoint_frame(capture, start_s, end_s)
            if frame is None:
                logger.warning("Beat %d: midpoint frame decode failed.", beat_id)
            else:
                luma_b, sat_b, phash = fingerprint_frame(frame, vc_cfg)
                beat_fields.update(luma_hist=luma_b, sat_hist=sat_b, phash=phash)

            fingerprinted.append(TrailerBeat(**beat_fields))

    return fingerprinted
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def analyze_reference_trailer(
    cfg: AppConfig,
    transcribe_callback: TranscribeCallback | None = None,
    classify_callback: ClassifyCallback | None = None,
) -> list[TrailerBeat]:
    """
    Full reference-trailer analysis pipeline.

    Args:
        cfg:                 Application configuration.
        transcribe_callback: Optional fn(path, start_s, end_s, offset_s)
                             -> list[DialogueLine]. Injected to keep this
                             module free of faster-whisper imports. A failure
                             for one beat is logged and that beat is kept
                             without dialogue.
        classify_callback:   Optional fn(beats) -> beats with BeatType set.
                             Injected to keep this module LLM-free. On
                             failure the unclassified beats are kept.

    Returns:
        List of TrailerBeat objects with fingerprints (and optionally
        dialogue + BeatType) populated.
    """
    # Steps 1 + 2: cut detection, then fingerprinting
    beats = _fingerprint_beats(_detect_trailer_beats(cfg), cfg)

    # Step 3: dialogue (optional)
    if transcribe_callback is not None:
        with_dialogue: list[TrailerBeat] = []
        for beat in beats:
            try:
                spoken = transcribe_callback(
                    beat.trailer_path,
                    beat.start_s,
                    beat.end_s,
                    beat.start_s,  # time_offset so timestamps are absolute
                )
                updated = replace(beat, dialogue=tuple(spoken))
            except Exception as exc:
                logger.warning("Beat %d transcription failed: %s", beat.beat_id, exc)
                updated = beat
            with_dialogue.append(updated)
        beats = with_dialogue

    # Step 4: LLM dramaturgy (optional)
    if classify_callback is not None:
        try:
            beats = classify_callback(beats)
        except Exception as exc:
            logger.warning("Beat classification failed: %s — keeping UNKNOWN.", exc)

    dialogue_count = len([b for b in beats if b.dialogue])
    classified_count = len([b for b in beats if b.beat_type != BeatType.UNKNOWN])
    logger.info(
        "Trailer analysis complete: %d beats, %d with dialogue, %d classified.",
        len(beats),
        dialogue_count,
        classified_count,
    )
    return beats
+1
View File
@@ -0,0 +1 @@
# tests package
+144
View File
@@ -0,0 +1,144 @@
"""
tests/test_config.py Smoke tests for config loading and model integrity.
Run with: pytest tests/test_config.py -v
"""
from pathlib import Path
import pytest
from src.core.config import load_config, AppConfig
from src.core.models import (
Scene, TrailerBeat, MatchResult, VibeHit,
EditClip, EditTimeline, BeatType, DialogueLine,
)
# config.toml in the directory one level above the one containing this test file.
CONFIG_PATH = Path(__file__).parents[1] / "config.toml"
# ---------------------------------------------------------------------------
# Config loader
# ---------------------------------------------------------------------------
class TestConfigLoader:
    """Smoke tests for load_config() against the config.toml at CONFIG_PATH."""

    def test_loads_without_error(self) -> None:
        loaded = load_config(CONFIG_PATH)
        assert isinstance(loaded, AppConfig)

    def test_project_meta(self) -> None:
        loaded = load_config(CONFIG_PATH)
        known_levels = ("DEBUG", "INFO", "WARNING", "ERROR")
        assert loaded.version == "2.0.0"
        assert loaded.log_level in known_levels

    def test_cv_thresholds_in_range(self) -> None:
        deep_scan = load_config(CONFIG_PATH).cv.deep_scan
        # The match threshold must lie strictly inside (0, 1).
        assert 0.0 < deep_scan.match_threshold < 1.0
        assert deep_scan.coarse_step_seconds > 0

    def test_vibe_check_crop_fractions(self) -> None:
        vibe = load_config(CONFIG_PATH).cv.vibe_check
        top = vibe.crop_top_fraction
        bottom = vibe.crop_bottom_fraction
        assert 0.0 < top < 1.0
        assert 0.0 < bottom < 1.0
        # Both crops together must leave some of the frame.
        assert top + bottom < 1.0

    def test_missing_config_raises(self, tmp_path: Path) -> None:
        missing = tmp_path / "nonexistent.toml"
        with pytest.raises(FileNotFoundError):
            load_config(missing)

    def test_paths_are_path_objects(self) -> None:
        paths = load_config(CONFIG_PATH).paths
        assert isinstance(paths.source_movie, Path)
        assert isinstance(paths.reference_trailer, Path)
# ---------------------------------------------------------------------------
# Data models — construction & properties
# ---------------------------------------------------------------------------
class TestSceneModel:
    """Construction and derived properties of Scene."""

    def test_duration(self) -> None:
        scene = Scene(
            scene_id=0,
            source_path=Path("dummy.mp4"),
            start_s=10.0,
            end_s=25.5,
            start_frame=240,
            end_frame=612,
        )
        expected_duration = pytest.approx(15.5)
        expected_midpoint = pytest.approx(17.75)
        assert scene.duration_s == expected_duration
        assert scene.midpoint_s == expected_midpoint

    def test_immutable(self) -> None:
        scene = Scene(
            scene_id=0,
            source_path=Path("x.mp4"),
            start_s=0.0,
            end_s=1.0,
            start_frame=0,
            end_frame=24,
        )
        # Assigning to a field is expected to raise (FrozenInstanceError).
        with pytest.raises(Exception):
            scene.scene_id = 99  # type: ignore[misc]
class TestTrailerBeatModel:
    """Defaults of TrailerBeat."""

    def test_beat_type_default(self) -> None:
        # No beat_type is passed, so the default must apply.
        beat = TrailerBeat(
            beat_id=0,
            trailer_path=Path("trailer.mp4"),
            start_s=0.0,
            end_s=3.0,
            start_frame=0,
            end_frame=72,
        )
        assert beat.beat_type == BeatType.UNKNOWN
class TestMatchResultModel:
    """Derived duration and repr of MatchResult."""

    def test_duration_computed(self) -> None:
        result = MatchResult(
            beat_id=0,
            scene_id=3,
            source_path=Path("movie.mp4"),
            in_point_s=120.0,
            out_point_s=123.5,
            in_point_frame=2880,
            match_score=0.87,
        )
        # 123.5 - 120.0
        assert result.duration_s == pytest.approx(3.5)

    def test_repr_contains_key_info(self) -> None:
        result = MatchResult(
            beat_id=1,
            scene_id=7,
            source_path=Path("movie.mp4"),
            in_point_s=60.0,
            out_point_s=63.0,
            in_point_frame=1440,
            match_score=0.91,
        )
        text = repr(result)
        assert "beat=1" in text
        assert "scene=7" in text
class TestEditTimeline:
    """Aggregate properties of EditTimeline."""

    def _make_clip(self, idx: int, t_start: float, t_end: float) -> EditClip:
        """Build an EditClip whose beat and match both span t_start..t_end."""
        span_s = t_end - t_start
        beat = TrailerBeat(
            beat_id=idx,
            trailer_path=Path("t.mp4"),
            start_s=t_start,
            end_s=t_end,
            start_frame=0,
            end_frame=1,
        )
        match = MatchResult(
            beat_id=idx,
            scene_id=0,
            source_path=Path("m.mp4"),
            in_point_s=0.0,
            out_point_s=span_s,
            in_point_frame=0,
            match_score=0.9,
        )
        return EditClip(
            clip_index=idx,
            beat=beat,
            match=match,
            timeline_start_s=t_start,
            timeline_end_s=t_end,
        )

    def test_total_duration(self) -> None:
        first = self._make_clip(0, 0.0, 5.0)
        second = self._make_clip(1, 5.0, 9.0)
        timeline = EditTimeline(title="Test Trailer", frame_rate=23.976, clips=(first, second))
        assert timeline.total_duration_s == pytest.approx(9.0)
        assert timeline.clip_count == 2

    def test_empty_timeline(self) -> None:
        timeline = EditTimeline(title="Empty", frame_rate=24.0, clips=())
        assert timeline.total_duration_s == 0.0
+140
View File
@@ -0,0 +1,140 @@
"""
tests/test_deep_scan.py Unit tests for frame_extractor and deep_scan
Uses small synthetic videos written to a temporary file with cv2.VideoWriter,
so no real video files are required. Tests cover the pure logic, not hardware decoding.
"""
from __future__ import annotations
import tempfile
from pathlib import Path
import cv2
import numpy as np
import pytest
from src.cv.frame_extractor import (
get_video_info,
grab_frame_at,
iter_frames_stepped,
open_video,
)
from src.cv.fingerprinting import text_safe_crop
# ---------------------------------------------------------------------------
# Helpers: build a tiny synthetic video on disk
# ---------------------------------------------------------------------------
# Properties of the synthetic clip written by _make_synthetic_video().
FPS = 24  # frame rate passed to cv2.VideoWriter
WIDTH = 320  # frame width in pixels
HEIGHT = 240  # frame height in pixels
SECS = 3  # clip length in seconds (FPS * SECS frames are written)
def _make_synthetic_video(path: Path, color_bgr: tuple[int, int, int] = (0, 128, 255)) -> Path:
    """Write a single-colour clip of SECS seconds at FPS to *path* and return *path*."""
    frame_size = (WIDTH, HEIGHT)
    codec = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(str(path), codec, float(FPS), frame_size)

    # One solid-colour frame, written FPS * SECS times.
    solid_frame = np.full((HEIGHT, WIDTH, 3), color_bgr, dtype=np.uint8)
    remaining = FPS * SECS
    while remaining > 0:
        writer.write(solid_frame)
        remaining -= 1

    writer.release()
    return path
@pytest.fixture
def synthetic_video(tmp_path: Path) -> Path:
    """Path to a freshly written synthetic clip inside pytest's tmp_path."""
    target = tmp_path / "test.mp4"
    return _make_synthetic_video(target)
# ---------------------------------------------------------------------------
# open_video
# ---------------------------------------------------------------------------
class TestOpenVideo:
    """open_video() as a context manager."""

    def test_opens_valid_file(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as capture:
            opened = capture.isOpened()
            assert opened

    def test_raises_on_missing_file(self, tmp_path: Path) -> None:
        missing = tmp_path / "ghost.mp4"
        with pytest.raises(FileNotFoundError), open_video(missing):
            pass
# ---------------------------------------------------------------------------
# get_video_info
# ---------------------------------------------------------------------------
class TestGetVideoInfo:
    """get_video_info() must reflect the synthetic clip's parameters."""

    def test_returns_correct_fps(self, synthetic_video: Path) -> None:
        reported_fps = get_video_info(synthetic_video)["fps"]
        assert reported_fps == pytest.approx(FPS, rel=0.05)

    def test_duration_approx(self, synthetic_video: Path) -> None:
        reported_duration = get_video_info(synthetic_video)["duration_s"]
        assert reported_duration == pytest.approx(SECS, rel=0.1)

    def test_resolution(self, synthetic_video: Path) -> None:
        info = get_video_info(synthetic_video)
        assert info["width"] == WIDTH
        assert info["height"] == HEIGHT
# ---------------------------------------------------------------------------
# grab_frame_at
# ---------------------------------------------------------------------------
class TestGrabFrameAt:
    """grab_frame_at() on a valid and on an out-of-range timestamp."""

    def test_returns_ndarray(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as capture:
            grabbed = grab_frame_at(capture, 1.0)
        assert grabbed is not None
        assert isinstance(grabbed, np.ndarray)
        expected_shape = (HEIGHT, WIDTH, 3)
        assert grabbed.shape == expected_shape

    def test_returns_none_past_end(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as capture:
            grabbed = grab_frame_at(capture, 9999.0)
        # May return None or a repeated last frame depending on codec;
        # we only assert no exception is raised.
        assert grabbed is None or isinstance(grabbed, np.ndarray)
# ---------------------------------------------------------------------------
# iter_frames_stepped
# ---------------------------------------------------------------------------
class TestIterFramesStepped:
    """Stepped iteration over a time window via ``iter_frames_stepped``."""

    def test_yields_correct_count(self, synthetic_video: Path) -> None:
        """Stepping 0.0 -> 1.0 in 0.5 s increments yields three items."""
        with open_video(synthetic_video) as capture:
            yielded = list(iter_frames_stepped(capture, 0.0, 1.0, 0.5))
            # Expected timestamps: 0.0, 0.5, 1.0 -> 3 frames
            assert len(yielded) == 3

    def test_timestamps_increasing(self, synthetic_video: Path) -> None:
        """Yielded timestamps arrive already in sorted order."""
        with open_video(synthetic_video) as capture:
            yielded = list(iter_frames_stepped(capture, 0.0, 2.0, 0.5))
            stamps = [stamp for stamp, _frame in yielded]
            assert stamps == sorted(stamps)

    def test_invalid_step_raises(self, synthetic_video: Path) -> None:
        """A zero step is rejected with a ``ValueError`` naming ``step_s``."""
        with open_video(synthetic_video) as capture, pytest.raises(
            ValueError, match="step_s"
        ):
            # The result is materialised with list(), as in every other call
            # in this class, so any lazily raised error surfaces here.
            list(iter_frames_stepped(capture, 0.0, 1.0, 0.0))
# ---------------------------------------------------------------------------
# text_safe_crop integration (sanity: cropped height consistent)
# ---------------------------------------------------------------------------
class TestCropSanity:
    """Sanity check that ``text_safe_crop`` works on a decoded video frame."""

    def test_crop_reduces_height(self, synthetic_video: Path) -> None:
        """Cropping top and bottom shrinks the height and keeps the width."""
        with open_video(synthetic_video) as capture:
            original = grab_frame_at(capture, 0.5)
            assert original is not None
            trimmed = text_safe_crop(original, 0.15, 0.30)
            full_height, full_width = original.shape[0], original.shape[1]
            assert trimmed.shape[0] < full_height
            assert trimmed.shape[1] == full_width  # width unchanged
+218
View File
@@ -0,0 +1,218 @@
"""
tests/test_export.py — Unit tests for timecode conversion and export writers.
Tests use synthetic EditTimeline objects (no real video files needed).
"""
from __future__ import annotations
from pathlib import Path
import pytest
from src.export.timecode import (
seconds_to_fcpxml,
seconds_to_smpte,
fcpxml_frame_duration,
fcpxml_format_name,
seconds_to_frame_count,
)
# ---------------------------------------------------------------------------
# Timecode helpers
# ---------------------------------------------------------------------------
class TestSecondsToFcpxml:
    """``seconds_to_fcpxml`` renders seconds as FCPXML rational time strings."""

    def test_zero(self) -> None:
        """Zero seconds is rendered as the bare ``0s``."""
        rendered = seconds_to_fcpxml(0.0, 24.0)
        assert rendered == "0s"

    def test_one_second_at_24fps(self) -> None:
        """1.0 s @ 24 fps -> 24 frames -> 24/24 s, reduced to 1/1 s."""
        rendered = seconds_to_fcpxml(1.0, 24.0)
        assert rendered == "1/1s"

    def test_one_second_at_23976(self) -> None:
        """At 23.976 fps only the rational ``<num>/<den>s`` form is checked."""
        # The exact fraction is deliberately not pinned here; the test only
        # requires a "/" and a trailing "s".
        rendered = seconds_to_fcpxml(1.0, 23.976)
        assert rendered.endswith("s")
        assert "/" in rendered

    def test_ten_seconds_at_25fps(self) -> None:
        """10 s @ 25 fps -> 250 frames -> 250/25 s, reduced to 10/1 s."""
        rendered = seconds_to_fcpxml(10.0, 25.0)
        assert rendered == "10/1s"

    def test_rational_is_reduced(self) -> None:
        """The fraction is in lowest terms (never e.g. ``24/24s``)."""
        from math import gcd

        rendered = seconds_to_fcpxml(1.0, 24.0)
        numerator, denominator = map(int, rendered.rstrip("s").split("/"))
        assert gcd(numerator, denominator) == 1
class TestSecondsToSmpte:
    """``seconds_to_smpte`` renders seconds as ``HH:MM:SS:FF`` timecode."""

    def test_zero(self) -> None:
        """Zero seconds maps to the all-zero timecode."""
        timecode = seconds_to_smpte(0.0, 24.0)
        assert timecode == "00:00:00:00"

    def test_one_minute(self) -> None:
        """Sixty seconds rolls over into the minutes field."""
        timecode = seconds_to_smpte(60.0, 25.0)
        assert timecode == "00:01:00:00"

    def test_one_hour(self) -> None:
        """3600 seconds rolls over into the hours field."""
        timecode = seconds_to_smpte(3600.0, 24.0)
        assert timecode == "01:00:00:00"

    def test_frames_overflow(self) -> None:
        """At 25 fps, 26 frames is one second plus one frame."""
        one_second_one_frame = 26 / 25
        timecode = seconds_to_smpte(one_second_one_frame, 25.0)
        assert timecode == "00:00:01:01"

    def test_format_length(self) -> None:
        """The timecode has four colon-separated, two-character fields."""
        fields = seconds_to_smpte(123.456, 23.976).split(":")
        assert len(fields) == 4
        assert all(len(field) == 2 for field in fields)
class TestFcpxmlHelpers:
    """Small FCPXML helpers: frame duration, format name, frame count."""

    def test_frame_duration_24fps(self) -> None:
        """One frame at 24 fps lasts 1/24 s."""
        duration = fcpxml_frame_duration(24.0)
        assert duration == "1/24s"

    def test_frame_duration_23976(self) -> None:
        """23.976 fps yields the frame duration 1001/24000 s."""
        duration = fcpxml_frame_duration(23.976)
        assert duration == "1001/24000s"

    def test_format_name_1080p_2398(self) -> None:
        """The format name mentions both the height and the rate token."""
        format_name = fcpxml_format_name(23.976, 1920, 1080)
        assert "1080" in format_name
        assert "2398" in format_name

    def test_frame_count_roundtrip(self) -> None:
        """Ten seconds at 25 fps is exactly 250 frames."""
        frame_rate = 25.0
        duration_s = 10.0
        frame_total = seconds_to_frame_count(duration_s, frame_rate)
        assert frame_total == 250
# ---------------------------------------------------------------------------
# EDL writer (string output)
# ---------------------------------------------------------------------------
class TestEdlWriter:
    """Checks on the text written by the EDL writer for a one-clip timeline."""

    def _make_timeline(self) -> "src.core.models.EditTimeline":  # type: ignore
        """Build a synthetic timeline with a single matched clip."""
        from src.core.models import (
            BeatType,
            EditClip,
            EditTimeline,
            MatchResult,
            TrailerBeat,
        )

        # NOTE(review): end_frame=120 and in_point_frame=720 equal 5 s and
        # 30 s at 24 fps, while the timeline below uses frame_rate=25.0.
        # Confirm whether the mismatch is intentional.
        hook_beat = TrailerBeat(
            beat_id=0,
            trailer_path=Path("trailer.mp4"),
            start_s=0.0,
            end_s=5.0,
            start_frame=0,
            end_frame=120,
            beat_type=BeatType.HOOK,
        )
        source_match = MatchResult(
            beat_id=0,
            scene_id=3,
            source_path=Path("movie.mp4"),
            in_point_s=30.0,
            out_point_s=35.0,
            in_point_frame=720,
            match_score=0.88,
        )
        only_clip = EditClip(
            clip_index=0,
            beat=hook_beat,
            match=source_match,
            timeline_start_s=0.0,
            timeline_end_s=5.0,
        )
        return EditTimeline(
            title="TestTrailer",
            frame_rate=25.0,
            clips=(only_clip,),
        )

    def _export_text(self, tmp_path: Path) -> str:
        """Write the synthetic timeline as EDL and return the file's text."""
        from src.core.config import load_config
        from src.export.edl_writer import write_edl

        config = load_config()
        timeline = self._make_timeline()
        written = write_edl(timeline, config, output_path=tmp_path / "test.edl")
        return written.read_text(encoding="utf-8")

    def test_edl_contains_title(self, tmp_path: Path) -> None:
        """The EDL text contains the timeline title line."""
        edl_text = self._export_text(tmp_path)
        assert "TITLE: TestTrailer" in edl_text

    def test_edl_has_event_line(self, tmp_path: Path) -> None:
        """The EDL text contains an event number and the reel name."""
        edl_text = self._export_text(tmp_path)
        assert "001" in edl_text  # event number
        assert "AX" in edl_text  # reel name
# ---------------------------------------------------------------------------
# FCPXML writer (XML structure)
# ---------------------------------------------------------------------------
class TestFcpxmlWriter:
    """Structural checks on the XML written by the FCPXML writer."""

    def _make_timeline(self) -> "src.core.models.EditTimeline":  # type: ignore
        """Build a synthetic timeline with a single matched clip."""
        from src.core.models import (
            BeatType,
            EditClip,
            EditTimeline,
            MatchResult,
            TrailerBeat,
        )

        # NOTE(review): end_frame=120 and in_point_frame=720 equal 5 s and
        # 30 s at 24 fps, while the timeline below uses frame_rate=25.0.
        # Confirm whether the mismatch is intentional.
        hook_beat = TrailerBeat(
            beat_id=0,
            trailer_path=Path("trailer.mp4"),
            start_s=0.0,
            end_s=5.0,
            start_frame=0,
            end_frame=120,
            beat_type=BeatType.HOOK,
        )
        source_match = MatchResult(
            beat_id=0,
            scene_id=3,
            source_path=Path("B:/Proxy/movie.mp4"),
            in_point_s=30.0,
            out_point_s=35.0,
            in_point_frame=720,
            match_score=0.88,
        )
        only_clip = EditClip(
            clip_index=0,
            beat=hook_beat,
            match=source_match,
            timeline_start_s=0.0,
            timeline_end_s=5.0,
        )
        return EditTimeline(
            title="TestTrailer",
            frame_rate=25.0,
            clips=(only_clip,),
        )

    def _export_root(self, tmp_path: Path) -> "xml.etree.ElementTree.Element":
        """Write the synthetic timeline as FCPXML and parse it back.

        Lines starting with ``<!DOCTYPE`` are dropped before the text is
        handed to ElementTree.
        """
        from xml.etree import ElementTree as ET
        from src.core.config import load_config
        from src.export.fcpxml_writer import write_fcpxml

        config = load_config()
        timeline = self._make_timeline()
        written = write_fcpxml(
            timeline, config, output_path=tmp_path / "test.fcpxml"
        )
        raw_text = written.read_text(encoding="utf-8")
        kept_lines = [
            line
            for line in raw_text.splitlines()
            if not line.strip().startswith("<!DOCTYPE")
        ]
        return ET.fromstring("\n".join(kept_lines))

    def test_fcpxml_is_valid_xml(self, tmp_path: Path) -> None:
        """The output parses as XML and its root element is ``fcpxml``."""
        root = self._export_root(tmp_path)
        # Drop a "{namespace}" prefix, if any, before comparing the tag.
        local_tag = root.tag.rpartition("}")[2]
        assert local_tag == "fcpxml"

    def test_fcpxml_has_spine(self, tmp_path: Path) -> None:
        """The document has a ``spine`` element with exactly one child."""
        root = self._export_root(tmp_path)
        # Namespace map so the prefixed find() expression resolves.
        namespaces = {"fcp": "http://www.apple.com/dt/FCPXML/1_10"}
        spine = root.find(".//fcp:spine", namespaces)
        assert spine is not None
        assert len(spine) == 1
+112
View File
@@ -0,0 +1,112 @@
"""
tests/test_fingerprinting.py — Unit tests for src/cv/fingerprinting.py.
Tests run WITHOUT requiring real video files.
"""
from __future__ import annotations
import numpy as np
import pytest
from src.cv.fingerprinting import (
text_safe_crop,
extract_hs_histograms,
compare_histograms,
hist_to_bytes,
bytes_to_hist,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def solid_blue_frame() -> np.ndarray:
    """Return a 256×256 BGR frame filled with pure blue."""
    blue_bgr = (255, 0, 0)  # BGR order: blue channel first
    return np.full((256, 256, 3), blue_bgr, dtype=np.uint8)
@pytest.fixture
def solid_red_frame() -> np.ndarray:
    """Return a 256×256 BGR frame filled with pure red."""
    red_bgr = (0, 0, 255)  # BGR order: red channel last
    return np.full((256, 256, 3), red_bgr, dtype=np.uint8)
# ---------------------------------------------------------------------------
# text_safe_crop
# ---------------------------------------------------------------------------
class TestTextSafeCrop:
    """Row cropping and argument validation of ``text_safe_crop``."""

    def test_removes_correct_rows(self, solid_blue_frame: np.ndarray) -> None:
        """The cropped height matches the truncated top/bottom row bounds."""
        top, bottom = 0.15, 0.30
        cropped = text_safe_crop(
            solid_blue_frame, crop_top=top, crop_bottom=bottom
        )
        full_height = solid_blue_frame.shape[0]  # 256
        first_row = int(full_height * top)
        end_row = int(full_height * (1.0 - bottom))
        assert cropped.shape[0] == end_row - first_row

    def test_zero_crop_returns_same_size(
        self, solid_blue_frame: np.ndarray
    ) -> None:
        """Cropping nothing leaves the frame shape untouched."""
        untouched = text_safe_crop(
            solid_blue_frame, crop_top=0.0, crop_bottom=0.0
        )
        assert untouched.shape == solid_blue_frame.shape

    def test_invalid_top_raises(self, solid_blue_frame: np.ndarray) -> None:
        """A top fraction of 1.0 is rejected, naming ``crop_top``."""
        with pytest.raises(ValueError, match="crop_top"):
            text_safe_crop(solid_blue_frame, crop_top=1.0, crop_bottom=0.0)

    def test_invalid_bottom_raises(self, solid_blue_frame: np.ndarray) -> None:
        """A negative bottom fraction is rejected, naming ``crop_bottom``."""
        with pytest.raises(ValueError, match="crop_bottom"):
            text_safe_crop(solid_blue_frame, crop_top=0.0, crop_bottom=-0.1)

    def test_overlapping_crops_raise(
        self, solid_blue_frame: np.ndarray
    ) -> None:
        """Top and bottom fractions that sum past 1.0 are rejected."""
        with pytest.raises(ValueError, match="must be < 1.0"):
            text_safe_crop(solid_blue_frame, crop_top=0.6, crop_bottom=0.5)
# ---------------------------------------------------------------------------
# Histograms
# ---------------------------------------------------------------------------
class TestHistograms:
    """Shape, normalisation and comparison of the extracted histograms."""

    def test_output_shape(self, solid_blue_frame: np.ndarray) -> None:
        """Each histogram is 1-D with the requested number of bins."""
        first_hist, second_hist = extract_hs_histograms(
            solid_blue_frame, bins_hue=50, bins_sat=60
        )
        assert first_hist.shape == (50,)
        assert second_hist.shape == (60,)

    def test_normalised(self, solid_blue_frame: np.ndarray) -> None:
        """Both histograms have an L2 norm of approximately 1.0."""
        first_hist, second_hist = extract_hs_histograms(
            solid_blue_frame, bins_hue=50, bins_sat=60
        )
        first_norm = np.linalg.norm(first_hist)
        assert first_norm == pytest.approx(1.0, abs=1e-5)
        second_norm = np.linalg.norm(second_hist)
        assert second_norm == pytest.approx(1.0, abs=1e-5)

    def test_same_frame_correl_is_one(
        self, solid_blue_frame: np.ndarray
    ) -> None:
        """A histogram correlates perfectly with itself."""
        import cv2

        first_hist, _ = extract_hs_histograms(
            solid_blue_frame, bins_hue=50, bins_sat=60
        )
        correlation = compare_histograms(
            first_hist, first_hist, method=cv2.HISTCMP_CORREL
        )
        assert correlation == pytest.approx(1.0, abs=1e-5)

    def test_different_frames_correl_lower(
        self,
        solid_blue_frame: np.ndarray,
        solid_red_frame: np.ndarray,
    ) -> None:
        """Blue and red frames correlate below the perfect score."""
        import cv2

        blue_hist, _ = extract_hs_histograms(solid_blue_frame, 50, 60)
        red_hist, _ = extract_hs_histograms(solid_red_frame, 50, 60)
        correlation = compare_histograms(
            blue_hist, red_hist, method=cv2.HISTCMP_CORREL
        )
        assert correlation < 1.0
# ---------------------------------------------------------------------------
# Serialisation round-trip
# ---------------------------------------------------------------------------
class TestSerialisation:
    """Byte serialisation of histograms survives a round trip."""

    def test_round_trip(self, solid_blue_frame: np.ndarray) -> None:
        """``bytes_to_hist(hist_to_bytes(h))`` reproduces ``h``."""
        original_hist, _ = extract_hs_histograms(solid_blue_frame, 50, 60)
        payload = hist_to_bytes(original_hist)
        recovered_hist = bytes_to_hist(payload)
        np.testing.assert_array_almost_equal(original_hist, recovered_hist)