commit 8e1bcf142fa042815b5b4ec412c00e3ebf89ff9d Author: Melbar Date: Sat May 2 09:07:41 2026 +0200 Initial project import diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..626846b --- /dev/null +++ b/.env.example @@ -0,0 +1,15 @@ +# ============================================================================= +# AI Trailer Generator v2 — Environment Variables +# ============================================================================= +# Copy this file to .env and fill in your actual keys. +# .env is listed in .gitignore and will NEVER be committed. +# ============================================================================= + +# OpenRouter API key (required when [llm] provider = "openrouter") +OPENROUTER_API_KEY=sk-or-v1-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +# OpenAI API key (required when [llm] provider = "openai") +# OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +# Universal fallback (used if provider-specific key is not set) +# LLM_API_KEY= diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d3e1e3e --- /dev/null +++ b/.gitignore @@ -0,0 +1,44 @@ +# --------------------------------------------------------------------------- +# AI Trailer Generator v2 — .gitignore +# --------------------------------------------------------------------------- + +# Python +__pycache__/ +*.py[cod] +*.pyo +*.pyd +*.egg-info/ +dist/ +build/ +*.whl +.venv/ +venv/ +.mypy_cache/ +.ruff_cache/ +.pytest_cache/ + +# Project-generated artefacts (potentially huge) +.cache/ +output/ +proxy/ +*.mp4 +*.mov +*.mxf +*.wav +*.mp3 +*.jpg +*.jpeg +*.png + +# IDE +.vscode/ +.idea/ +*.swp + +# OS +.DS_Store +Thumbs.db + +# Secrets / local overrides +.env +config.local.toml diff --git a/README.md b/README.md new file mode 100644 index 0000000..5323691 --- /dev/null +++ b/README.md @@ -0,0 +1,384 @@ +# AI Trailer Generator v2 + +**Frame-accurate trailer reconstruction via pure Computer Vision** + 
+> Gib einen Reference Trailer und den dazugehörigen Quellfilm hinein und erhalte eine fertige FCPXML/EDL, die den Trailer Frame-genau aus dem Quellfilm nachbaut. + +--- + +## Das Kernprinzip + +Standardmäßig kein LLM für visuelles Matching. Optional kann ein Vision-Layer +gecachte 3-Frame-Beschreibungen als zusätzliche Suchanker liefern; der finale +Match bleibt aber CV-verifiziert. + +| Phase | Was passiert | Technologie | +|-------|-------------|-------------| +| **0 — Prep** | Reference Trailer analysieren & Beats extrahieren | PySceneDetect + OpenCV | +| **1 — Global Scan** | Gesamten Quellfilm via FFmpeg-Stream (2 FPS) gegen alle Beats scannen | FFmpeg Pipe + Luma-Histogramm | +| **1b — Optional Vision Seeds** | Unsichere Top-K Szenen mit 3-Frame-Beschreibungen cachen | OpenAI-kompatibles Vision-LLM | +| **2 — Refine** | Beste Treffer auf Frame-Ebene präzisieren | OpenCV `matchTemplate` | +| **3 — Dramaturgie** | Narrative BeatType-Klassifikation aus Dialog-Text | OpenRouter LLM | +| **4 — Export** | Timeline → FCPXML 1.10 oder CMX 3600 EDL | xml.etree + eigener Timecode-Layer | + +**Text-Safe Crop:** Obere 15% und untere 30% des Frames werden vor jedem Vergleich ausgeblendet, um Title Cards, Logos und Letterbox zu ignorieren. + +--- + +## Voraussetzungen + +- Python **3.11+** +- [ffmpeg](https://ffmpeg.org/download.html) im PATH (für den globalen Scan, die Report-Previews und die Whisper-Audio-Extraktion) +- CUDA-fähige GPU empfohlen (für faster-whisper; CPU funktioniert auch) + +--- + +## Setup + +### 1. Virtual Environment erstellen & aktivieren + +```powershell +# Im Projektordner +python -m venv .venv +.\.venv\Scripts\Activate.ps1 + +# Falls ExecutionPolicy blockiert: +# Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser +``` + +### 2. Abhängigkeiten installieren + +```powershell +pip install -r requirements.txt +``` + +### 3. 
API-Key konfigurieren + +```powershell +# .env aus dem Template kopieren +Copy-Item .env.example .env + +# Dann .env öffnen und den echten Key eintragen: +# OPENROUTER_API_KEY=sk-or-v1-... +``` + +### 4. Videodateien eintragen + +`config.toml` öffnen und die Pfade anpassen: + +```toml +[paths] +source_movie = "B:/Proxy/DeinFilm_FTR.mp4" +reference_trailer = "F:/Encodings/DeinFilm_Trailer.mp4" +``` + +--- + +## Verwendung + +```powershell +# Vollständige Pipeline (analyze → match → report → export) +python cli.py run + +# Ohne Whisper-Transkription (schneller) +python cli.py run --no-audio + +# Ohne LLM-Klassifikation +python cli.py run --no-audio --no-llm + +# Schrittweise +python cli.py analyze # Reference Trailer → Beats erkennen +python cli.py match # Globaler FFmpeg Scan (Szenen-unabhängig) +python cli.py report # HTML Report mit Video-Vergleich bauen +python cli.py export --format both # FCPXML + EDL ausgeben + +# Gezielt nur einen Beat bearbeiten (empfohlen für erste Iterationen) +python cli.py match --beat 5 +python cli.py match --beat 5 --vision # optionale gecachte Vision-Seeds +python cli.py report --beat 5 +python cli.py export --beat 5 --format both + +# Fehlerhafte Matches korrigieren +python cli.py rematch --beat 5 --threshold 0.50 # Schwelle anpassen (Globaler Scan wird für diesen Beat wiederholt) +python cli.py rematch --beat 5 --refine # Cached Match per lokalem Bildinhalt-Offset nachschärfen +``` + +Der HTML-Report regeneriert seine Preview-Clips bei jedem Lauf mit genauer +FFmpeg-Nachsuche und synchronisiert die beiden Video-Player pro Beat. Dadurch +ist der Report zur Frame-Prüfung geeignet und zeigt keine alten gecachten +Preview-Clips. +Source-Previews bekommen bei Trailer-only-Tails denselben schwarzen Tail wie der +Export, damit der Browser nicht einen zu kurzen Source-Clip gegen den längeren +Referenzbeat weiterspult oder loopt. 
+Zur Synchronprüfung rendert der Report ein einzelnes Frame-Locked-Compare-Video +mit Referenz und Source in demselben MP4-Stream. Dieses Compare-Video ist +maßgeblich, weil zwei getrennte Browser-Videoelemente nie zuverlässig +framegenau synchron bleiben. + +Wenn ein Trailer-Beat am Ende eine Blende, Schwarzfläche oder Textkarte enthält, +die im Source-Film nicht als normaler Shot vorhanden ist, endet der Source-Match +am letzten stabil passenden Frame. Exportierte Timelines behalten trotzdem die +volle Beat-Länge und fügen danach automatisch einen schwarzen Trailer-Tail mit +Marker für Fade/Dissolve ein. + +Gezielte Ein-Beat-Matches nutzen zusätzlich vorhandene automatische Nachbarbeats +aus dem Cache als zeitliche Suchanker. Das hilft bei aufeinanderfolgenden Shots, +ohne manuelle Szenen oder Timecodes zu kuratieren. +Bei `match --beat N` wird ein alter Cache-Treffer für genau diesen Beat entfernt +und nur ein neu gefundener automatischer Treffer wieder eingetragen. Ein +fehlgeschlagener neuer Lauf kann dadurch keinen alten falschen Report-Treffer +stehen lassen. + +Der globale Bildvergleich arbeitet auf kontrast-normalisierten Luma- und +Kantenfeatures statt auf rohen Farb-Pixeln. Dadurch bleiben Schwarzweiß- oder +anders gegradete Trailerbilder mit dem Source-Material vergleichbar, während +unähnliche Farbshots schlechter ranken. +Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen +groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets +verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller +als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls. +Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese +Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst +den Inpoint bestimmt. +`rematch --refine` nutzt denselben lokalen FFmpeg/Pillow-Aligner und schreibt +den korrigierten Inpoint direkt zurück in `.cache/match_results.json`. 
+ +Zusätzlich werden aus den besten szenenweiten Luma/Histogramm-Kandidaten +mehrere Inpoint-Suchanker erzeugt. Diese Scene-Seeds verwenden keine harte +pHash-Sperre, weil pHash bei stark anders gegradeten Trailerbildern echte +Matches zu früh ausschließen kann. +Optional kann `python cli.py match --beat N --vision` einen Vision-Layer +zuschalten. Dann werden pro Trailer-Beat und pro wenigen Scene-Level-Kandidaten +je drei Frames (Anfang, Mitte, Ende) von einem visionfähigen OpenAI-kompatiblen +Modell beschrieben. Die Beschreibungen liegen in +`.cache/vision_descriptions.json` und werden wiederverwendet. Vision erzeugt +nur zusätzliche Suchanker; der eigentliche Match muss weiterhin durch CV, +Content-Reranking, Timing und Duration-Coverage bestätigt werden. +Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen +FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine +Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete +Vision-Szenen echte Treffer nicht verdrängen. Für schnelle Experimente kann +`skip_coarse_scan_with_weighted_seeds = true` gesetzt werden. +Gewichtete Vision-Seeds werden nicht zuerst durch den alten Midpoint-Template +Refine verschoben; sie gehen direkt in die lokale Content-Alignment-Prüfung. +Das schützt wiederholte Gesprächseinstellungen, bei denen ähnliche Momente +mehrfach in derselben Szene vorkommen. +Innerhalb der automatisch von Vision vorgeschlagenen Szenen läuft zusätzlich +eine dichte lokale Bildsequenzsuche. Sie misst den Phasenversatz in kleinen +Zeitschritten direkt am Bildinhalt und bevorzugt Kandidaten mit genügend +Restdauer in derselben Source-Szene. Das ist kein manueller Override: Vision +grenzt nur Suchbereiche ein, die Auswahl bleibt Content-, Timing- und +Coverage-getrieben. +Nach einem dichten Vision-Treffer darf der spätere lokale Aligner nur noch im +Bereich dieses Scan-Schritts nachjustieren. 
So kann ein korrekt gefundener +Bewegungsmoment nicht wieder um viele Frames in eine ähnlich aussehende Phase +derselben Szene verschoben werden. +Wenn mehrere Vision-Kandidaten in derselben Source-Szene ähnlich gut scoren +und die Beat-Dauer abdecken, bevorzugt der Matcher die frühere Phase. Das +verhindert, dass ein späterer, minimal stärkerer Standbildtreffer die +Bewegungsphase des Trailers sichtbar überholt. +Enthält ein Trailerbeat selbst einen harten Umschnitt, werden Kandidaten an +angrenzenden Source-Szenengrenzen zusätzlich als zusammenhängender Multi-Shot- +Span geprüft. Ein Match darf dann über eine Source-Szenengrenze laufen, aber +nur wenn die relative Source-Grenze zeitlich zu einem erkannten Trailer-Umschnitt +passt. So kann ein Beat aus Frage/Antwort-Shots vollständig erfasst werden, +ohne Szenen willkürlich zusammenzukleben. +Auch der lokale Content-Aligner darf einen Inpoint nur noch übernehmen, wenn +die feste Whole-Frame-/Spatial-Validation dadurch besser wird. +Vor dem teuren Frame-Refine wird der gesamte Kandidatenpool mit einer schnellen +festen Inhaltsprüfung neu sortiert. Dadurch können korrekte Treffer aus +wiederholten Einstellungen einer Szene nach oben kommen, auch wenn ein freier +Template-Peak an anderer Stelle numerisch stärker war. Suchanker bleiben im +Pool erhalten, dürfen aber erst nach der Inhaltsprüfung nach oben rücken. Wenn +ein Kandidat visuell plausibel ist, aber wegen Trailerblende oder kurzem +Source-Span die normale Coverage knapp verfehlt, wird er als provisional Match +behalten statt als `NO MATCH` verworfen. +Dieses Reranking berücksichtigt zusätzlich die verbleibende Szenenlänge ab dem +Kandidaten-Inpoint. Dadurch werden zu späte ähnliche Gesprächsphasen innerhalb +derselben Szene nicht mehr vor frühere, tragfähigere Phasen sortiert. +Das Inhalts-Reranking nutzt bewusst nur wenige repräsentative Referenzframes und +eine begrenzte Kandidatenzahl. 
So bleiben wiederholte Szenen auffindbar, ohne +dass der Lauf durch tausende Random-Seeks minutenlang festhängt. +Confirmed Matches werden zusätzlich durch eine feste nahezu-Whole-Frame-Prüfung +aus Luma, Kanten, Farbhistogramm und räumlichen 4x4-Farbhistogrammen gedeckelt. +Dadurch kann ein freier Template-Hit mit ähnlicher Fenster-/Gesichtsstruktur +nicht mehr als sicherer Match gelten, wenn die Gesamtkomposition oder die +Bewegungsphase sichtbar eine andere Szene ist. +Für gewichtete Vision-Kandidaten gibt es zusätzlich eine eigene Provisional- +Bewertung aus Content-Score, Restdauer und Seed-Stärke. Dadurch können echte, +aber durch Trailer-Grading/Crop numerisch schwache Treffer im Report landen, +ohne als confirmed Match durchzugehen. +Die Cache-Normalisierung für Report/Export verwendet dieselbe niedrigere +Content-Untergrenze für nicht bestätigte Vision-Provisional-Treffer, damit ein +gerade gefundener automatischer Match nicht beim Report-Aufbau wieder +weggefiltert wird. +Sie übernimmt auch die Multi-Shot-Coverage-Regel: gecachte Treffer, die passend +zu internen Trailer-Umschnitten über angrenzende Source-Szenen laufen, werden +nicht mehr auf die erste Source-Szene zurückgekürzt. +Gezielte Einzel-Beat-Matches gewichten außerdem die automatisch aus Nachbarbeats +abgeleiteten Continuity-Seeds. Wenn ein Beat direkt an einen bereits passenden +Vorgänger anschließt, kann ein späterer ähnlich aussehender Moment derselben +Dialogszene den erwarteten Anschluss nicht mehr nur wegen eines höheren +Standbildscores verdrängen. +Diese Continuity-Seeds sind aber nur Suchanker: in derselben Szene darf ein +späterer Inpoint gewinnen, wenn die mehrframeige Content-Prüfung die +Bewegungsphase klar besser trifft. Dadurch bleiben Anschlussmatches stabil, +ohne Hand-/Kopfbewegungen auf einen falschen Zeitpunkt festzunageln. +Continuity- und Vision-Seeds allein schalten den globalen FFmpeg-Scan +standardmäßig nicht ab. 
Sie sind Suchanker, keine Beweise; der volle CV-Scan +bleibt aktiv, damit semantisch plausible, aber falsche Vision-Treffer echte +Bildmatches nicht verdrängen. +Lange Trailerbeats werden nicht mehr automatisch über ihre gesamte Beat-Länge +gegen einen einzigen Source-Clip validiert. Sobald nach einem sichtbaren +Source-Abschnitt eine anhaltende Schwarzblende oder Titel-/Credit-Insel beginnt, +endet der matchbare Referenzbereich dort; zwei aufeinanderfolgende dunkle +Samples reichen dafür. Spätere Text-/Creditbilder im selben Beat gehen damit +nicht mehr in Reranking, Validation oder Span-Schätzung ein. +Zusätzlich werden sehr dunkle, kontrastarme oder noch nicht sauber +auf-/abgeblendete Referenzframes aus Score, Inhalts-Reranking, +Phasen-Alignment und Motion-Templates herausgenommen. Blenden sollen bestimmen, +wie der Clip später exportiert wird, aber nicht, ob der Bildinhalt als Match +gilt. +Treffer unter `provisional_content_threshold` werden gar nicht mehr gespeichert +oder aus alten Cache-Ergebnissen übernommen. Das verhindert, dass offensichtlich +falsche Szenen im Report als Match-Kandidat weiterleben. 
+ +### Log-Level + +```powershell +python cli.py run --log-level DEBUG +``` + +--- + +## Projektstruktur + +``` +ai_trailer_2026/ +│ +├── config.toml ← Alle Parameter (kein Hardcoding im Code) +├── .env ← API-Keys (NICHT commiten) +├── cli.py ← Einstiegspunkt +│ +├── src/ +│ ├── core/ +│ │ ├── config.py load_config() → AppConfig (frozen dataclasses) +│ │ └── models.py Scene, TrailerBeat, VibeHit, MatchResult, EditTimeline +│ ├── cv/ +│ │ ├── fingerprinting.py Text-Safe Crop · HS-Histogramme · pHash +│ │ ├── vibe_check.py Phase 1: Histogram+pHash Filter +│ │ ├── scene_indexer.py PySceneDetect → Fingerprint → JSON-Cache +│ │ ├── frame_extractor.py VideoCapture-Wrapper +│ │ └── deep_scan.py Phase 2: Coarse+Refine Template-Matching +│ ├── audio/ +│ │ └── transcriber.py faster-whisper Transkription +│ ├── llm/ +│ │ ├── dramaturg.py OpenRouter → BeatType (Dialog/Dramaturgie) +│ │ └── vision_cache.py optionale gecachte 3-Frame Vision-Seeds +│ ├── pipeline/ +│ │ ├── trailer_analyzer.py Reference-Trailer → TrailerBeat[] +│ │ └── matcher.py Orchestrierung + EditTimeline-Builder +│ └── export/ +│ ├── timecode.py Sekunden ↔ FCPXML-Rational ↔ SMPTE +│ ├── fcpxml_writer.py FCPXML 1.10 +│ └── edl_writer.py CMX 3600 EDL +│ +├── output/ ← FCPXML/EDL Output (gitignored) +├── .cache/ ← Szenen-Index + Match-Ergebnisse (gitignored) +└── tests/ 52 Unit-Tests (pytest) +``` + +--- + +## Cache-Verhalten + +Damit nicht bei jedem Lauf der gesamte Quellfilm neu analysiert werden muss: + +| Datei | Inhalt | Neu bauen mit | +|-------|--------|---------------| +| `.cache/scene_index.json` | Alle Quellfilm-Szenen + Fingerprints | `--force-reindex` | +| `.cache/trailer_beats.json` | Erkannte Trailer-Beats | `python cli.py analyze` erneut | +| `.cache/match_results.json` | CV-Matching-Ergebnisse | `python cli.py match` erneut | +| `.cache/vision_descriptions.json` | Optionale 3-Frame Vision-Beschreibungen für Beats/Szenen | löschen oder anderes Vision-Modell konfigurieren | + +--- + +## Tests + 
+```powershell +pytest tests/ -v +``` + +Alle Tests laufen ohne echte Videodateien (synthetische Frames via numpy/OpenCV). + +--- + +## Konfiguration (Auszug) + +Alle Werte in `config.toml` — keine hardgecodeten Konstanten im Code. + +```toml +[cv.vibe_check] +top_k_candidates = 10 # Top-K Kandidaten für Deep Scan +phash_max_distance = 12 # Hamming-Distanz Schwelle (0–64) +crop_top_fraction = 0.15 # Obere 15% ausblenden (Logos) +crop_bottom_fraction = 0.30 # Untere 30% ausblenden (Letterbox/Subs) + +[cv.deep_scan] +coarse_step_seconds = 0.5 # Scan-Schrittgröße (Coarse Pass) +match_threshold = 0.65 # Mindestscore für bestätigte automatische Matches +provisional_match_threshold = 0.45 # Niedrigere automatische Kandidaten im Report zeigen +coarse_candidate_threshold = 0.50 # Niedrigeres Gate vor Multi-Frame-Refine +refine_window_seconds = 0.6 # Suchfenster für framegenaue Inpoint-Feinjustage +refine_step_seconds = 0.04 # ~1 Frame bei 25fps (Refine Pass) +content_align_window_seconds = 0.48 # Lokales Suchfenster um einen groben Treffer +content_align_sample_step_s = 0.28 # Referenzframes für direkten Bildinhalt-Offset +content_validation_weight = 0.35 # Gewicht der festen Whole-Frame-/Spatial-Endprüfung +provisional_content_threshold = 0.42 # Untergrenze für Report-/Cache-Kandidaten +start_tie_break_score_delta = 0.015 # Bei fast gleichen Scores früheren Inpoint wählen +start_preroll_frames = 0 # Kein pauschaler Start-Ausgleich; Offset kommt aus Bildinhalt +sequence_candidate_count = 240 # Breiter Kandidatenpool vor Inhalts-Reranking +max_refine_candidates = 6 # Teurer Frame-Refine läuft nur auf den besten Inhaltskandidaten +scene_seed_top_k = 30 # Scene-Level-Kandidaten als zusätzliche Suchanker +scene_seed_points_per_scene = 6 # Inpoint-Samples pro Scene-Level-Kandidat +content_rerank_candidate_count = 100 # Grobe Kandidaten vor Inhalts-Reranking +skip_coarse_scan_with_weighted_seeds = false # Vision-Seeds nur als Hinweise; Vollscan bleibt robust 
+sequence_score_weight = 0.55 # Gewicht für mehrere zeitliche Vergleichsframes +span_score_weight = 0.15 # Gewicht für Stabilität bis zum Beat-Ende +coarse_score_weight = 0.10 # Gewicht des groben Midpoint-Treffers +duration_score_weight = 0.20 # Gewicht für nutzbare Länge des Source-Treffers +duration_tie_break_score_delta = 0.03 # Bei ähnlichem Score längeren Treffer bevorzugen +min_duration_coverage = 0.65 # Treffer muss mindestens 65% des matchbaren Referenzanteils tragen +continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0] # Suchanker um gematchte Nachbarbeats +span_sample_step_s = 0.08 # Schrittweite für End-/Drift-Erkennung +trim_tail_frames = 4 # Sicherheitsabstand gegen kurze Blitzer am Ende +scene_boundary_epsilon_s = 0.12 # Szenengrenzen-Toleranz gegen 1-2 Frame Cut-Drift +scoreable_luma_mean_min = 24.0 # Zu dunkle/Fade-Frames nicht scoren +scoreable_luma_p90_min = 58.0 # Helle Bildanteile müssen sichtbar genug sein +scoreable_contrast_min = 24.0 # Kontrastarme Blenden/Titelinseln ignorieren + +[vision] +enabled = false # Kostenkontrolle: per CLI mit --vision aktivierbar +model = "google/gemma-4-31b-it" # Muss ein visionfähiges OpenAI-kompatibles Modell sein +scene_candidate_top_k = 8 # Nur wenige Top-Szenen pro Beat beschreiben +max_new_descriptions_per_run = 12 # API-Kosten pro Lauf begrenzen +max_seed_scenes = 3 # Nur beste Vision-Szenen als Suchanker weitergeben +seed_points_per_scene = 12 # Inpoint-Samples pro Vision-Szene +seed_score = 0.88 # Vision-Seeds bekommen mehr Priorität als normale Scene-Seeds +max_refine_candidates = 6 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene +local_scan_step_s = 0.12 # Dichte lokale Bildsuche in Vision-Szenen +local_scan_max_points_per_scene = 180 # Laufzeitgrenze pro Source-Szene +local_scan_top_candidates = 18 # Beste lokale Kandidaten gehen ins Refinement +local_scan_tie_break_score_delta = 0.08 # Ähnliche Vision-Treffer: frühere Phase bevorzugen +multi_shot_cut_corr_threshold = 0.20 # 
Interne Trailer-Umschnitte erkennen +multi_shot_boundary_tolerance_s = 0.20 # Source-Grenze muss zum Trailer-Cut passen +fullscan_fallback = false # Nur relevant, wenn skip_coarse_scan_with_weighted_seeds=true ist +content_threshold = 0.22 # Lockeres Content-Gate nur für gewichtete Vision-Seeds +similarity_threshold = 0.18 # Mindest-Textähnlichkeit für Vision-Seeds +``` + +--- + +## Lizenz + +Internes Tool — nicht für den öffentlichen Vertrieb. diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..5105d5f --- /dev/null +++ b/cli.py @@ -0,0 +1,899 @@ +""" +cli.py — AI Trailer Generator v2 — Command-Line Interface + +Usage: + python cli.py analyze [--config CONFIG] [--no-audio] [--no-llm] + python cli.py match [--config CONFIG] [--force-reindex] + python cli.py rematch --beat N [--threshold F] [--refine] + python cli.py report [--config CONFIG] + python cli.py run [--config CONFIG] [--force-reindex] [--no-audio] [--no-llm] + python cli.py export [--config CONFIG] [--format fcpxml|edl|both] + +On --no-audio / --no-llm: + These flags do NOT affect matching quality. + Whisper and the LLM only assign narrative labels (HOOK/SETUP/CLIMAX) + to beats in the export metadata. The CV pipeline is identical either way. + Use them for fast iterations: they skip large model downloads. + +All heavy imports are deferred so --help is instant. 
def _setup_logging(level: str = "INFO") -> None:
    """Configure root logging to stdout and switch the console to UTF-8 when possible.

    Args:
        level: Name of a logging level (case-insensitive). Anything that does
            not resolve to a numeric ``logging`` level falls back to INFO.
    """
    # Force UTF-8 for Windows console emoji printing. stdout may be None
    # (pythonw) or a redirected stream without ``reconfigure`` (e.g. StringIO),
    # so probe for the capability instead of assuming a TextIOWrapper.
    stream = sys.stdout
    encoding = (getattr(stream, "encoding", None) or "").lower().replace("_", "-")
    reconfigure = getattr(stream, "reconfigure", None)
    if encoding not in ("utf-8", "utf8") and callable(reconfigure):
        reconfigure(encoding="utf-8")

    # getattr() can hit non-level attributes (e.g. logging.BASIC_FORMAT is a
    # string); only accept real numeric levels.
    resolved_level = getattr(logging, level.upper(), logging.INFO)
    if not isinstance(resolved_level, int):
        resolved_level = logging.INFO

    logging.basicConfig(
        format="%(asctime)s %(levelname)-8s %(name)s — %(message)s",
        datefmt="%H:%M:%S",
        level=resolved_level,
        stream=stream,
    )
    # Pillow logs at DEBUG by default; keep it out of the CLI output.
    logging.getLogger("PIL").setLevel(logging.WARNING)
def _scene_fps_light(scene: dict, cfg) -> float:
    """Derive a scene's frame rate from its cached time and frame bounds.

    Args:
        scene: Scene-cache entry providing ``start_s``, ``end_s``,
            ``start_frame`` and ``end_frame``.
        cfg: Config object; ``cfg.export.edl_frame_rate`` is the fallback.

    Returns:
        Frames divided by seconds when both spans are positive, otherwise
        ``cfg.export.edl_frame_rate``.
    """
    span_s = float(scene["end_s"]) - float(scene["start_s"])
    span_frames = int(scene["end_frame"]) - int(scene["start_frame"])
    if span_s > 0 and span_frames > 0:
        return span_frames / span_s
    # Degenerate or inverted scene bounds: use the configured export rate.
    return cfg.export.edl_frame_rate
def _contiguous_duration_light(beat, in_point_s: float, scenes: list[dict], cfg, matchable_duration_s: float) -> float:
    """Return how much of the matchable duration the source can carry contiguously.

    Starting at the scene that contains ``in_point_s``, scenes are walked in
    order. A scene boundary may be crossed only when its offset from the
    in-point lies within ``cfg.vision.multi_shot_boundary_tolerance_s`` of one
    of the beat's internal cut offsets.

    Returns:
        ``0.0`` when ``matchable_duration_s`` is not positive or no scene
        contains ``in_point_s``; ``matchable_duration_s`` when the requested end
        is reached; otherwise the duration up to the blocking scene end minus a
        tail trim, or up to the last accepted scene end when scenes run out.
    """
    if matchable_duration_s <= 0:
        return 0.0

    # Best effort: if cut detection is unavailable or fails, assume no internal cuts.
    try:
        from src.cv.global_scan import _reference_internal_cut_offsets
        internal_cuts = _reference_internal_cut_offsets(beat, cfg)
    except Exception:
        internal_cuts = []

    first_idx = next(
        (
            pos
            for pos, entry in enumerate(scenes)
            if float(entry["start_s"]) <= in_point_s < float(entry["end_s"])
        ),
        None,
    )
    if first_idx is None:
        return 0.0

    wanted_end = in_point_s + matchable_duration_s
    reached_end = in_point_s
    for entry in scenes[first_idx:]:
        entry_end = float(entry["end_s"])
        if entry_end >= wanted_end:
            return matchable_duration_s

        offset_at_boundary = entry_end - in_point_s
        crosses_at_cut = False
        for cut in internal_cuts:
            if abs(offset_at_boundary - cut) <= cfg.vision.multi_shot_boundary_tolerance_s:
                crosses_at_cut = True
                break

        if crosses_at_cut:
            reached_end = entry_end
            continue

        # Boundary does not line up with a trailer cut: stop here and trim the tail.
        trim_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / _scene_fps_light(entry, cfg))
        return max(0.0, offset_at_boundary - trim_s)

    return max(0.0, reached_end - in_point_s)
+ """ + from dataclasses import replace + + scenes = _load_scene_cache_light(cfg) + if not scenes: + return results + + beats_by_id = {b.beat_id: b for b in beats} + normalized = [] + for result in results: + beat = beats_by_id.get(result.beat_id) + if result.match_score < cfg.cv.deep_scan.provisional_match_threshold: + continue + + scene = _scene_for_time_light(scenes, result.in_point_s, cfg) + declared_scene = _scene_by_id_light(scenes, result.scene_id) + + # If the automatic matcher selected a scene but its in-point sits just + # before that scene's detected start, treat this as scene-boundary drift + # and clamp to the declared scene. This is generic: no beat IDs, no + # manual timestamps, just consistent scene/time reconciliation. + if declared_scene is not None: + declared_start = float(declared_scene["start_s"]) + declared_end = float(declared_scene["end_s"]) + declared_fps = _scene_fps_light(declared_scene, cfg) + boundary_tolerance_s = ( + cfg.cv.deep_scan.scene_boundary_epsilon_s + + cfg.cv.deep_scan.start_preroll_frames / declared_fps + ) + if declared_start - boundary_tolerance_s <= result.in_point_s < declared_end: + scene = declared_scene + + if beat is None or scene is None: + normalized.append(result) + continue + + fps = _scene_fps_light(scene, cfg) + adjusted_in_s = result.in_point_s + scene_changed = int(scene["scene_id"]) != result.scene_id + starts_before_scene = result.in_point_s < float(scene["start_s"]) + if scene_changed or starts_before_scene or result.duration_s <= 0.12: + adjusted_in_s = max(0.0, result.in_point_s - (cfg.cv.deep_scan.start_preroll_frames / fps)) + adjusted_in_s = max(float(scene["start_s"]), adjusted_in_s) + scene = _scene_for_time_light(scenes, adjusted_in_s, cfg) or scene + fps = _scene_fps_light(scene, cfg) + + matchable_duration_s = beat.duration_s + try: + from src.cv.global_scan import estimate_matchable_reference_duration + matchable_duration_s = estimate_matchable_reference_duration(beat, cfg) + except Exception: 
def _build_transcribe_callback(cfg):
    """Return a closure that transcribes a time range of a video using ``cfg``.

    This function always returns a callable; it never returns None. Skipping
    audio is decided by the caller (``cmd_analyze`` passes ``None`` instead of
    calling this helper when ``--no-audio`` is set).
    """
    # Imported lazily, in line with the module's deferred heavy imports.
    from src.audio.transcriber import transcribe_video

    def _cb(path, start_s, end_s, offset_s):
        # The positional offset_s is forwarded as the time_offset_s keyword.
        return transcribe_video(path, cfg, start_s=start_s, end_s=end_s, time_offset_s=offset_s)

    return _cb
Run 'analyze' first.") + + raw = json.loads(p.read_text(encoding="utf-8")) + beats = [] + for d in raw: + dialogue = tuple( + DialogueLine(start_s=x["start_s"], end_s=x["end_s"], text=x["text"]) + for x in d.get("dialogue", []) + ) + beats.append(TrailerBeat( + beat_id=d["beat_id"], + trailer_path=cfg.paths.reference_trailer, + start_s=d["start_s"], + end_s=d["end_s"], + start_frame=d["start_frame"], + end_frame=d["end_frame"], + beat_type=BeatType[d.get("beat_type", "UNKNOWN")], + dialogue=dialogue, + phash=d.get("phash"), + luma_hist=bytes.fromhex(d["luma_hist"]) if d.get("luma_hist") else None, + sat_hist= bytes.fromhex(d["sat_hist"]) if d.get("sat_hist") else None, + )) + return beats + + +def _select_beats(beats: list, beat_id: int | None) -> list: + """Return all beats or exactly one requested beat.""" + if beat_id is None: + return beats + selected = [b for b in beats if b.beat_id == beat_id] + if not selected: + raise ValueError(f"Beat {beat_id} not found. Run 'analyze' first.") + return selected + + +def _select_results(results: list, beat_ids: set[int] | None) -> list: + """Return all results or only results for the requested beats.""" + if beat_ids is None: + return results + return [r for r in results if r.beat_id in beat_ids] + + +def _find_scene_for_in_point(cfg, in_point_s: float): + from src.cv.scene_indexer import build_scene_index + + scenes = build_scene_index(cfg) + for idx, scene in enumerate(scenes): + if scene.start_s <= in_point_s < scene.end_s: + if ( + scene.end_s - in_point_s <= cfg.cv.deep_scan.scene_boundary_epsilon_s + and idx + 1 < len(scenes) + ): + return scenes[idx + 1] + return scene + return None + + +def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]: + """Find visible source-matchable islands inside a trailer beat.""" + from src.cv.frame_extractor import grab_frame_at_path + from src.cv.global_scan import _is_scoreable_reference_frame + + step_s = max(0.08, cfg.cv.deep_scan.span_sample_step_s) + 
def _attach_visual_segments(results: list, beats: list, cfg) -> list:
    """Attach automatic sub-shot matches for multi-island trailer beats.

    For every result whose beat is known, the beat is probed for scoreable
    "islands" via ``_reference_scoreable_segments``:

    * zero or one island: the result gets a single segment mirroring the
      result itself (offset 0, the result's own in/out points);
    * several islands: the first island reuses the existing match, and each
      further island is matched on its own with ``run_global_scan``. Islands
      for which the scan returns nothing are left out.

    Results whose ``beat_id`` is not present in *beats* pass through untouched.

    Args:
        results: Match results to expand.
        beats: Trailer beats, looked up by ``beat_id``.
        cfg: Application configuration, forwarded to the scan helpers.

    Returns:
        A new list of results, in input order, with ``segments`` populated.
    """
    from dataclasses import replace
    from src.core.models import MatchResult, MatchSegment
    from src.cv.global_scan import run_global_scan

    def _segment(offset_s, length_s, match, out_s):
        # All segment fields except offset/length/out-point come from the match.
        return MatchSegment(
            trailer_offset_s=offset_s,
            duration_s=length_s,
            scene_id=match.scene_id,
            in_point_s=match.in_point_s,
            out_point_s=out_s,
            match_score=match.match_score,
            is_confirmed=match.is_confirmed,
        )

    beat_lookup = {beat.beat_id: beat for beat in beats}
    expanded: list[MatchResult] = []

    for result in results:
        beat = beat_lookup.get(result.beat_id)
        if beat is None:
            expanded.append(result)
            continue

        islands = _reference_scoreable_segments(beat, cfg)
        whole_duration = max(0.0, result.duration_s)

        if len(islands) < 2:
            only = _segment(0.0, whole_duration, result, result.out_point_s)
            expanded.append(replace(result, segments=(only,)))
            continue

        # First island: keep the existing match, capped to the island length.
        head_start, head_end = islands[0]
        head_duration = min(whole_duration, max(0.0, head_end - head_start))
        segments: list[MatchSegment] = [
            _segment(head_start, head_duration, result, result.in_point_s + head_duration)
        ]

        # Remaining islands: scan each one as if it were its own short beat.
        for island_start, island_end in islands[1:]:
            sub_beat = replace(
                beat,
                start_s=beat.start_s + island_start,
                end_s=beat.start_s + island_end,
            )
            found = run_global_scan([sub_beat], cfg, seed_in_points=None)
            if found:
                best = found[0]
                length = min(max(0.0, island_end - island_start), max(0.0, best.duration_s))
                segments.append(_segment(island_start, length, best, best.in_point_s + length))

        expanded.append(replace(result, segments=tuple(segments)))

    return expanded
+ if getattr(args, "beat", None) is not None and _results_cache_path(cfg).exists(): + cached = [r for r in cached if r.beat_id != args.beat] + for result in results: + cached = _update_result(result, cached) + results_to_save = cached + else: + results_to_save = results + + _save_results(results_to_save, cfg) + + print(f"\n✅ {len(results)} / {len(beats)} beats matched.") + for r in results: + print(f" Beat {r.beat_id:03d} → scene {r.scene_id:04d} " + f"in={r.in_point_s:>8.3f}s score={r.match_score:.3f}") + return results + + +def _update_result(new_result, results: list) -> list: + """Replace or insert a MatchResult in the list (by beat_id).""" + updated = [r for r in results if r.beat_id != new_result.beat_id] + updated.append(new_result) + return sorted(updated, key=lambda r: r.beat_id) + + +def _continuity_seed_in_points(beat_id: int, beats: list, results: list, cfg) -> dict[int, list[float | tuple[float, float]]]: + beats_by_id = {b.beat_id: b for b in beats} + results_by_id = {r.beat_id: r for r in results} + target = beats_by_id.get(beat_id) + if target is None: + return {} + + seeds: list[tuple[float, float]] = [] + base_score = max(cfg.cv.deep_scan.coarse_candidate_threshold + 0.08, 0.92) + prev_matches = [ + (b, results_by_id[b.beat_id]) + for b in beats + if b.beat_id < beat_id and b.beat_id in results_by_id + ] + if prev_matches: + prev_beat, prev_result = max(prev_matches, key=lambda item: item[0].beat_id) + trailer_gap_s = max(0.0, target.start_s - prev_beat.end_s) + expected = prev_result.out_point_s + trailer_gap_s + for offset in cfg.cv.deep_scan.continuity_seed_offsets_s: + offset_score = max( + cfg.cv.deep_scan.coarse_candidate_threshold, + base_score - abs(offset) * 0.06, + ) + seeds.append((expected + offset, offset_score)) + + next_matches = [ + (b, results_by_id[b.beat_id]) + for b in beats + if b.beat_id > beat_id and b.beat_id in results_by_id + ] + if next_matches: + next_beat, next_result = min(next_matches, key=lambda item: 
def cmd_rematch(args: argparse.Namespace, cfg) -> None:
    """
    Re-run automatic matching for ONE beat.

        python cli.py rematch --beat 5                    # re-scan CV for beat 5
        python cli.py rematch --beat 5 --threshold 0.40   # relax threshold
        python cli.py rematch --beat 5 --refine           # nudge the cached in-point

    Two modes, selected by ``args.refine``:

    * refine: re-align the cached match with ``align_cached_match_by_content``,
      keep its previous duration (capped at the end of the scene the new
      in-point falls into), and store it unless the duration coverage drops
      below ``cfg.cv.deep_scan.min_duration_coverage``.
    * default: run ``run_global_scan`` for the beat, seeded from neighbouring
      cached matches, optionally with ``match_threshold`` overridden by
      ``args.threshold``.

    Args:
        args: Parsed CLI namespace (``beat``, ``refine``, ``refine_window``,
            ``threshold``).
        cfg: Application configuration.

    Returns:
        None. The outcome is printed and, on success, the results cache is
        rewritten.
    """

    beat_id = args.beat
    beats = _load_beats(cfg)
    results = _load_results(cfg) if _results_cache_path(cfg).exists() else []

    beat = next((b for b in beats if b.beat_id == beat_id), None)
    if beat is None:
        print(f"\u274c Beat {beat_id} not found. Run 'analyze' first.")
        return

    # ---- Refine an already acceptable cached match -------------------------
    if args.refine:
        current = next((r for r in results if r.beat_id == beat_id), None)
        if current is None:
            print(f"❌ Beat {beat_id} has no cached match to refine. Run 'match --beat {beat_id}' first.")
            return

        from src.cv.content_align import align_cached_match_by_content
        refined_in_s, sequence_score = align_cached_match_by_content(
            beat,
            current.in_point_s,
            cfg,
            search_window_s=args.refine_window,
        )
        # Clamp once, up front. Previously only the stored in-point was
        # clamped, so a negative alignment result produced an out-point and a
        # coverage figure measured from before the start of the source.
        refined_in_s = max(0.0, refined_in_s)

        usable_duration_s = max(0.0, current.out_point_s - current.in_point_s)
        scene_data = _scene_for_time_light(_load_scene_cache_light(cfg), refined_in_s, cfg)
        out_point_s = refined_in_s + usable_duration_s
        if scene_data is not None:
            # Never let the refined clip run past the end of its scene.
            out_point_s = min(out_point_s, float(scene_data["end_s"]))

        matchable_duration_s = beat.duration_s
        duration_coverage = (
            max(0.0, out_point_s - refined_in_s) / matchable_duration_s
            if matchable_duration_s > 0 else 0.0
        )
        if duration_coverage < cfg.cv.deep_scan.min_duration_coverage:
            print(
                f"❌ Beat {beat_id} refined candidate rejected: "
                f"duration coverage {duration_coverage:.0%} < "
                f"{cfg.cv.deep_scan.min_duration_coverage:.0%}"
            )
            return

        # Best effort: fall back to the export frame rate when the source
        # movie cannot be probed (or reports 0 fps).
        try:
            from src.cv.frame_extractor import get_video_info
            fps = float(get_video_info(cfg.paths.source_movie)["fps"]) or cfg.export.edl_frame_rate
        except Exception:
            fps = cfg.export.edl_frame_rate

        from src.core.models import MatchResult
        refined = MatchResult(
            beat_id=beat_id,
            scene_id=int(scene_data["scene_id"]) if scene_data is not None else current.scene_id,
            source_path=current.source_path,
            in_point_s=refined_in_s,
            out_point_s=out_point_s,
            in_point_frame=int(refined_in_s * fps),
            match_score=sequence_score,
            match_location=current.match_location,
            is_confirmed=sequence_score >= cfg.cv.deep_scan.match_threshold,
        )
        results = _update_result(refined, results)
        _save_results(results, cfg)
        print(
            f"✅ Beat {beat_id} refined → "
            f"in={refined.in_point_s:.3f}s, out={refined.out_point_s:.3f}s, "
            f"sequence_score={refined.match_score:.3f}"
        )
        return

    # ---- Re-run CV with optional threshold override ------------------------
    from dataclasses import replace as dc_replace
    run_cfg = cfg
    if args.threshold is not None:
        run_cfg = dc_replace(
            cfg,
            cv=dc_replace(
                cfg.cv,
                deep_scan=dc_replace(cfg.cv.deep_scan, match_threshold=args.threshold),
            ),
        )
        print(f"ℹ️ threshold overridden to {args.threshold} for beat {beat_id}")

    from src.cv.global_scan import run_global_scan
    seed_in_points = _continuity_seed_in_points(beat_id, beats, results, run_cfg)
    matches = run_global_scan([beat], run_cfg, seed_in_points=seed_in_points)

    if not matches:
        print(f"❌ Beat {beat_id}: no match. Try --threshold 0.40.")
        return

    match = matches[0]
    results = _update_result(match, results)
    _save_results(results, cfg)
    print(f"✅ Beat {beat_id} rematched → (in={match.in_point_s:.3f}s, score={match.match_score:.3f})")
" + f"Run: python cli.py match --beat {args.beat}" + ) + print(f"\n\u2705 Report \u2192 {out}") + + +def cmd_export(args: argparse.Namespace, cfg) -> None: + from src.export.edl_writer import write_edl + from src.export.fcpxml_writer import write_fcpxml + from src.pipeline.matcher import build_timeline + + beats = _select_beats(_load_beats(cfg), getattr(args, "beat", None)) + beat_ids = {b.beat_id for b in beats} if getattr(args, "beat", None) is not None else None + results = _select_results(_normalize_cached_results(_load_beats(cfg), _load_results(cfg), cfg), beat_ids) + if getattr(args, "beat", None) is not None and not results: + print(f"❌ Beat {args.beat} has no cached match. Run 'match --beat {args.beat}' first.") + return + timeline = build_timeline(beats, results, cfg) + + fmt = args.format or cfg.export.output_format + beat_id = getattr(args, "beat", None) + out_stem = ( + f"{cfg.paths.reference_trailer.stem}_beat_{beat_id:03d}" + if beat_id is not None + else timeline.title + ) + + if fmt in ("fcpxml", "both"): + out = write_fcpxml(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.fcpxml") + print(f"✅ FCPXML → {out}") + + if fmt in ("edl", "both"): + out = write_edl(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.edl") + print(f"✅ EDL → {out}") + + +def cmd_run(args: argparse.Namespace, cfg) -> None: + """Full pipeline: analyze → match → report → export.""" + cmd_analyze(args, cfg) + cmd_match(args, cfg) + cmd_report(args, cfg) + cmd_export(args, cfg) + + +# --------------------------------------------------------------------------- +# Argument parser +# --------------------------------------------------------------------------- + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="ai-trailer", + description="AI Trailer Generator v2 — Pure CV scene matching", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--config", type=Path, 
default=Path("config.toml"), + metavar="CONFIG", help="Path to config.toml (default: ./config.toml)", + ) + parser.add_argument( + "--log-level", default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + help="Logging verbosity (default: INFO)", + ) + + sub = parser.add_subparsers(dest="command", required=True) + + # analyze + p_analyze = sub.add_parser("analyze", help="Detect trailer beats + fingerprint") + p_analyze.add_argument("--no-audio", action="store_true", + help="Skip Whisper (only affects beat labels, not matching)") + p_analyze.add_argument("--no-llm", action="store_true", + help="Skip LLM classification (only affects beat labels)") + + # match + p_match = sub.add_parser("match", help="Run 2-phase CV matching") + p_match.add_argument("--force-reindex", action="store_true", + help="Ignore scene cache and re-run PySceneDetect") + p_match.add_argument("--beat", type=int, + help="Match only one beat and merge it into the cached results") + p_match.add_argument("--vision", action="store_true", + help="Enable cached vision descriptions for extra automatic search seeds") + p_match.add_argument("--no-vision", action="store_true", + help="Disable vision seeding even if [vision].enabled is true") + + # rematch + p_rematch = sub.add_parser("rematch", help="Re-run or override matching for one beat") + p_rematch.add_argument("--beat", type=int, required=True, help="Beat ID to rematch") + p_rematch.add_argument("--threshold", type=float, default=None, help="Override match_threshold") + p_rematch.add_argument("--refine", action="store_true", + help="Refine the cached match by measuring a local image-content offset") + p_rematch.add_argument("--refine-window", type=float, default=None, + help="Seconds to search around the cached in-point when using --refine") + + # report + p_report = sub.add_parser("report", help="Generate HTML visual comparison report") + p_report.add_argument("--beat", type=int, help="Report only one beat") + + # export + p_export = 
sub.add_parser("export", help="Export timeline from cached results") + p_export.add_argument("--format", choices=["fcpxml", "edl", "both"], + help="Override [export] output_format from config") + p_export.add_argument("--beat", type=int, help="Export only one beat") + + # run + p_run = sub.add_parser("run", help="Full pipeline: analyze → match → export") + p_run.add_argument("--no-audio", action="store_true") + p_run.add_argument("--no-llm", action="store_true") + p_run.add_argument("--force-reindex", action="store_true") + p_run.add_argument("--vision", action="store_true") + p_run.add_argument("--no-vision", action="store_true") + p_run.add_argument("--format", choices=["fcpxml", "edl", "both"]) + p_run.add_argument("--beat", type=int, + help="Run match/report/export for only one cached beat") + + return parser + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main() -> None: + _ensure_utf8_console() + parser = _build_parser() + args = parser.parse_args() + + _setup_logging(args.log_level) + + from src.core.config import load_config + cfg = load_config(args.config) + + dispatch = { + "analyze": cmd_analyze, + "match": cmd_match, + "rematch": cmd_rematch, + "report": cmd_report, + "export": cmd_export, + "run": cmd_run, + } + + handler = dispatch[args.command] + handler(args, cfg) + + +if __name__ == "__main__": + main() diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..d3d159c --- /dev/null +++ b/config.toml @@ -0,0 +1,198 @@ +# ============================================================================= +# AI Trailer Generator v2 — Central Configuration +# ============================================================================= +# All tunable parameters, thresholds, and file paths are defined here. +# NO hardcoded values are allowed in the Python source code. 
+# ============================================================================= + +[project] +name = "AI Trailer Generator v2" +version = "2.0.0" +log_level = "INFO" # DEBUG | INFO | WARNING | ERROR + +# ----------------------------------------------------------------------------- +# [paths] — External video sources (read-only access) +# ----------------------------------------------------------------------------- +[paths] +source_movie = "B:/Proxy/BehindTheRedDoor_FTR_1080P_2398_Fixed.mp4" +reference_trailer = "F:/Encodings/BehindTheRedDoor_Trailer_REFERENCE.mp4" + +# Output destinations (inside project sandbox) +output_dir = "output" +cache_dir = ".cache" +proxy_dir = "proxy" + +# ----------------------------------------------------------------------------- +# [video] — Decode / proxy settings +# ----------------------------------------------------------------------------- +[video] +# Target FPS for internal frame extraction (0 = use source FPS) +extract_fps = 1.0 +# Proxy resolution for template matching (width x height) +proxy_width = 640 +proxy_height = 360 + +# ----------------------------------------------------------------------------- +# [cv] — Computer Vision engine parameters +# Phase 1 — "Vibe Check" (histogram / perceptual hash scene-level filter) +# Phase 2 — "Deep Scan" (template matching frame-level precision) +# ----------------------------------------------------------------------------- +[cv] + +[cv.vibe_check] +# Number of top candidate scenes to forward to Deep Scan +top_k_candidates = 100 + +# Histogram comparison method: +# CORREL=0 | CHISQR=1 | INTERSECT=2 | BHATTACHARYYA=3 +hist_compare_method = 0 + +# Histogram bins per channel (hue, saturation) +hist_bins_hue = 50 +hist_bins_saturation = 60 + +# pHash similarity threshold (lower = stricter; 0–64 range) +# NOTE: 12 is for near-duplicate detection. Cross-video matching +# (trailer vs source movie with different grading/compression) +# needs 25–35. 
Start at 32 and tighten if you get false positives. +phash_max_distance = 32 + +# ---- Text-Safe Crop ------------------------------------------------------- +# Fraction of frame height to EXCLUDE from the top (e.g. logos, title cards) +crop_top_fraction = 0.15 +# Fraction of frame height to EXCLUDE from the bottom (e.g. letterbox, subs) +crop_bottom_fraction = 0.30 + +[cv.deep_scan] +# Step size in SECONDS between sampled frames during the coarse scan pass +coarse_step_seconds = 0.5 + +# Minimum template match score (0.0–1.0) to accept a candidate as a hit +match_threshold = 0.65 + +# Store/report lower-confidence automatic candidates for visual review instead +# of dropping them as "NO MATCH". Confirmed exports can still use match_threshold. +provisional_match_threshold = 0.45 + +# Lower gate for entering temporal multi-frame refinement. The final decision +# still uses sequence/span scoring; this only avoids rejecting real matches +# because one midpoint frame is weak. +coarse_candidate_threshold = 0.50 + +# Candidate ranking weights. Duration coverage matters when the same visual +# shot appears multiple times: prefer the occurrence that can cover the beat. 
+sequence_score_weight = 0.55 +span_score_weight = 0.15 +coarse_score_weight = 0.10 +duration_score_weight = 0.20 +duration_tie_break_score_delta = 0.03 +min_duration_coverage = 0.65 +continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0] +scene_seed_top_k = 30 +scene_seed_points_per_scene = 6 +content_rerank_candidate_count = 100 +skip_coarse_scan_with_weighted_seeds = false + +# cv2.matchTemplate method: +# TM_CCOEFF_NORMED=5 (recommended), TM_CCORR_NORMED=3 +match_method = 5 + +# If a coarse hit is found, refine by scanning ± this many seconds +refine_window_seconds = 0.6 +refine_step_seconds = 0.04 # ≈ 1 frame at 25 fps +content_align_window_seconds = 0.48 +content_align_sample_step_s = 0.28 +content_validation_weight = 0.35 +provisional_content_threshold = 0.42 + +# When several adjacent frame offsets score almost the same, prefer the earlier +# one. This avoids matches that are visually correct but start a few frames late. +start_tie_break_score_delta = 0.015 +start_preroll_frames = 0 + +# Automatic temporal verification after a coarse image hit. +# More candidates reduce false positives from visually similar shots. +sequence_candidate_count = 240 +sequence_min_distance_s = 1.0 +max_refine_candidates = 6 + +# Match-span detection: trim when the source starts drifting into a different shot. +span_sample_step_s = 0.08 +trim_tail_frames = 4 + +# If a refined in-point lands this close to a detected scene end, treat it as +# the next scene. Scene detectors often place cuts a frame or two around the +# visible boundary. 
+scene_boundary_epsilon_s = 0.12 +scoreable_luma_mean_min = 24.0 +scoreable_luma_p90_min = 58.0 +scoreable_contrast_min = 24.0 + +# ----------------------------------------------------------------------------- +# [scene_detection] — PySceneDetect parameters (used to segment source movie) +# ----------------------------------------------------------------------------- +[scene_detection] +# Threshold for ContentDetector (lower = more sensitive) +content_threshold = 27.0 +# Minimum scene duration in seconds +min_scene_duration_s = 1.5 + +# ----------------------------------------------------------------------------- +# [whisper] — Dialogue / audio analysis +# ----------------------------------------------------------------------------- +[whisper] +model = "large-v3" +language = "ar" +device = "cuda" # cuda | cpu +compute_type = "float16" # float16 | int8 | float32 + +# ----------------------------------------------------------------------------- +# [llm] — Used ONLY for thematic segmentation / dramaturgy +# ----------------------------------------------------------------------------- +[llm] +provider = "openrouter" +base_url = "https://openrouter.ai/api/v1" +model = "google/gemma-4-31b-it" +timeout_seconds = 120 +temperature = 0.3 +max_tokens = 4096 + +# ----------------------------------------------------------------------------- +# [vision] — Optional cached visual descriptions for ambiguous matching +# ----------------------------------------------------------------------------- +[vision] +# Disabled by default to avoid surprise API cost. Enable when you want the +# matcher to ask a vision-capable model for cached 3-frame scene descriptions. 
[build-system]
requires = ["setuptools>=69", "wheel"]
# "setuptools.build_meta" is the PEP 517 backend shipped with setuptools.
# The previous value ("setuptools.backends.legacy:build") names a module that
# does not exist, so every `pip install .` / `python -m build` failed while
# importing the backend.
build-backend = "setuptools.build_meta"
+[project.optional-dependencies] +dev = [ + "pytest>=8.0", + "pytest-cov", + "mypy>=1.9", + "ruff>=0.4", +] + +[tool.setuptools.packages.find] +where = ["."] +include = ["src*"] + +# --------------------------------------------------------------------------- +# Ruff (linter + formatter) +# --------------------------------------------------------------------------- +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = ["E", "F", "I", "UP", "B", "C4", "ANN"] +ignore = ["ANN101", "ANN102"] + +# --------------------------------------------------------------------------- +# Mypy +# --------------------------------------------------------------------------- +[tool.mypy] +python_version = "3.11" +strict = true +ignore_missing_imports = true + +# --------------------------------------------------------------------------- +# Pytest +# --------------------------------------------------------------------------- +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-v --tb=short" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8c67a95 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,37 @@ +# AI Trailer Generator v2 — Python Dependencies +# Generated from: pip freeze (Python 3.11, Windows) +# Install with: pip install -r requirements.txt +# +# NOTE: faster-whisper and scenedetect may pull in torch/cuda extras +# depending on your platform. See README for CUDA setup. 
function Resolve-ProjectPython {
    # Locate a Python interpreter for creating the project venv.
    # A `python` found by Get-Command wins; otherwise two well-known per-user
    # install locations are probed in order. Throws when nothing is found.
    # The version itself is checked by the caller, not here.
    $onPath = Get-Command python -ErrorAction SilentlyContinue

    if (-not $onPath) {
        $fallbacks = @(
            "$env:LOCALAPPDATA\Programs\Python\Python311\python.exe",
            "$env:LOCALAPPDATA\Microsoft\WindowsApps\python.exe"
        )

        foreach ($path in $fallbacks) {
            if (-not $path) {
                continue
            }
            if (Test-Path $path) {
                return $path
            }
        }

        throw "Python 3.11+ not found. Install Python 3.11+ or add it to PATH."
    }

    return $onPath.Source
}
Check Python version ------------------------------------------------ +$PROJECT_PYTHON = Resolve-ProjectPython +$pythonVersion = & $PROJECT_PYTHON --version 2>&1 +Write-Host "Python: $pythonVersion" +if ($pythonVersion -notmatch "3\.(1[1-9]|[2-9]\d)") { + Write-Error "Python 3.11+ required. Found: $pythonVersion" + exit 1 +} + +# ---- 2. Create venv --------------------------------------------------------- +if (Test-Path $VENV_DIR) { + Write-Host "Virtual environment already exists at '$VENV_DIR'. Skipping creation." -ForegroundColor Yellow +} else { + Write-Host "Creating virtual environment in '$VENV_DIR' ..." -ForegroundColor Green + & $PROJECT_PYTHON -m venv $VENV_DIR + Write-Host "Done." -ForegroundColor Green +} + +# ---- 3. Activate venv ------------------------------------------------------- +$activate = Join-Path $VENV_DIR "Scripts\Activate.ps1" +Write-Host "Activating virtual environment ..." +. $activate +$VENV_PYTHON = Join-Path $VENV_DIR "Scripts\python.exe" + +# ---- 4. Upgrade pip --------------------------------------------------------- +Write-Host "Upgrading pip ..." -ForegroundColor Green +& $VENV_PYTHON -m pip install --upgrade pip --quiet + +# ---- 5. Install dependencies ------------------------------------------------ +Write-Host "Installing dependencies from requirements.txt ..." -ForegroundColor Green +& $VENV_PYTHON -m pip install -r requirements.txt + +# ---- 6. Copy .env if missing ------------------------------------------------ +if (-not (Test-Path ".env")) { + if (Test-Path ".env.example") { + Copy-Item ".env.example" ".env" + Write-Host "" + Write-Host " .env created from .env.example." -ForegroundColor Yellow + Write-Host " >>> Open .env and fill in your OPENROUTER_API_KEY! <<<" -ForegroundColor Red + } +} + +# ---- 7. Done ---------------------------------------------------------------- +Write-Host "" +Write-Host "==================================================" -ForegroundColor Cyan +Write-Host " Setup complete!" 
-ForegroundColor Green +Write-Host "" +Write-Host " Activate the venv with:" +Write-Host " .\.venv\Scripts\Activate.ps1" -ForegroundColor White +Write-Host "" +Write-Host " Then run the pipeline:" +Write-Host " python cli.py run --no-audio --no-llm" -ForegroundColor White +Write-Host "==================================================" -ForegroundColor Cyan +Write-Host "" diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..521670b --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +# src package diff --git a/src/audio/__init__.py b/src/audio/__init__.py new file mode 100644 index 0000000..20dc2fc --- /dev/null +++ b/src/audio/__init__.py @@ -0,0 +1 @@ +# src.audio package — Whisper / dialogue analysis diff --git a/src/audio/transcriber.py b/src/audio/transcriber.py new file mode 100644 index 0000000..95be4d0 --- /dev/null +++ b/src/audio/transcriber.py @@ -0,0 +1,182 @@ +""" +src/audio/transcriber.py — Whisper transcription via faster-whisper + +Responsibility: + - Transcribe audio from a video file into a list of DialogueLine objects + - Optionally restrict to a time window [start_s, end_s] (for single beats) + - All model config (model name, device, compute_type) comes from AppConfig + +The LLM is NOT used here. This is pure audio-to-text. +""" + +from __future__ import annotations + +import logging +import tempfile +from pathlib import Path +from typing import Sequence + +from src.core.config import AppConfig +from src.core.models import DialogueLine + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Audio extraction helper (video → wav via ffmpeg) +# --------------------------------------------------------------------------- + +def _extract_audio_segment( + video_path: Path, + start_s: float | None, + end_s: float | None, + out_wav: Path, +) -> None: + """ + Use ffmpeg (subprocess) to extract a mono 16kHz WAV from *video_path*. 
+ + Args: + video_path: Source video. + start_s: Start time in seconds (None = beginning of file). + end_s: End time in seconds (None = end of file). + out_wav: Destination WAV path. + + Raises: + RuntimeError: If ffmpeg exits with a non-zero code. + """ + import subprocess + + cmd = ["ffmpeg", "-y", "-loglevel", "error"] + + if start_s is not None: + cmd += ["-ss", str(start_s)] + if end_s is not None and start_s is not None: + cmd += ["-t", str(end_s - start_s)] + elif end_s is not None: + cmd += ["-to", str(end_s)] + + cmd += [ + "-i", str(video_path), + "-vn", # no video + "-ac", "1", # mono + "-ar", "16000", # 16 kHz — Whisper native rate + "-f", "wav", + str(out_wav), + ] + + result = subprocess.run(cmd, capture_output=True) + if result.returncode != 0: + raise RuntimeError( + f"ffmpeg failed (code {result.returncode}):\n" + f"{result.stderr.decode(errors='replace')}" + ) + + +# --------------------------------------------------------------------------- +# Core transcription +# --------------------------------------------------------------------------- + +def transcribe_video( + video_path: Path, + cfg: AppConfig, + start_s: float | None = None, + end_s: float | None = None, + time_offset_s: float = 0.0, +) -> list[DialogueLine]: + """ + Transcribe dialogue from *video_path* using faster-whisper. + + Args: + video_path: Path to source or trailer video. + cfg: Application configuration (whisper section). + start_s: Clip start in video-file seconds (None = beginning). + end_s: Clip end in video-file seconds (None = end of file). + time_offset_s: Added to every transcript timestamp so that beat-level + transcripts align with absolute movie time. + + Returns: + List of DialogueLine ordered by start time. + """ + try: + from faster_whisper import WhisperModel + except ImportError: + raise ImportError("faster-whisper not installed. 
Run: pip install faster-whisper") + + w = cfg.whisper + + logger.info( + "Transcribing %s [%.1f–%s] with %s on %s …", + video_path.name, + start_s or 0.0, + f"{end_s:.1f}s" if end_s else "end", + w.model, + w.device, + ) + + with tempfile.TemporaryDirectory() as tmp: + wav = Path(tmp) / "audio.wav" + _extract_audio_segment(video_path, start_s, end_s, wav) + + model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type) + segments, _ = model.transcribe( + str(wav), + language=w.language if w.language else None, + beam_size=5, + ) + + lines: list[DialogueLine] = [] + for seg in segments: + lines.append(DialogueLine( + start_s=seg.start + time_offset_s, + end_s=seg.end + time_offset_s, + text=seg.text.strip(), + )) + + logger.info("Transcription done: %d segments.", len(lines)) + return lines + + +# --------------------------------------------------------------------------- +# Convenience: transcribe a whole file and return grouped by scene +# --------------------------------------------------------------------------- + +def transcribe_full_movie( + cfg: AppConfig, +) -> list[DialogueLine]: + """ + Transcribe the entire source movie. Use this result to enrich Scenes + via a dialogue_callback passed to build_scene_index(). + """ + return transcribe_video(cfg.paths.source_movie, cfg) + + +def assign_dialogue_to_scenes( + all_dialogue: Sequence[DialogueLine], + scenes: list["src.core.models.Scene"], # type: ignore[name-defined] +) -> list["src.core.models.Scene"]: # type: ignore[name-defined] + """ + Distribute pre-transcribed DialogueLines into their respective Scenes. + + A line is assigned to the scene whose window contains its midpoint. + + Args: + all_dialogue: Full-movie transcript as flat list. + scenes: Scene list (will be replaced with enriched copies). + + Returns: + New list of Scene objects with dialogue tuples populated. 
+ """ + from dataclasses import replace + from src.core.models import Scene + + enriched: list[Scene] = [] + for scene in scenes: + matched = tuple( + line for line in all_dialogue + if scene.start_s <= (line.start_s + line.end_s) / 2.0 < scene.end_s + ) + enriched.append(replace(scene, dialogue=matched)) + + total_assigned = sum(len(s.dialogue) for s in enriched) + logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched)) + return enriched diff --git a/src/core/__init__.py b/src/core/__init__.py new file mode 100644 index 0000000..61e4b74 --- /dev/null +++ b/src/core/__init__.py @@ -0,0 +1 @@ +# src.core package diff --git a/src/core/config.py b/src/core/config.py new file mode 100644 index 0000000..3e3f798 --- /dev/null +++ b/src/core/config.py @@ -0,0 +1,387 @@ +""" +src/core/config.py — Configuration loader for AI Trailer Generator v2 + +Loads config.toml and exposes typed, nested dataclasses. +All CV thresholds, paths, and model settings are sourced exclusively here. +API keys are NEVER stored in config.toml; they are loaded from .env. 
+""" + +from __future__ import annotations + +import os +import tomllib + +try: + from dotenv import load_dotenv as _load_dotenv + _HAS_DOTENV = True +except ImportError: # dotenv optional — falls back to existing env vars + _HAS_DOTENV = False +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + + +# --------------------------------------------------------------------------- +# Leaf sections +# --------------------------------------------------------------------------- + +@dataclass(frozen=True) +class PathsConfig: + source_movie: Path + reference_trailer: Path + output_dir: Path + cache_dir: Path + proxy_dir: Path + + +@dataclass(frozen=True) +class VideoConfig: + extract_fps: float + proxy_width: int + proxy_height: int + + +@dataclass(frozen=True) +class VibeCheckConfig: + top_k_candidates: int + hist_compare_method: int + hist_bins_hue: int + hist_bins_saturation: int + phash_max_distance: int + crop_top_fraction: float + crop_bottom_fraction: float + + +@dataclass(frozen=True) +class DeepScanConfig: + coarse_step_seconds: float + match_threshold: float + provisional_match_threshold: float + coarse_candidate_threshold: float + sequence_score_weight: float + span_score_weight: float + coarse_score_weight: float + duration_score_weight: float + duration_tie_break_score_delta: float + min_duration_coverage: float + continuity_seed_offsets_s: tuple[float, ...] 
+ scene_seed_top_k: int + scene_seed_points_per_scene: int + content_rerank_candidate_count: int + skip_coarse_scan_with_weighted_seeds: bool + max_refine_candidates: int + match_method: int + refine_window_seconds: float + refine_step_seconds: float + content_align_window_seconds: float + content_align_sample_step_s: float + content_validation_weight: float + provisional_content_threshold: float + start_tie_break_score_delta: float + start_preroll_frames: int + sequence_candidate_count: int + sequence_min_distance_s: float + span_sample_step_s: float + trim_tail_frames: int + scene_boundary_epsilon_s: float + scoreable_luma_mean_min: float + scoreable_luma_p90_min: float + scoreable_contrast_min: float + + +@dataclass(frozen=True) +class CVConfig: + vibe_check: VibeCheckConfig + deep_scan: DeepScanConfig + + +@dataclass(frozen=True) +class SceneDetectionConfig: + content_threshold: float + min_scene_duration_s: float + + +@dataclass(frozen=True) +class WhisperConfig: + model: str + language: str + device: Literal["cuda", "cpu"] + compute_type: Literal["float16", "int8", "float32"] + + +@dataclass(frozen=True) +class LLMConfig: + provider: Literal["ollama", "openai", "openrouter"] + base_url: str + model: str + timeout_seconds: int + temperature: float + max_tokens: int + # Loaded from .env — NEVER committed to version control + api_key: str = "" + + +@dataclass(frozen=True) +class VisionConfig: + enabled: bool + provider: Literal["openai", "openrouter"] + base_url: str + model: str + timeout_seconds: int + temperature: float + max_tokens: int + scene_candidate_top_k: int + max_new_descriptions_per_run: int + max_seed_scenes: int + seed_points_per_scene: int + seed_score: float + max_refine_candidates: int + local_scan_step_s: float + local_scan_max_points_per_scene: int + local_scan_top_candidates: int + local_scan_tie_break_score_delta: float + multi_shot_cut_corr_threshold: float + multi_shot_boundary_tolerance_s: float + fullscan_fallback: bool + 
content_threshold: float + similarity_threshold: float + api_key: str = "" + + +@dataclass(frozen=True) +class ExportConfig: + fcpxml_version: str + edl_frame_rate: float + output_format: Literal["fcpxml", "edl", "both"] + + +# --------------------------------------------------------------------------- +# Root config — single object passed through the entire application +# --------------------------------------------------------------------------- + +@dataclass(frozen=True) +class AppConfig: + project_name: str + version: str + log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] + + paths: PathsConfig + video: VideoConfig + cv: CVConfig + scene_detection: SceneDetectionConfig + whisper: WhisperConfig + llm: LLMConfig + vision: VisionConfig + export: ExportConfig + + +# --------------------------------------------------------------------------- +# Loader +# --------------------------------------------------------------------------- + +_DEFAULT_CONFIG_PATH = Path(__file__).parents[2] / "config.toml" +_DEFAULT_ENV_PATH = Path(__file__).parents[2] / ".env" + + +def load_config( + config_path: Path = _DEFAULT_CONFIG_PATH, + env_path: Path = _DEFAULT_ENV_PATH, +) -> AppConfig: + """ + Parse config.toml and return a fully-typed, immutable AppConfig. + + API keys are read from the .env file (or existing environment variables); + they are never stored in config.toml. + + Args: + config_path: Absolute or relative path to the TOML file. + Defaults to /config.toml. + env_path: Path to the .env file. + Defaults to /.env. + + Raises: + FileNotFoundError: If the TOML file does not exist. + KeyError / TypeError: If a required key is missing or has the wrong type. + """ + # Load .env first so os.environ is populated before we read it below. + if _HAS_DOTENV: + _load_dotenv(dotenv_path=env_path, override=False) + + if not config_path.exists(): + raise FileNotFoundError( + f"Config file not found: {config_path}\n" + "Copy config.toml.example to config.toml and adjust your paths." 
+ ) + + with config_path.open("rb") as fh: + raw: dict = tomllib.load(fh) + + project = raw["project"] + paths_raw = raw["paths"] + video_raw = raw["video"] + cv_raw = raw["cv"] + sd_raw = raw["scene_detection"] + whisper_raw = raw["whisper"] + llm_raw = raw["llm"] + vision_raw = raw.get("vision", {}) + export_raw = raw["export"] + + # Resolve paths relative to the config file's parent directory so the + # project is relocatable, but keep absolute paths as-is. + def _resolve(p: str) -> Path: + path = Path(p) + return path if path.is_absolute() else (config_path.parent / path).resolve() + + paths = PathsConfig( + source_movie=_resolve(paths_raw["source_movie"]), + reference_trailer=_resolve(paths_raw["reference_trailer"]), + output_dir=_resolve(paths_raw["output_dir"]), + cache_dir=_resolve(paths_raw["cache_dir"]), + proxy_dir=_resolve(paths_raw["proxy_dir"]), + ) + + video = VideoConfig( + extract_fps=float(video_raw["extract_fps"]), + proxy_width=int(video_raw["proxy_width"]), + proxy_height=int(video_raw["proxy_height"]), + ) + + vibe_check = VibeCheckConfig( + top_k_candidates=int(cv_raw["vibe_check"]["top_k_candidates"]), + hist_compare_method=int(cv_raw["vibe_check"]["hist_compare_method"]), + hist_bins_hue=int(cv_raw["vibe_check"]["hist_bins_hue"]), + hist_bins_saturation=int(cv_raw["vibe_check"]["hist_bins_saturation"]), + phash_max_distance=int(cv_raw["vibe_check"]["phash_max_distance"]), + crop_top_fraction=float(cv_raw["vibe_check"]["crop_top_fraction"]), + crop_bottom_fraction=float(cv_raw["vibe_check"]["crop_bottom_fraction"]), + ) + + deep_scan = DeepScanConfig( + coarse_step_seconds=float(cv_raw["deep_scan"]["coarse_step_seconds"]), + match_threshold=float(cv_raw["deep_scan"]["match_threshold"]), + provisional_match_threshold=float(cv_raw["deep_scan"].get("provisional_match_threshold", 0.45)), + coarse_candidate_threshold=float(cv_raw["deep_scan"].get("coarse_candidate_threshold", cv_raw["deep_scan"]["match_threshold"])), + 
sequence_score_weight=float(cv_raw["deep_scan"].get("sequence_score_weight", 0.55)), + span_score_weight=float(cv_raw["deep_scan"].get("span_score_weight", 0.15)), + coarse_score_weight=float(cv_raw["deep_scan"].get("coarse_score_weight", 0.10)), + duration_score_weight=float(cv_raw["deep_scan"].get("duration_score_weight", 0.20)), + duration_tie_break_score_delta=float(cv_raw["deep_scan"].get("duration_tie_break_score_delta", 0.03)), + min_duration_coverage=float(cv_raw["deep_scan"].get("min_duration_coverage", 0.65)), + continuity_seed_offsets_s=tuple( + float(v) for v in cv_raw["deep_scan"].get( + "continuity_seed_offsets_s", + [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0], + ) + ), + scene_seed_top_k=int(cv_raw["deep_scan"].get("scene_seed_top_k", 30)), + scene_seed_points_per_scene=int(cv_raw["deep_scan"].get("scene_seed_points_per_scene", 6)), + content_rerank_candidate_count=int(cv_raw["deep_scan"].get("content_rerank_candidate_count", 100)), + skip_coarse_scan_with_weighted_seeds=bool(cv_raw["deep_scan"].get("skip_coarse_scan_with_weighted_seeds", False)), + max_refine_candidates=int(cv_raw["deep_scan"].get("max_refine_candidates", 6)), + match_method=int(cv_raw["deep_scan"]["match_method"]), + refine_window_seconds=float(cv_raw["deep_scan"].get("refine_window_seconds", 0.6)), + refine_step_seconds=float(cv_raw["deep_scan"]["refine_step_seconds"]), + content_align_window_seconds=float(cv_raw["deep_scan"].get("content_align_window_seconds", 0.48)), + content_align_sample_step_s=float(cv_raw["deep_scan"].get("content_align_sample_step_s", 0.28)), + content_validation_weight=float(cv_raw["deep_scan"].get("content_validation_weight", 0.35)), + provisional_content_threshold=float(cv_raw["deep_scan"].get("provisional_content_threshold", 0.42)), + start_tie_break_score_delta=float(cv_raw["deep_scan"].get("start_tie_break_score_delta", 0.015)), + start_preroll_frames=int(cv_raw["deep_scan"].get("start_preroll_frames", 0)), + 
sequence_candidate_count=int(cv_raw["deep_scan"].get("sequence_candidate_count", 240)), + sequence_min_distance_s=float(cv_raw["deep_scan"].get("sequence_min_distance_s", 1.0)), + span_sample_step_s=float(cv_raw["deep_scan"].get("span_sample_step_s", 0.08)), + trim_tail_frames=int(cv_raw["deep_scan"].get("trim_tail_frames", 2)), + scene_boundary_epsilon_s=float(cv_raw["deep_scan"].get("scene_boundary_epsilon_s", 0.12)), + scoreable_luma_mean_min=float(cv_raw["deep_scan"].get("scoreable_luma_mean_min", 24.0)), + scoreable_luma_p90_min=float(cv_raw["deep_scan"].get("scoreable_luma_p90_min", 58.0)), + scoreable_contrast_min=float(cv_raw["deep_scan"].get("scoreable_contrast_min", 24.0)), + ) + + scene_detection = SceneDetectionConfig( + content_threshold=float(sd_raw["content_threshold"]), + min_scene_duration_s=float(sd_raw["min_scene_duration_s"]), + ) + + whisper = WhisperConfig( + model=whisper_raw["model"], + language=whisper_raw["language"], + device=whisper_raw["device"], + compute_type=whisper_raw["compute_type"], + ) + + # Resolve API key: env var takes precedence over config (which shouldn't have it). 
+ # Supported env vars (in priority order): + # OPENROUTER_API_KEY → for provider = openrouter + # OPENAI_API_KEY → for provider = openai + # LLM_API_KEY → universal fallback + _provider = llm_raw["provider"] + _api_key = ( + os.environ.get("OPENROUTER_API_KEY", "") + if _provider == "openrouter" + else os.environ.get("OPENAI_API_KEY", "") + if _provider == "openai" + else "" + ) or os.environ.get("LLM_API_KEY", "") + + llm = LLMConfig( + provider=_provider, + base_url=llm_raw["base_url"], + model=llm_raw["model"], + timeout_seconds=int(llm_raw["timeout_seconds"]), + temperature=float(llm_raw["temperature"]), + max_tokens=int(llm_raw["max_tokens"]), + api_key=_api_key, + ) + + vision_provider = vision_raw.get("provider", _provider if _provider in ("openai", "openrouter") else "openrouter") + vision_api_key = ( + os.environ.get("OPENROUTER_API_KEY", "") + if vision_provider == "openrouter" + else os.environ.get("OPENAI_API_KEY", "") + ) or os.environ.get("VISION_API_KEY", "") or os.environ.get("LLM_API_KEY", "") + + vision = VisionConfig( + enabled=bool(vision_raw.get("enabled", False)), + provider=vision_provider, + base_url=str(vision_raw.get("base_url", llm.base_url)), + model=str(vision_raw.get("model", llm.model)), + timeout_seconds=int(vision_raw.get("timeout_seconds", llm.timeout_seconds)), + temperature=float(vision_raw.get("temperature", 0.0)), + max_tokens=int(vision_raw.get("max_tokens", 350)), + scene_candidate_top_k=int(vision_raw.get("scene_candidate_top_k", 8)), + max_new_descriptions_per_run=int(vision_raw.get("max_new_descriptions_per_run", 12)), + max_seed_scenes=int(vision_raw.get("max_seed_scenes", 3)), + seed_points_per_scene=int(vision_raw.get("seed_points_per_scene", 12)), + seed_score=float(vision_raw.get("seed_score", 0.88)), + max_refine_candidates=int(vision_raw.get("max_refine_candidates", 6)), + local_scan_step_s=float(vision_raw.get("local_scan_step_s", 0.12)), + 
local_scan_max_points_per_scene=int(vision_raw.get("local_scan_max_points_per_scene", 180)), + local_scan_top_candidates=int(vision_raw.get("local_scan_top_candidates", 18)), + local_scan_tie_break_score_delta=float(vision_raw.get("local_scan_tie_break_score_delta", 0.08)), + multi_shot_cut_corr_threshold=float(vision_raw.get("multi_shot_cut_corr_threshold", 0.20)), + multi_shot_boundary_tolerance_s=float(vision_raw.get("multi_shot_boundary_tolerance_s", 0.20)), + fullscan_fallback=bool(vision_raw.get("fullscan_fallback", False)), + content_threshold=float(vision_raw.get("content_threshold", 0.22)), + similarity_threshold=float(vision_raw.get("similarity_threshold", 0.18)), + api_key=vision_api_key, + ) + + export = ExportConfig( + fcpxml_version=str(export_raw["fcpxml_version"]), + edl_frame_rate=float(export_raw["edl_frame_rate"]), + output_format=export_raw["output_format"], + ) + + return AppConfig( + project_name=project["name"], + version=project["version"], + log_level=project["log_level"], + paths=paths, + video=video, + cv=CVConfig(vibe_check=vibe_check, deep_scan=deep_scan), + scene_detection=scene_detection, + whisper=whisper, + llm=llm, + vision=vision, + export=export, + ) diff --git a/src/core/models.py b/src/core/models.py new file mode 100644 index 0000000..838609c --- /dev/null +++ b/src/core/models.py @@ -0,0 +1,287 @@ +""" +src/core/models.py — Canonical data models for AI Trailer Generator v2 + +Rules: + - Every model is a frozen dataclass (immutable after creation). + - All fields are strictly typed; no bare dicts or untyped lists. + - Seconds are always float; frame numbers are always int. + - Confidence scores live in [0.0, 1.0]. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum, auto +from pathlib import Path +from typing import Optional + + +# =========================================================================== +# Enumerations +# =========================================================================== + +class MatchMethod(Enum): + """CV template matching method (mirrors cv2.TM_* constants).""" + TM_SQDIFF = 0 + TM_SQDIFF_NORMED = 1 + TM_CCORR = 2 + TM_CCORR_NORMED = 3 + TM_CCOEFF = 4 + TM_CCOEFF_NORMED = 5 + + +class BeatType(Enum): + """Narrative role of a trailer beat (for dramaturgy / LLM use only).""" + HOOK = auto() # Opening attention grabber + SETUP = auto() # World / character introduction + CONFLICT = auto() # Inciting incident / rising tension + CLIMAX = auto() # Peak action / emotion + RESOLUTION = auto() # Cool-down / tagline + UNKNOWN = auto() + + +class ExportFormat(Enum): + FCPXML = "fcpxml" + EDL = "edl" + BOTH = "both" + + +# =========================================================================== +# Phase 0 — Source-movie scene index +# =========================================================================== + +@dataclass(frozen=True) +class DialogueLine: + """Single transcribed line from Whisper output.""" + start_s: float # onset in seconds + end_s: float # offset in seconds + text: str # verbatim transcript + speaker: Optional[str] = None # diarisation label if available + + @property + def duration_s(self) -> float: + return self.end_s - self.start_s + + +@dataclass(frozen=True) +class Scene: + """ + One detected scene in the source movie. + + Produced by PySceneDetect; enriched by Whisper dialogue and + (optionally) perceptual hashes during the Vibe Check phase. 
+ """ + scene_id: int # zero-based index in source movie + source_path: Path # absolute path to the source video file + start_s: float # scene start in seconds + end_s: float # scene end in seconds + start_frame: int # first frame number + end_frame: int # last frame number + + # Populated after Vibe Check fingerprinting + luma_hist: Optional[bytes] = None # serialised np.ndarray (pickle) + sat_hist: Optional[bytes] = None + phash: Optional[str] = None # 64-bit hex string + + # Populated after Whisper pass + dialogue: tuple[DialogueLine, ...] = field(default_factory=tuple) + + @property + def duration_s(self) -> float: + return self.end_s - self.start_s + + @property + def midpoint_s(self) -> float: + return self.start_s + self.duration_s / 2.0 + + def __repr__(self) -> str: + return ( + f"Scene(id={self.scene_id}, " + f"{self.start_s:.2f}s–{self.end_s:.2f}s, " + f"dur={self.duration_s:.2f}s)" + ) + + +# =========================================================================== +# Phase 1 — Reference-trailer beat +# =========================================================================== + +@dataclass(frozen=True) +class TrailerBeat: + """ + One cut / segment in the reference trailer. + + The 'beat' is the atomic unit of a trailer: it maps exactly to one + clip that will later be sourced from the original movie. + """ + beat_id: int + trailer_path: Path + start_s: float + end_s: float + start_frame: int + end_frame: int + + beat_type: BeatType = BeatType.UNKNOWN # set by LLM dramaturgy pass + + # Visual fingerprints of the *middle* frame (populated by CV pipeline) + luma_hist: Optional[bytes] = None + sat_hist: Optional[bytes] = None + phash: Optional[str] = None + + # Dialogue extracted from this beat + dialogue: tuple[DialogueLine, ...] 
= field(default_factory=tuple) + + @property + def duration_s(self) -> float: + return self.end_s - self.start_s + + @property + def midpoint_s(self) -> float: + return self.start_s + self.duration_s / 2.0 + + def __repr__(self) -> str: + return ( + f"TrailerBeat(id={self.beat_id}, " + f"{self.beat_type.name}, " + f"{self.start_s:.2f}s–{self.end_s:.2f}s)" + ) + + +# =========================================================================== +# Phase 2 — CV match result +# =========================================================================== + +@dataclass(frozen=True) +class VibeHit: + """ + Intermediate result from Phase 1 (Vibe Check — histogram/pHash). + + Represents a *candidate* scene that passed the coarse filter. + Not yet a confirmed match; forwarded to Deep Scan. + """ + beat_id: int + scene_id: int + hist_score: float # histogram similarity [0.0, 1.0] (CORREL method) + phash_distance: int # Hamming distance [0, 64]; lower = more similar + combined_score: float # weighted aggregate used for ranking + + +@dataclass(frozen=True) +class MatchSegment: + """ + One source-backed visual island inside a trailer beat. + + Some trailer beats contain multiple shots separated by fades/title frames. + A single continuous source in/out cannot represent those beats accurately. + """ + trailer_offset_s: float + duration_s: float + scene_id: int + in_point_s: float + out_point_s: float + match_score: float + is_confirmed: bool = True + + +@dataclass(frozen=True) +class MatchResult: + """ + Final, confirmed match from Phase 2 (Deep Scan — template matching). + + One MatchResult per TrailerBeat: the best frame-accurate hit found + inside the source movie. 
+ """ + beat_id: int # which trailer beat was matched + scene_id: int # which source scene contains the match + source_path: Path # absolute path to source video + + # Frame-accurate in-point / out-point in the SOURCE movie + in_point_s: float # matched frame onset in source seconds + out_point_s: float # computed out-point (in_point + beat duration) + in_point_frame: int # matched frame number in source movie + + # Match quality + match_score: float # cv2.matchTemplate peak value [0.0, 1.0] + match_location: tuple[int, int] = field(default_factory=lambda: (0, 0)) + # (x, y) pixel location of the best match within the source frame + + # Provenance + vibe_hit: Optional[VibeHit] = None # the candidate that led here + is_confirmed: bool = True + segments: tuple[MatchSegment, ...] = field(default_factory=tuple) + + @property + def duration_s(self) -> float: + return self.out_point_s - self.in_point_s + + def __repr__(self) -> str: + return ( + f"MatchResult(beat={self.beat_id} → scene={self.scene_id}, " + f"in={self.in_point_s:.3f}s, score={self.match_score:.3f})" + ) + + +# =========================================================================== +# Phase 3 — Edit timeline (pre-export) +# =========================================================================== + +@dataclass(frozen=True) +class EditClip: + """ + One clip on the final edit timeline, ready for FCPXML / EDL export. + + Combines beat dramaturgy + the CV-confirmed source in/out points. + """ + clip_index: int # position on the timeline (0-based) + beat: TrailerBeat + match: MatchResult + + # Timeline position (in the OUTPUT trailer) + timeline_start_s: float + timeline_end_s: float + source_duration_s: float | None = None + trailer_tail_s: float = 0.0 + + # Optional audio override (e.g. 
VO or music) + audio_path: Optional[Path] = None + audio_offset_s: float = 0.0 + + @property + def timeline_duration_s(self) -> float: + return self.timeline_end_s - self.timeline_start_s + + @property + def source_timeline_duration_s(self) -> float: + if self.source_duration_s is not None: + return max(0.0, self.source_duration_s) + return self.timeline_duration_s + + def __repr__(self) -> str: + return ( + f"EditClip(#{self.clip_index}, " + f"tl={self.timeline_start_s:.2f}s–{self.timeline_end_s:.2f}s, " + f"src={self.match.in_point_s:.3f}s)" + ) + + +@dataclass(frozen=True) +class EditTimeline: + """ + The complete ordered sequence of EditClips that forms the trailer. + + Passed to the export layer (FCPXML / EDL writer). + """ + title: str + frame_rate: float # e.g. 23.976 + clips: tuple[EditClip, ...] # ordered by clip_index + + @property + def total_duration_s(self) -> float: + if not self.clips: + return 0.0 + last = max(self.clips, key=lambda c: c.timeline_end_s) + return last.timeline_end_s + + @property + def clip_count(self) -> int: + return len(self.clips) diff --git a/src/cv/__init__.py b/src/cv/__init__.py new file mode 100644 index 0000000..4d40340 --- /dev/null +++ b/src/cv/__init__.py @@ -0,0 +1 @@ +# src.cv package — Computer Vision engine diff --git a/src/cv/content_align.py b/src/cv/content_align.py new file mode 100644 index 0000000..f500a95 --- /dev/null +++ b/src/cv/content_align.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +import math +import shutil +import subprocess +from pathlib import Path + +import numpy as np +from PIL import Image, ImageFilter, ImageOps + +from src.core.config import AppConfig +from src.core.models import TrailerBeat + + +def _run(cmd: list[str]) -> None: + result = subprocess.run(cmd, capture_output=True) + if result.returncode != 0: + raise RuntimeError(result.stderr.decode(errors="replace")) + + +def _extract_frames( + video_path: Path, + start_s: float, + duration_s: float, + fps: float, + out_dir: 
Path, + prefix: str, +) -> None: + out_dir.mkdir(parents=True, exist_ok=True) + _run([ + "ffmpeg", "-y", "-loglevel", "error", + "-ss", str(max(0.0, start_s)), + "-i", str(video_path), + "-t", str(max(0.04, duration_s)), + "-vf", f"scale=640:360,fps={fps}", + str(out_dir / f"{prefix}_%04d.png"), + ]) + + +def _cropped_image(path: Path, cfg: AppConfig) -> Image.Image: + image = Image.open(path).convert("L") + image = _trim_dark_borders(image) + w, h = image.size + # Final validation should see the composition. The broader text-safe crop + # used for coarse search can remove bodies, furniture and lower-frame + # spatial cues that distinguish otherwise similar face/window shots. + top = int(h * 0.05) + bottom = int(h * 0.95) + return image.crop((0, top, w, bottom)) + + +def _trim_dark_borders(image: Image.Image) -> Image.Image: + """Remove encoded black matte/pillarbox borders before content scoring.""" + gray = image.convert("L") + arr = np.asarray(gray, dtype=np.float32) + if arr.size == 0: + return image + h, w = arr.shape[:2] + col_signal = np.percentile(arr, 90, axis=0) + row_signal = np.percentile(arr, 90, axis=1) + active_cols = np.where(col_signal > 18.0)[0] + active_rows = np.where(row_signal > 18.0)[0] + if active_cols.size >= max(8, int(w * 0.35)): + x0 = max(0, int(active_cols[0]) - 2) + x1 = min(w, int(active_cols[-1]) + 3) + else: + x0, x1 = 0, w + if active_rows.size >= max(8, int(h * 0.35)): + y0 = max(0, int(active_rows[0]) - 2) + y1 = min(h, int(active_rows[-1]) + 3) + else: + y0, y1 = 0, h + if x1 - x0 < int(w * 0.35) or y1 - y0 < int(h * 0.35): + return image + return image.crop((x0, y0, x1, y1)) + + +def _feature(path: Path, cfg: AppConfig) -> np.ndarray: + image = _cropped_image(path, cfg) + w, h = image.size + image = image.crop((int(w * 0.10), int(h * 0.10), int(w * 0.90), int(h * 0.90))) + image = ImageOps.equalize(image).filter(ImageFilter.FIND_EDGES).resize((160, 62)) + arr = np.asarray(image, dtype=np.float32) + return (arr - arr.mean()) / 
(arr.std() + 1e-6) + + +def _luma_feature(path: Path, cfg: AppConfig) -> np.ndarray: + image = ImageOps.equalize(_cropped_image(path, cfg)).resize((160, 80)) + arr = np.asarray(image, dtype=np.float32) + return (arr - arr.mean()) / (arr.std() + 1e-6) + + +def _hist_feature(path: Path, cfg: AppConfig) -> np.ndarray: + image = _trim_dark_borders(Image.open(path).convert("RGB")) + w, h = image.size + top = int(h * 0.05) + bottom = int(h * 0.95) + arr = np.asarray(image.crop((0, top, w, bottom)).resize((160, 80)), dtype=np.float32) + hist_parts = [] + for channel in range(3): + hist, _ = np.histogram(arr[:, :, channel], bins=32, range=(0, 255)) + hist = hist.astype(np.float32) + hist_parts.append(hist / (hist.sum() + 1e-6)) + return np.concatenate(hist_parts) + + +def _spatial_hist_feature(path: Path, cfg: AppConfig) -> np.ndarray: + image = _trim_dark_borders(Image.open(path).convert("RGB")) + w, h = image.size + top = int(h * 0.05) + bottom = int(h * 0.95) + arr = np.asarray(image.crop((0, top, w, bottom)).resize((160, 80)), dtype=np.float32) + cells = [] + grid_y = 4 + grid_x = 4 + cell_h = arr.shape[0] // grid_y + cell_w = arr.shape[1] // grid_x + for gy in range(grid_y): + for gx in range(grid_x): + cell = arr[gy * cell_h:(gy + 1) * cell_h, gx * cell_w:(gx + 1) * cell_w, :] + for channel in range(3): + hist, _ = np.histogram(cell[:, :, channel], bins=16, range=(0, 255)) + hist = hist.astype(np.float32) + cells.append(hist / (hist.sum() + 1e-6)) + return np.concatenate(cells) + + +def _is_dark(path: Path, cfg: AppConfig) -> bool: + image = _trim_dark_borders(Image.open(path).convert("L")) + w, h = image.size + top = int(h * 0.05) + bottom = int(h * 0.95) + arr = np.asarray(image.crop((0, top, w, bottom)), dtype=np.float32) + return float(arr.mean()) < 28.0 and float(np.percentile(arr, 90)) < 58.0 + + +def _corr(a: np.ndarray, b: np.ndarray) -> float: + return float((a * b).mean()) + + +def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float: + return 
def _frame_features(
    path: Path,
    cfg: AppConfig,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Compute the four content descriptors of one extracted frame.

    Returns:
        (edge, luma, histogram, spatial_histogram) as produced by
        _feature, _luma_feature, _hist_feature and _spatial_hist_feature.
    """
    return (
        _feature(path, cfg),
        _luma_feature(path, cfg),
        _hist_feature(path, cfg),
        _spatial_hist_feature(path, cfg),
    )


def _score_feature_sets(
    ref: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
    src: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
) -> float:
    """Blend the per-descriptor similarities of two frames into one score."""
    ref_edge, ref_luma, ref_hist, ref_spatial = ref
    src_edge, src_luma, src_hist, src_spatial = src
    edge_score = _corr(ref_edge, src_edge)
    luma_score = _corr(ref_luma, src_luma)
    hist_score = _hist_intersection(ref_hist, src_hist)
    spatial_score = _hist_intersection(ref_spatial, src_spatial)
    return (
        edge_score * 0.24
        + luma_score * 0.24
        + hist_score * 0.14
        + spatial_score * 0.38
    )


def _paired_frame_score(ref_path: Path, src_path: Path, cfg: AppConfig) -> float:
    """Score how well one source frame matches one reference frame.

    Both frames are loaded from disk and described on every call; callers that
    compare the same frames repeatedly should cache _frame_features() instead.
    """
    return _score_feature_sets(
        _frame_features(ref_path, cfg),
        _frame_features(src_path, cfg),
    )


def align_cached_match_by_content(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
    fps: float = 25.0,
) -> tuple[float, float]:
    """
    Measure the local source offset directly from rendered frame content.

    This is intentionally independent from the global OpenCV matcher: it only
    needs FFmpeg, Pillow and numpy, and it scans a small window around an
    already plausible candidate.

    Args:
        beat: Trailer beat whose frames act as the reference sequence.
        estimated_in_point_s: Candidate in-point in the source movie (seconds).
        cfg: Application configuration (paths and deep-scan settings).
        search_window_s: Half-width of the search window in seconds; falls back
            to cfg.cv.deep_scan.content_align_window_seconds when None.
        fps: Frame rate used for frame extraction and index/time conversion.

    Returns:
        (in_point_s, score). The score is clamped to >= 0.0. When either frame
        extraction produced no frames, (estimated_in_point_s, 0.0) is returned.

    Raises:
        ValueError: If fps is not positive.
    """
    if fps <= 0:
        raise ValueError(f"fps must be > 0; got {fps}")

    window_s = (
        search_window_s
        if search_window_s is not None
        else cfg.cv.deep_scan.content_align_window_seconds
    )
    sample_step_s = max(1.0 / fps, cfg.cv.deep_scan.content_align_sample_step_s)
    source_start_s = max(0.0, estimated_in_point_s - window_s)
    source_duration_s = beat.duration_s + (2.0 * window_s) + 0.5

    tmp = cfg.paths.output_dir / "align_tmp" / f"beat_{beat.beat_id:03d}"
    shutil.rmtree(tmp, ignore_errors=True)
    tmp.mkdir(parents=True, exist_ok=True)
    try:
        ref_dir = tmp / "ref"
        src_dir = tmp / "src"
        _extract_frames(beat.trailer_path, beat.start_s, beat.duration_s, fps, ref_dir, "ref")
        _extract_frames(cfg.paths.source_movie, source_start_s, source_duration_s, fps, src_dir, "src")

        ref_frames = sorted(ref_dir.glob("ref_*.png"))
        src_frames = sorted(src_dir.glob("src_*.png"))
        if not ref_frames or not src_frames:
            return estimated_in_point_s, 0.0

        sample_frame_step = max(1, int(round(sample_step_s * fps)))
        # The last ~0.24 s of the beat is excluded from matching.
        min_matchable_frames = max(1, len(ref_frames) - int(round(0.24 * fps)))
        sampled_offsets = list(range(0, min_matchable_frames, sample_frame_step))

        # Prefer non-dark reference frames; fall back to every sampled frame
        # when fewer than three usable ones remain.
        template_offsets = [
            idx for idx in sampled_offsets if not _is_dark(ref_frames[idx], cfg)
        ]
        if len(template_offsets) < 3:
            template_offsets = sampled_offsets

        # NOTE: at least three scored samples are always required, so a beat
        # that yields fewer than three sample offsets never updates the estimate.
        required_samples = max(3, math.ceil(len(template_offsets) * 0.65))

        search_end_frame = max(0, len(src_frames) - min_matchable_frames)
        estimated_frame = int(round((estimated_in_point_s - source_start_s) * fps))
        best_frame = estimated_frame
        best_score = -1.0

        # Each frame is decoded and described once, instead of once per
        # (candidate, template) pair.
        ref_cache: dict[int, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]] = {}
        src_cache: dict[int, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]] = {}

        for candidate_frame in range(search_end_frame + 1):
            # Offsets ascend, so the in-range ones form a prefix.
            usable_offsets: list[int] = []
            for offset_frame in template_offsets:
                if candidate_frame + offset_frame >= len(src_frames):
                    break
                usable_offsets.append(offset_frame)
            if len(usable_offsets) < required_samples:
                continue  # decided before any image work is done

            scores: list[float] = []
            for offset_frame in usable_offsets:
                src_idx = candidate_frame + offset_frame
                ref_set = ref_cache.get(offset_frame)
                if ref_set is None:
                    ref_set = _frame_features(ref_frames[offset_frame], cfg)
                    ref_cache[offset_frame] = ref_set
                src_set = src_cache.get(src_idx)
                if src_set is None:
                    src_set = _frame_features(src_frames[src_idx], cfg)
                    src_cache[src_idx] = src_set
                scores.append(_score_feature_sets(ref_set, src_set))

            avg_score = sum(scores) / len(scores)
            min_score = min(scores)
            # The worst sample is weighted in so one bad frame drags the score down.
            score = (avg_score * 0.68) + (min_score * 0.32)
            if score > best_score + 0.003:
                best_score = score
                best_frame = candidate_frame
            elif (
                score >= best_score - 0.003
                and abs(candidate_frame - estimated_frame) < abs(best_frame - estimated_frame)
            ):
                # Near-tie: prefer the candidate closer to the incoming estimate.
                best_frame = candidate_frame

        return source_start_s + (best_frame / fps), max(0.0, best_score)
    finally:
        shutil.rmtree(tmp, ignore_errors=True)
+""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Sequence + +import cv2 +import numpy as np + +from src.core.config import AppConfig +from src.core.models import MatchResult, Scene, TrailerBeat, VibeHit +from src.cv.fingerprinting import text_safe_crop +from src.cv.frame_extractor import ( + grab_frame_at, + grab_frame_at_path, + iter_frames_stepped, + open_video, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Template preparation +# --------------------------------------------------------------------------- + +def _prepare_template( + trailer_beat: TrailerBeat, + cfg: AppConfig, + proxy_w: int, + proxy_h: int, +) -> np.ndarray | None: + """ + Extract, crop, and resize the representative frame from the trailer beat. + + This frame becomes the cv2.matchTemplate "needle". + """ + vc = cfg.cv.vibe_check + ds = cfg.cv.deep_scan + + beat_frame = grab_frame_at_path( + trailer_beat.trailer_path, + trailer_beat.midpoint_s, + ) + if beat_frame is None: + logger.warning("Beat %d: cannot decode midpoint frame.", trailer_beat.beat_id) + return None + + cropped = text_safe_crop(beat_frame, vc.crop_top_fraction, vc.crop_bottom_fraction) + resized = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA) + + # Crop the template by 10% on all sides to allow sliding window (translation invariance) + # when matching against the source movie, which might have slight pan/scan shifts. 
+ margin_y = int(proxy_h * 0.10) + margin_x = int(proxy_w * 0.10) + template = resized[margin_y : proxy_h - margin_y, margin_x : proxy_w - margin_x] + + return template + + +# --------------------------------------------------------------------------- +# Single-frame match +# --------------------------------------------------------------------------- + +def _match_frame( + source_frame: np.ndarray, + template: np.ndarray, + method: int, + proxy_w: int, + proxy_h: int, + crop_top: float, + crop_bottom: float, +) -> tuple[float, tuple[int, int]]: + """ + Run cv2.matchTemplate between *source_frame* and *template*. + + Returns: + (score, (x, y)) where score ∈ [0, 1] for CCOEFF_NORMED. + """ + cropped = text_safe_crop(source_frame, crop_top, crop_bottom) + haystack = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA) + + # Match the slightly smaller template inside the full proxy frame + result = cv2.matchTemplate(haystack, template, method) + _, max_val, _, max_loc = cv2.minMaxLoc(result) + return float(max_val), (int(max_loc[0]), int(max_loc[1])) + + +# --------------------------------------------------------------------------- +# Deep Scan core +# --------------------------------------------------------------------------- + +def scan_scene( + beat: TrailerBeat, + scene: Scene, + template: np.ndarray, + cfg: AppConfig, +) -> tuple[float, float, tuple[int, int]] | None: + """ + Scan one source scene in two passes (coarse → refine). + + Returns: + (best_timestamp_s, best_score, best_location) or None if no hit. 
def run_deep_scan(
    beat: TrailerBeat,
    candidates: Sequence[VibeHit],
    scenes_by_id: dict[int, Scene],
    cfg: AppConfig,
) -> MatchResult | None:
    """
    Phase 2 Deep Scan: iterate over Vibe Check candidates and template-match.

    Args:
        beat:         The trailer beat to source.
        candidates:   Ranked VibeHit list from Phase 1 (best first).
        scenes_by_id: Lookup dict: scene_id → Scene.
        cfg:          Application configuration.

    Returns:
        The best MatchResult above threshold, or None if no match found.
    """
    # Imported once per call rather than once per candidate hit.
    from src.cv.frame_extractor import get_video_info

    proxy_w = cfg.video.proxy_width
    proxy_h = cfg.video.proxy_height

    template = _prepare_template(beat, cfg, proxy_w, proxy_h)
    if template is None:
        return None

    best_result: MatchResult | None = None
    # get_video_info() opens the file, so look each source up only once.
    fps_by_source: dict[str, float] = {}

    for vibe_hit in candidates:
        scene = scenes_by_id.get(vibe_hit.scene_id)
        if scene is None:
            logger.warning("VibeHit references unknown scene_id=%d", vibe_hit.scene_id)
            continue

        hit = scan_scene(beat, scene, template, cfg)
        if hit is None:
            continue

        in_point_s, match_score, match_loc = hit

        # Frame number: approximate via FPS (refined later if needed)
        source_key = str(scene.source_path)
        fps = fps_by_source.get(source_key)
        if fps is None:
            info = get_video_info(scene.source_path)
            fps = float(info["fps"]) or 24.0  # 24.0 when the container reports 0
            fps_by_source[source_key] = fps
        in_point_frame = int(in_point_s * fps)

        candidate_result = MatchResult(
            beat_id=beat.beat_id,
            scene_id=scene.scene_id,
            source_path=scene.source_path,
            in_point_s=in_point_s,
            out_point_s=in_point_s + beat.duration_s,
            in_point_frame=in_point_frame,
            match_score=match_score,
            match_location=match_loc,
            vibe_hit=vibe_hit,
        )

        if best_result is None or match_score > best_result.match_score:
            best_result = candidate_result

        # Early exit: if score is very high, no need to check other candidates
        if match_score >= 0.90:
            logger.info(
                "Beat %d: early-exit match (score=%.3f) in scene %d @%.3fs",
                beat.beat_id, match_score, scene.scene_id, in_point_s,
            )
            break

    if best_result is not None:
        logger.info("Beat %d → MATCH scene=%d score=%.3f in=%.3fs",
                    beat.beat_id, best_result.scene_id,
                    best_result.match_score, best_result.in_point_s)
    else:
        logger.warning("Beat %d → NO MATCH found in %d candidates.",
                       beat.beat_id, len(candidates))

    return best_result
def text_safe_crop(
    frame: np.ndarray,
    crop_top: float,
    crop_bottom: float,
) -> np.ndarray:
    """
    Remove the top and bottom fractions of a frame.

    This eliminates title cards, logos (top) and letterbox / subtitles
    (bottom) before any colour analysis, preventing false positives.

    Args:
        frame:       BGR or greyscale frame as (H, W[, C]) ndarray.
        crop_top:    Fraction [0, 1) of height to remove from the top.
        crop_bottom: Fraction [0, 1) of height to remove from the bottom.

    Returns:
        Cropped view (no copy — avoids memory overhead). For a frame with at
        least one row, the result always keeps at least one row.

    Raises:
        ValueError: If crop fractions are out of range or overlap.
    """
    if not (0.0 <= crop_top < 1.0):
        raise ValueError(f"crop_top must be in [0, 1); got {crop_top}")
    if not (0.0 <= crop_bottom < 1.0):
        raise ValueError(f"crop_bottom must be in [0, 1); got {crop_bottom}")
    if crop_top + crop_bottom >= 1.0:
        raise ValueError(
            f"crop_top ({crop_top}) + crop_bottom ({crop_bottom}) must be < 1.0"
        )

    h = frame.shape[0]
    # Products such as 100 * 0.29 or 10 * (1.0 - 0.9) land a hair below the
    # intended integer in binary floating point; the epsilon keeps int() from
    # truncating them one row short.
    eps = 1e-9
    y_start = int(h * crop_top + eps)
    y_end = min(h, int(h * (1.0 - crop_bottom) + eps))
    if y_end <= y_start:
        # Very small frames can floor to zero rows; keep one row so that
        # downstream resizing never receives an empty image.
        y_end = min(h, y_start + 1)
    return frame[y_start:y_end]
+ luma_hist = cv2.calcHist( + [luma], [0], None, [bins_luma], [0, 256] + ).flatten().astype(np.float32) + + sat_hist = cv2.calcHist( + [hsv], [1], None, [bins_sat], [0, 256] + ).flatten().astype(np.float32) + + # L2-normalise so scene size doesn't affect scores + cv2.normalize(luma_hist, luma_hist, alpha=1.0, norm_type=cv2.NORM_L2) + cv2.normalize(sat_hist, sat_hist, alpha=1.0, norm_type=cv2.NORM_L2) + + return luma_hist, sat_hist + + +def compare_histograms( + hist_a: np.ndarray, + hist_b: np.ndarray, + method: int, +) -> float: + """ + Compare two histograms using cv2.compareHist. + + Args: + hist_a, hist_b: 1-D float32 ndarrays of identical shape. + method: cv2.HISTCMP_* constant (e.g. cv2.HISTCMP_CORREL = 0). + + Returns: + Raw score from cv2.compareHist (range depends on method). + For CORREL: [-1, 1], higher = more similar. + For BHATTACHARYYA: [0, 1], lower = more similar. + """ + return float(cv2.compareHist(hist_a, hist_b, method)) + + +# --------------------------------------------------------------------------- +# Perceptual Hash +# --------------------------------------------------------------------------- + +def compute_phash(frame_bgr: np.ndarray, hash_size: int = 8) -> str: + """ + Compute a perceptual hash (pHash) of a BGR frame. + + pHash is rotation- and scale-invariant; it catches visual similarity + even when resolution differs between trailer proxy and source movie. + + Args: + frame_bgr: BGR frame (H, W, 3) uint8. + hash_size: DCT block size; 8 → 64-bit hash (default). + + Returns: + Hex string representation of the 64-bit hash (e.g. "f8e0e0e0..."). + + Raises: + RuntimeError: If imagehash is not installed. + """ + if not _HAS_IMAGEHASH: + raise RuntimeError( + "imagehash is not installed. 
def hist_to_bytes(hist: np.ndarray) -> bytes:
    """Serialise a numpy histogram array for storage in a Scene/Beat model."""
    return pickle.dumps(hist, protocol=pickle.HIGHEST_PROTOCOL)


def bytes_to_hist(data: bytes) -> np.ndarray:
    """
    Deserialise a numpy histogram array from bytes.

    Args:
        data: Bytes previously produced by hist_to_bytes().

    Returns:
        The stored histogram array.

    Raises:
        TypeError: If the payload does not decode to a numpy array.
    """
    # SECURITY: pickle can execute arbitrary code while loading. Only feed this
    # function bytes written by hist_to_bytes() into the project's own cache;
    # never data received from an untrusted source.
    hist = pickle.loads(data)  # noqa: S301 (trusted internal cache only)
    if not isinstance(hist, np.ndarray):
        # A stale or foreign cache entry should fail here, not deep inside
        # histogram comparison.
        raise TypeError(
            f"cached histogram must be a numpy.ndarray; got {type(hist).__name__}"
        )
    return hist
+ + Returns: + (luma_hist_bytes, sat_hist_bytes, phash_hex) + """ + cropped = text_safe_crop(frame_bgr, cfg.crop_top_fraction, cfg.crop_bottom_fraction) + luma_hist, sat_hist = extract_hs_histograms(cropped, cfg.hist_bins_hue, cfg.hist_bins_saturation) + phash_hex = compute_phash(cropped) + + return hist_to_bytes(luma_hist), hist_to_bytes(sat_hist), phash_hex diff --git a/src/cv/frame_extractor.py b/src/cv/frame_extractor.py new file mode 100644 index 0000000..5cedd19 --- /dev/null +++ b/src/cv/frame_extractor.py @@ -0,0 +1,172 @@ +""" +src/cv/frame_extractor.py — Low-level video frame access + +Responsibility: + Provide a thin, testable wrapper around cv2.VideoCapture for: + - seeking to an exact timestamp and returning one BGR frame + - iterating frames with a configurable step size + - extracting the "representative" middle frame of a Scene / TrailerBeat + +No fingerprinting, no matching — only raw frame delivery. +""" + +from __future__ import annotations + +import logging +from contextlib import contextmanager +from pathlib import Path +from typing import Generator, Iterator + +import cv2 +import numpy as np + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Context-managed VideoCapture +# --------------------------------------------------------------------------- + +@contextmanager +def open_video(path: Path) -> Generator[cv2.VideoCapture, None, None]: + """ + Context manager that opens a VideoCapture and guarantees release. + + Args: + path: Absolute path to the video file. + + Raises: + FileNotFoundError: If the file does not exist. + RuntimeError: If OpenCV cannot open the file. 
+ """ + if not path.exists(): + raise FileNotFoundError(f"Video not found: {path}") + + cap = cv2.VideoCapture(str(path)) + if not cap.isOpened(): + raise RuntimeError(f"OpenCV could not open video: {path}") + + try: + yield cap + finally: + cap.release() + + +# --------------------------------------------------------------------------- +# Video metadata +# --------------------------------------------------------------------------- + +def get_video_info(path: Path) -> dict[str, float | int]: + """ + Return basic metadata without keeping the file open. + + Returns: + dict with keys: fps, frame_count, duration_s, width, height + """ + with open_video(path) as cap: + fps = cap.get(cv2.CAP_PROP_FPS) + frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + duration_s = frame_count / fps if fps > 0 else 0.0 + return { + "fps": fps, + "frame_count": frame_count, + "duration_s": duration_s, + "width": width, + "height": height, + } + + +# --------------------------------------------------------------------------- +# Single frame extraction +# --------------------------------------------------------------------------- + +def grab_frame_at(cap: cv2.VideoCapture, timestamp_s: float) -> np.ndarray | None: + """ + Seek to *timestamp_s* and return the BGR frame at that position. + + Uses CAP_PROP_POS_MSEC for sub-frame accuracy. + + Args: + cap: An already-open VideoCapture. + timestamp_s: Target time in seconds. + + Returns: + BGR ndarray (H, W, 3) or None if seeking / decoding failed. + """ + cap.set(cv2.CAP_PROP_POS_MSEC, timestamp_s * 1000.0) + ok, frame = cap.read() + if not ok or frame is None: + logger.debug("grab_frame_at: failed at %.3fs", timestamp_s) + return None + return frame + + +def grab_frame_at_path(path: Path, timestamp_s: float) -> np.ndarray | None: + """ + One-shot convenience: open → seek → grab → release. 
def iter_frames_stepped(
    cap: cv2.VideoCapture,
    start_s: float,
    end_s: float,
    step_s: float,
) -> Iterator[tuple[float, np.ndarray]]:
    """
    Yield (timestamp_s, frame) for every *step_s* increment in [start_s, end_s].

    Frames that fail to decode are silently skipped.

    Args:
        cap:     Open VideoCapture.
        start_s: Scan window start in seconds.
        end_s:   Scan window end in seconds.
        step_s:  Step between samples in seconds.

    Yields:
        (timestamp_s, bgr_frame)

    Raises:
        ValueError: If step_s is not positive (raised on first iteration,
            because this is a generator).
    """
    if step_s <= 0:
        raise ValueError(f"step_s must be > 0; got {step_s}")

    # Each timestamp is derived from an integer step index rather than from the
    # previous (already rounded) timestamp. This keeps rounding error from
    # accumulating over long scans and guarantees progress even when step_s is
    # smaller than the 6-decimal rounding resolution.
    step_index = 0
    t = start_s
    while t <= end_s:
        frame = grab_frame_at(cap, t)
        if frame is not None:
            yield t, frame
        step_index += 1
        t = round(start_s + step_index * step_s, 6)
feature[margin_y:h-margin_y, margin_x:w-margin_x] + + +def _feature_image(frame: np.ndarray) -> np.ndarray: + """ + Convert frames to a look-tolerant matching feature. + + Trailer shots may be desaturated, contrast-shifted, or contain a different + grade than the source movie. Matching luma plus edges is more stable than + raw BGR pixels and rejects unrelated scenes with similar colors. + """ + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + gray = cv2.equalizeHist(gray) + edges = cv2.Canny(gray, 60, 140) + return cv2.addWeighted(gray, 0.70, edges, 0.30, 0) + + +def _match_score(frame: np.ndarray, template: np.ndarray, cfg: AppConfig) -> float: + haystack = _prepare_haystack(frame, cfg) + res = cv2.matchTemplate(haystack, template, cv2.TM_CCOEFF_NORMED) + _, max_val, _, _ = cv2.minMaxLoc(res) + return float(max_val) + + +def _fixed_position_score(frame: np.ndarray, template: np.ndarray, cfg: AppConfig) -> float: + fixed = _center_crop_feature(_prepare_haystack(frame, cfg), cfg) + if fixed.shape != template.shape: + fixed = cv2.resize(fixed, (template.shape[1], template.shape[0]), interpolation=cv2.INTER_AREA) + res = cv2.matchTemplate(fixed, template, cv2.TM_CCOEFF_NORMED) + _, max_val, _, _ = cv2.minMaxLoc(res) + return float(max_val) + + +def _fixed_feature(frame: np.ndarray, template_shape: tuple[int, ...], cfg: AppConfig) -> np.ndarray: + fixed = _center_crop_feature(_prepare_haystack(frame, cfg), cfg) + if fixed.shape != template_shape: + fixed = cv2.resize(fixed, (template_shape[1], template_shape[0]), interpolation=cv2.INTER_AREA) + return fixed + + +def _corr_same_size(a: np.ndarray, b: np.ndarray) -> float: + if a.shape != b.shape: + b = cv2.resize(b, (a.shape[1], a.shape[0]), interpolation=cv2.INTER_AREA) + res = cv2.matchTemplate(a, b, cv2.TM_CCOEFF_NORMED) + _, max_val, _, _ = cv2.minMaxLoc(res) + if np.isnan(max_val): + return 0.0 + return float(max_val) + + +def _validation_crop(frame: np.ndarray) -> np.ndarray: + frame = _trim_dark_borders(frame) + 
h = frame.shape[0] + return frame[int(h * 0.05):int(h * 0.95), :] + + +def _trim_dark_borders(frame: np.ndarray) -> np.ndarray: + """ + Remove encoded black matte/pillarbox borders before fixed-position checks. + + The reference trailer can contain vertical black bars while the source movie + does not. Whole-frame spatial validation should compare picture content, not + container matte. + """ + if frame.size == 0: + return frame + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + h, w = gray.shape[:2] + col_signal = np.percentile(gray, 90, axis=0) + row_signal = np.percentile(gray, 90, axis=1) + active_cols = np.where(col_signal > 18.0)[0] + active_rows = np.where(row_signal > 18.0)[0] + if active_cols.size >= max(8, int(w * 0.35)): + x0 = max(0, int(active_cols[0]) - 2) + x1 = min(w, int(active_cols[-1]) + 3) + else: + x0, x1 = 0, w + if active_rows.size >= max(8, int(h * 0.35)): + y0 = max(0, int(active_rows[0]) - 2) + y1 = min(h, int(active_rows[-1]) + 3) + else: + y0, y1 = 0, h + if x1 - x0 < int(w * 0.35) or y1 - y0 < int(h * 0.35): + return frame + return frame[y0:y1, x0:x1] + + +def _fixed_luma_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: + cropped = _validation_crop(frame) + gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) + gray = cv2.equalizeHist(gray) + resized = cv2.resize(gray, (160, 80), interpolation=cv2.INTER_AREA).astype(np.float32) + return (resized - float(np.mean(resized))) / (float(np.std(resized)) + 1e-6) + + +def _fixed_edge_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: + cropped = _validation_crop(frame) + gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) + gray = cv2.equalizeHist(gray) + edges = cv2.Canny(gray, 60, 140) + resized = cv2.resize(edges, (160, 80), interpolation=cv2.INTER_AREA).astype(np.float32) + return (resized - float(np.mean(resized))) / (float(np.std(resized)) + 1e-6) + + +def _fixed_hist_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: + cropped = _validation_crop(frame) + resized = 
cv2.resize(cropped, (160, 80), interpolation=cv2.INTER_AREA) + chans = cv2.split(resized) + parts = [] + for channel in chans: + hist = cv2.calcHist([channel], [0], None, [32], [0, 256]).astype(np.float32).flatten() + parts.append(hist / (float(np.sum(hist)) + 1e-6)) + return np.concatenate(parts) + + +def _fixed_spatial_hist_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: + cropped = _validation_crop(frame) + resized = cv2.resize(cropped, (160, 80), interpolation=cv2.INTER_AREA) + grid_y = 4 + grid_x = 4 + cell_h = resized.shape[0] // grid_y + cell_w = resized.shape[1] // grid_x + parts = [] + for gy in range(grid_y): + for gx in range(grid_x): + cell = resized[gy * cell_h:(gy + 1) * cell_h, gx * cell_w:(gx + 1) * cell_w, :] + for channel in cv2.split(cell): + hist = cv2.calcHist([channel], [0], None, [16], [0, 256]).astype(np.float32).flatten() + parts.append(hist / (float(np.sum(hist)) + 1e-6)) + return np.concatenate(parts) + + +def _array_corr(a: np.ndarray, b: np.ndarray) -> float: + if a.shape != b.shape: + return 0.0 + return float(np.mean(a * b)) + + +def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float: + if a.shape != b.shape: + return 0.0 + return float(np.minimum(a, b).sum() / (np.maximum(a, b).sum() + 1e-6)) + + +def _fixed_content_features(frame: np.ndarray, cfg: AppConfig) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + return ( + _fixed_luma_feature(frame, cfg), + _fixed_edge_feature(frame, cfg), + _fixed_hist_feature(frame, cfg), + _fixed_spatial_hist_feature(frame, cfg), + ) + + +def _fixed_content_pair_score( + ref_features: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray], + source_frame: np.ndarray, + cfg: AppConfig, +) -> float: + src_luma, src_edge, src_hist, src_spatial = _fixed_content_features(source_frame, cfg) + ref_luma, ref_edge, ref_hist, ref_spatial = ref_features + luma_score = _array_corr(ref_luma, src_luma) + edge_score = _array_corr(ref_edge, src_edge) + hist_score = 
def _content_templates_at_offsets(
    cap: cv2.VideoCapture,
    beat: TrailerBeat,
    offsets_s: Sequence[float],
    cfg: AppConfig,
) -> list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]]:
    """Build (offset_s, content_features) pairs for the scoreable frames at *offsets_s*.

    Offsets are relative to beat.start_s. Frames that fail to decode or are
    rejected by _is_scoreable_reference_frame are left out.
    """
    templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]] = []
    for offset_s in offsets_s:
        frame = grab_frame_at(cap, beat.start_s + offset_s)
        if frame is not None and _is_scoreable_reference_frame(frame, cfg):
            templates.append((offset_s, _fixed_content_features(frame, cfg)))
    return templates


def _prepare_validation_templates(
    beat: TrailerBeat,
    cfg: AppConfig,
) -> list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]]:
    """Sample reference templates across the matchable part of a trailer beat.

    Frames are first sampled on a regular grid (step of at least 0.20 s). If
    that yields fewer than three scoreable frames, the offsets returned by
    _beat_offsets() are used instead.

    Returns:
        List of (offset_s, content_features), offsets relative to beat.start_s.
    """
    step_s = max(0.20, cfg.cv.deep_scan.content_align_sample_step_s * 1.5)
    matchable_s = estimate_matchable_reference_duration(beat, cfg, sample_step_s=step_s)

    grid_offsets: list[float] = []
    t = 0.0
    while t <= matchable_s:
        grid_offsets.append(t)
        t = round(t + step_s, 6)

    # One capture for all samples instead of re-opening the trailer per frame.
    with open_video(beat.trailer_path) as cap:
        templates = _content_templates_at_offsets(cap, beat, grid_offsets, cfg)
        if len(templates) >= 3:
            return templates
        return _content_templates_at_offsets(cap, beat, _beat_offsets(matchable_s), cfg)


def _prepare_rerank_templates(
    beat: TrailerBeat,
    cfg: AppConfig,
) -> list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]]:
    """Build the reference templates used to re-rank candidates.

    Returns:
        List of (offset_s, content_features) for the scoreable frames at the
        offsets given by _beat_offsets(); offsets are relative to beat.start_s.
    """
    matchable_s = estimate_matchable_reference_duration(beat, cfg)
    # One capture for all samples instead of re-opening the trailer per frame.
    with open_video(beat.trailer_path) as cap:
        return _content_templates_at_offsets(cap, beat, _beat_offsets(matchable_s), cfg)
tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]], + cfg: AppConfig, +) -> float: + if not templates: + return 0.0 + + scores: list[float] = [] + for offset_s, ref_features in templates: + frame = grab_frame_at(cap, in_point_s + offset_s) + if frame is None: + return 0.0 + scores.append(_fixed_content_pair_score(ref_features, frame, cfg)) + + if not scores: + return 0.0 + return float((sum(scores) / len(scores)) * 0.68 + min(scores) * 0.32) + + +def _reference_internal_cut_offsets(beat: TrailerBeat, cfg: AppConfig) -> list[float]: + """Detect hard visual cuts inside a single trailer beat.""" + cache_key = ( + str(beat.trailer_path), + round(float(beat.start_s), 3), + round(float(beat.end_s), 3), + round(float(cfg.vision.multi_shot_cut_corr_threshold), 3), + ) + cached = _REFERENCE_CUT_CACHE.get(cache_key) + if cached is not None: + return cached + + step_s = max(1.0 / cfg.export.edl_frame_rate, 0.08) + previous: np.ndarray | None = None + cuts: list[float] = [] + t = 0.0 + while t <= beat.duration_s: + frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t) + if frame is not None and _is_scoreable_reference_frame(frame, cfg): + feature = _prepare_haystack(frame, cfg) + if previous is not None: + corr = _corr_same_size(previous, feature) + if ( + corr < cfg.vision.multi_shot_cut_corr_threshold + and 0.18 < t < beat.duration_s - 0.18 + and (not cuts or t - cuts[-1] > 0.24) + ): + cuts.append(round(t, 3)) + previous = feature + t = round(t + step_s, 6) + if cuts: + logger.debug('Beat %d: detected internal trailer cuts at %s', beat.beat_id, cuts) + _REFERENCE_CUT_CACHE[cache_key] = cuts + return cuts + + +def _scene_fps_estimate(scene, cfg: AppConfig) -> float: + duration_s = max(0.0, float(scene.end_s) - float(scene.start_s)) + frame_count = max(0, int(scene.end_frame) - int(scene.start_frame)) + if duration_s <= 0.0 or frame_count <= 0: + return cfg.export.edl_frame_rate + return frame_count / duration_s + + +def _contiguous_scene_coverage_duration( + 
beat: TrailerBeat, + in_point_s: float, + scenes: Sequence | None, + matchable_duration_s: float, + cfg: AppConfig, +) -> float: + """ + Allow a source span to cross scene boundaries only when the trailer beat has + matching internal cuts at the same relative offsets. + """ + if not scenes or matchable_duration_s <= 0: + return 0.0 + + start_idx = None + for idx, scene in enumerate(scenes): + if float(scene.start_s) <= in_point_s < float(scene.end_s): + start_idx = idx + break + if start_idx is None: + return 0.0 + + cut_offsets = _reference_internal_cut_offsets(beat, cfg) + target_end = in_point_s + matchable_duration_s + current_end = in_point_s + for scene in scenes[start_idx:]: + scene_end = float(scene.end_s) + fps = _scene_fps_estimate(scene, cfg) + tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / fps) + if target_end <= scene_end: + return matchable_duration_s + + boundary_offset = scene_end - in_point_s + boundary_matches_ref_cut = any( + abs(boundary_offset - cut_offset) <= cfg.vision.multi_shot_boundary_tolerance_s + for cut_offset in cut_offsets + ) + if not boundary_matches_ref_cut: + return max(0.0, scene_end - in_point_s - tail_s) + + current_end = scene_end + + return max(0.0, current_end - in_point_s) + + +def _rerank_candidates_by_content( + beat: TrailerBeat, + candidates: list[tuple[float, float]], + cfg: AppConfig, + scenes: Sequence | None = None, + matchable_duration_s: float | None = None, +) -> list[tuple[float, float, float]]: + templates = _prepare_rerank_templates(beat, cfg) + if not templates: + return [(score, score, t_sec) for score, t_sec in candidates] + + reranked: list[tuple[float, float, float]] = [] + with open_video(cfg.paths.source_movie) as cap: + for coarse_score, t_sec in candidates: + content_score = _fixed_content_sequence_score(cap, t_sec, templates, cfg) + coverage_score = 1.0 + if scenes is not None and matchable_duration_s and matchable_duration_s > 0: + usable_s = _contiguous_scene_coverage_duration( + beat, + 
t_sec, + scenes, + matchable_duration_s, + cfg, + ) + coverage_score = min(1.0, usable_s / matchable_duration_s) + rank_score = ( + content_score * 0.62 + + coarse_score * 0.18 + + coverage_score * 0.20 + ) + reranked.append((rank_score, coarse_score, t_sec)) + + return sorted(reranked, key=lambda item: item[0], reverse=True) + + +def _dense_weighted_seed_candidates( + beat: TrailerBeat, + seed_candidates: list[tuple[float, float]], + cfg: AppConfig, + scenes: Sequence | None, + matchable_duration_s: float, +) -> list[tuple[float, float]]: + """Scan vision-selected source scenes densely with fixed-position content features.""" + if not scenes or not seed_candidates: + return [] + + weighted_floor = cfg.cv.deep_scan.coarse_candidate_threshold + 0.05 + seeded_scenes: dict[int, tuple[object, float]] = {} + for seed_score, seed_t in seed_candidates: + if seed_score <= weighted_floor: + continue + scene = _find_scene_for_time(scenes, seed_t, cfg) + if scene is None: + continue + previous = seeded_scenes.get(scene.scene_id) + if previous is None or seed_score > previous[1]: + seeded_scenes[scene.scene_id] = (scene, seed_score) + + if not seeded_scenes: + return [] + + templates = _prepare_rerank_templates(beat, cfg) + if not templates: + return [] + + cut_offsets = _reference_internal_cut_offsets(beat, cfg) + dense: list[tuple[float, float, float, float, int]] = [] + with open_video(cfg.paths.source_movie) as cap: + for scene, seed_score in seeded_scenes.values(): + fps = _source_fps_from_scene(scene) or cfg.export.edl_frame_rate + tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / fps) + start_s = max(0.0, float(scene.start_s)) + end_s = max(start_s, float(scene.end_s) - tail_s) + if end_s <= start_s: + continue + span_s = end_s - start_s + step_s = max(0.04, cfg.vision.local_scan_step_s) + max_points = max(2, cfg.vision.local_scan_max_points_per_scene) + point_count = int(span_s / step_s) + 1 + if point_count > max_points: + step_s = span_s / float(max_points - 1) + 
+ t_sec = start_s + while t_sec <= end_s + 0.001: + content_score = _fixed_content_sequence_score(cap, t_sec, templates, cfg) + usable_s = max(0.0, float(scene.end_s) - t_sec - tail_s) + coverage_score = ( + min(1.0, usable_s / matchable_duration_s) + if matchable_duration_s > 0 else 0.0 + ) + rank_score = ( + content_score * 0.50 + + coverage_score * 0.35 + + seed_score * 0.15 + ) + coarse_score = max( + weighted_floor, + min(0.99, seed_score * 0.80 + content_score * 0.20), + ) + dense.append((rank_score, coarse_score, t_sec, content_score, scene.scene_id)) + t_sec += step_s + + for cut_offset in cut_offsets: + shifted_t = max(0.0, float(scene.start_s) - cut_offset) + coverage_score = ( + min( + 1.0, + _contiguous_scene_coverage_duration( + beat, + shifted_t, + scenes, + matchable_duration_s, + cfg, + ) / matchable_duration_s, + ) + if matchable_duration_s > 0 else 0.0 + ) + if coverage_score < 0.80: + continue + content_score = _fixed_content_sequence_score(cap, shifted_t, templates, cfg) + rank_score = ( + content_score * 0.56 + + coverage_score * 0.34 + + seed_score * 0.10 + ) + coarse_score = max( + weighted_floor, + min(0.99, seed_score * 0.78 + content_score * 0.22), + ) + dense.append((rank_score, coarse_score, shifted_t, content_score, scene.scene_id)) + + dense.sort(key=lambda item: item[0], reverse=True) + top = dense[: max(0, cfg.vision.local_scan_top_candidates)] + if top: + logger.info( + 'Beat %d: dense vision content scan kept %d/%d candidates; best scene=%d in=%.3fs content=%.3f rank=%.3f.', + beat.beat_id, + len(top), + len(dense), + top[0][4], + top[0][2], + top[0][3], + top[0][0], + ) + return [(coarse_score, t_sec) for _, coarse_score, t_sec, _, _ in top] + + +def _beat_offsets(duration_s: float) -> list[float]: + """Use several frames across the beat, including the leading edge.""" + if duration_s < 1.0: + return [0.0, duration_s * 0.35, duration_s * 0.70] + if duration_s < 2.5: + return [duration_s * r for r in (0.00, 0.15, 0.35, 0.55, 0.78)] 
+ return [duration_s * r for r in (0.00, 0.12, 0.30, 0.50, 0.70, 0.88)] + + +def _prepare_beat_templates(beat: TrailerBeat, cfg: AppConfig) -> list[tuple[float, np.ndarray]]: + templates: list[tuple[float, np.ndarray]] = [] + matchable_s = estimate_matchable_reference_duration(beat, cfg) + for offset_s in _beat_offsets(matchable_s): + frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s) + if frame is None or not _is_scoreable_reference_frame(frame, cfg): + continue + templates.append((offset_s, _prepare_template(frame, cfg))) + return templates + + +def _prepare_beat_templates_stepped( + beat: TrailerBeat, + cfg: AppConfig, + step_s: float = 0.12, +) -> list[tuple[float, np.ndarray]]: + templates: list[tuple[float, np.ndarray]] = [] + matchable_s = estimate_matchable_reference_duration(beat, cfg, sample_step_s=step_s) + t = 0.0 + while t <= matchable_s: + frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t) + if frame is not None and _is_scoreable_reference_frame(frame, cfg): + templates.append((t, _prepare_template(frame, cfg))) + t = round(t + step_s, 6) + return templates + + +def _prepare_motion_templates( + beat: TrailerBeat, + cfg: AppConfig, + step_s: float = 0.12, +) -> list[tuple[float, float, np.ndarray, tuple[int, ...]]]: + """ + Build reference frame-difference templates for motion-phase alignment. + + Absolute image similarity can match the right shot at the wrong point in a + repeated movement. Frame-to-frame deltas make the refine pass care about the + phase and direction of motion as well. 
+ """ + result: list[tuple[float, float, np.ndarray, tuple[int, ...]]] = [] + max_offset = max(0.0, beat.duration_s - step_s) + t = 0.0 + while t <= max_offset: + f0 = grab_frame_at_path(beat.trailer_path, beat.start_s + t) + f1 = grab_frame_at_path(beat.trailer_path, beat.start_s + t + step_s) + if ( + f0 is not None + and f1 is not None + and _is_scoreable_reference_frame(f0, cfg) + and _is_scoreable_reference_frame(f1, cfg) + ): + feat0 = _prepare_template(f0, cfg) + feat1 = _prepare_template(f1, cfg) + result.append((t, step_s, cv2.absdiff(feat1, feat0), feat0.shape)) + t = round(t + step_s, 6) + return result + + +def _is_dark_reference_frame(frame: np.ndarray, cfg: AppConfig) -> bool: + cropped = text_safe_crop( + frame, + cfg.cv.vibe_check.crop_top_fraction, + cfg.cv.vibe_check.crop_bottom_fraction, + ) + gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) + return float(np.mean(gray)) < 28.0 and float(np.percentile(gray, 90)) < 58.0 + + +def _reference_visibility_stats(frame: np.ndarray, cfg: AppConfig) -> tuple[float, float, float]: + cropped = text_safe_crop( + frame, + cfg.cv.vibe_check.crop_top_fraction, + cfg.cv.vibe_check.crop_bottom_fraction, + ) + gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) + p10 = float(np.percentile(gray, 10)) + p90 = float(np.percentile(gray, 90)) + return float(np.mean(gray)), p90, p90 - p10 + + +def _is_scoreable_reference_frame(frame: np.ndarray, cfg: AppConfig) -> bool: + """Exclude black, fade, and low-visibility reference frames from scoring.""" + if _is_dark_reference_frame(frame, cfg): + return False + + mean_luma, p90_luma, contrast = _reference_visibility_stats(frame, cfg) + low_visibility = ( + mean_luma < cfg.cv.deep_scan.scoreable_luma_mean_min + and p90_luma < cfg.cv.deep_scan.scoreable_luma_p90_min + ) + return not low_visibility and contrast >= cfg.cv.deep_scan.scoreable_contrast_min + + +def estimate_matchable_reference_duration( + beat: TrailerBeat, + cfg: AppConfig, + sample_step_s: float | None = None, +) -> 
float: + """ + Estimate the part of a trailer beat that should be source-matchable. + + Trailer beats often include trailing black/title/credit frames that do not + exist in the source movie. Those frames should not force the source match to + cover the full beat duration. + """ + step_s = sample_step_s if sample_step_s is not None else cfg.cv.deep_scan.span_sample_step_s + samples: list[tuple[float, bool]] = [] + t = 0.0 + while t <= beat.duration_s: + frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t) + if frame is not None: + samples.append((t, _is_dark_reference_frame(frame, cfg))) + t = round(t + step_s, 6) + + if not samples: + return beat.duration_s + + dark_run_start: float | None = None + saw_visible = False + min_dark_break_s = max(0.24, step_s * 2.0) + for offset_s, is_dark in samples: + if not is_dark: + saw_visible = True + dark_run_start = None + continue + + if saw_visible: + if dark_run_start is None: + dark_run_start = offset_s + if offset_s - dark_run_start >= min_dark_break_s: + break + + if dark_run_start is None: + return beat.duration_s + + # Keep a small buffer before the first sustained dark/title break so the + # source clip does not visibly end before the trailer begins its fade/card. + # Long beats can contain later credit/title islands; those should not force + # one source clip to validate unrelated images. 
+ return max(step_s, min(beat.duration_s, dark_run_start + step_s)) + + +def _sequence_score( + cap: cv2.VideoCapture, + in_point_s: float, + templates: list[tuple[float, np.ndarray]], + cfg: AppConfig, +) -> float: + weighted_scores: list[float] = [] + raw_scores: list[float] = [] + for offset_s, template in templates: + frame = grab_frame_at(cap, in_point_s + offset_s) + if frame is None: + return -1.0 + floating_score = _match_score(frame, template, cfg) + fixed_score = _fixed_position_score(frame, template, cfg) + score = (floating_score * 0.55) + (fixed_score * 0.45) + # The first frames matter most for perceived sync. Weight them higher + # so a match that begins a few frames early loses to a better aligned hit. + weight = 1.35 if offset_s <= 0.16 else 1.0 + weighted_scores.append(score * weight) + raw_scores.append(score) + if not raw_scores: + return -1.0 + + # Reward consistently good temporal alignment. A single strong frame is not + # enough if the other beat frames drift away. 
+ weighted_avg = sum(weighted_scores) / (len(raw_scores) + 0.35 * sum(1 for o, _ in templates if o <= 0.16)) + return float(weighted_avg * 0.70 + min(raw_scores) * 0.30) + + +def _content_alignment_templates( + beat: TrailerBeat, + cfg: AppConfig, +) -> list[tuple[float, np.ndarray]]: + matchable_s = estimate_matchable_reference_duration( + beat, + cfg, + sample_step_s=cfg.cv.deep_scan.content_align_sample_step_s, + ) + step_s = max(1.0 / cfg.export.edl_frame_rate, cfg.cv.deep_scan.content_align_sample_step_s) + max_offset_s = max(0.0, min(beat.duration_s, matchable_s) - step_s) + offsets = [0.0] + t = step_s + while t <= max_offset_s: + offsets.append(round(t, 6)) + t = round(t + step_s, 6) + if matchable_s > step_s and offsets[-1] < max_offset_s: + offsets.append(round(max_offset_s, 6)) + + templates: list[tuple[float, np.ndarray]] = [] + for offset_s in offsets: + frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s) + if frame is not None: + if not _is_scoreable_reference_frame(frame, cfg): + continue + templates.append((offset_s, _prepare_template(frame, cfg))) + if not templates: + return _prepare_beat_templates(beat, cfg) + return templates + + +def _content_alignment_score( + cap: cv2.VideoCapture, + in_point_s: float, + templates: list[tuple[float, np.ndarray]], + cfg: AppConfig, +) -> float: + if not templates: + return -1.0 + + weighted_total = 0.0 + weight_total = 0.0 + raw_scores: list[float] = [] + early_scores: list[float] = [] + + for offset_s, template in templates: + frame = grab_frame_at(cap, in_point_s + offset_s) + if frame is None: + return -1.0 + + # For offset detection the fixed frame position is intentionally more + # important than free template placement. Free placement can make the + # right shot look acceptable even when the movement is a few frames off. 
+ fixed_score = _fixed_position_score(frame, template, cfg) + floating_score = _match_score(frame, template, cfg) + score = fixed_score * 0.72 + floating_score * 0.28 + + weight = 1.45 if offset_s <= 0.20 else 1.0 + weighted_total += score * weight + weight_total += weight + raw_scores.append(score) + if offset_s <= 0.36: + early_scores.append(score) + + avg_score = weighted_total / weight_total if weight_total > 0 else -1.0 + min_score = min(raw_scores) if raw_scores else -1.0 + early_score = sum(early_scores) / len(early_scores) if early_scores else avg_score + return float(avg_score * 0.55 + min_score * 0.25 + early_score * 0.20) + + +def align_in_point_by_content( + beat: TrailerBeat, + estimated_in_point_s: float, + cfg: AppConfig, + search_window_s: float | None = None, +) -> tuple[float, float]: + """ + Find the frame offset directly from image content around a rough match. + + This is deliberately local: once a candidate shot is plausible, scanning a + small window around it with many reference frames is faster and more robust + than repeating a global scan or applying a fixed frame preroll. 
+ """ + templates = _content_alignment_templates(beat, cfg) + if not templates: + return estimated_in_point_s, 0.0 + + with open_video(cfg.paths.source_movie) as cap: + fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate + frame_step_s = 1.0 / fps + window_s = ( + search_window_s + if search_window_s is not None + else cfg.cv.deep_scan.content_align_window_seconds + ) + start_s = max(0.0, estimated_in_point_s - window_s) + end_s = estimated_in_point_s + window_s + tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta + + best_in = estimated_in_point_s + best_score = -1.0 + t = start_s + while t <= end_s: + score = _content_alignment_score(cap, t, templates, cfg) + if score > best_score + tie_delta: + best_score = score + best_in = t + elif score >= best_score - tie_delta and abs(t - estimated_in_point_s) < abs(best_in - estimated_in_point_s): + best_in = t + t = round(t + frame_step_s, 6) + + return best_in, max(0.0, best_score) + + +def _motion_phase_score( + cap: cv2.VideoCapture, + in_point_s: float, + motion_templates: list[tuple[float, float, np.ndarray, tuple[int, ...]]], + cfg: AppConfig, +) -> float: + scores: list[float] = [] + for offset_s, step_s, ref_delta, template_shape in motion_templates: + f0 = grab_frame_at(cap, in_point_s + offset_s) + f1 = grab_frame_at(cap, in_point_s + offset_s + step_s) + if f0 is None or f1 is None: + return -1.0 + src0 = _fixed_feature(f0, template_shape, cfg) + src1 = _fixed_feature(f1, template_shape, cfg) + scores.append(_corr_same_size(cv2.absdiff(src1, src0), ref_delta)) + + if not scores: + return 0.0 + return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35) + + +def estimate_usable_source_duration( + beat: TrailerBeat, + in_point_s: float, + cfg: AppConfig, + sample_step_s: float | None = None, + min_keep_s: float = 0.5, +) -> tuple[float, float]: + """ + Estimate how long the source stays visually aligned with the beat. 
+ + This catches cases where the source dissolves/cuts into the next shot while + the trailer beat continues into a title card or black fade. + + Returns: + (usable_duration_s, average_good_score) + """ + step_s = sample_step_s if sample_step_s is not None else cfg.cv.deep_scan.span_sample_step_s + templates = _prepare_beat_templates_stepped(beat, cfg, step_s) + if not templates: + return beat.duration_s, 0.0 + + scores: list[tuple[float, float]] = [] + source_fps = cfg.export.edl_frame_rate + with open_video(cfg.paths.source_movie) as cap: + source_fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate + for offset_s, template in templates: + frame = grab_frame_at(cap, in_point_s + offset_s) + if frame is None: + break + scores.append((offset_s, _match_score(frame, template, cfg))) + + if not scores: + return 0.0, 0.0 + + warmup_scores = [score for offset, score in scores if offset <= min(1.0, beat.duration_s * 0.35)] + baseline = max(warmup_scores) if warmup_scores else max(score for _, score in scores) + min_score = max(0.34, baseline * 0.48) + + last_good = 0.0 + bad_run = 0 + good_scores: list[float] = [] + + for offset_s, score in scores: + if score >= min_score: + last_good = offset_s + bad_run = 0 + good_scores.append(score) + continue + + if offset_s < min_keep_s: + continue + + bad_run += 1 + if bad_run >= 3: + break + + tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps) + usable = min(beat.duration_s, max(0.0, last_good - tail_safety_s)) + if usable < min_keep_s and scores: + usable = min(beat.duration_s, max(min_keep_s, scores[0][0] + step_s - tail_safety_s)) + + avg_good = float(sum(good_scores) / len(good_scores)) if good_scores else 0.0 + return usable, avg_good + + +def refine_timestamp(template: np.ndarray, t_sec: float, cfg: AppConfig) -> float: + best_score = -1.0 + best_t = t_sec + tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta + + with open_video(cfg.paths.source_movie) as cap: + fps = 
float(cap.get(cv2.CAP_PROP_FPS)) + step = 1.0 / fps + start_t = max(0.0, t_sec - 0.5) + end_t = t_sec + 0.5 + + t = start_t + while t <= end_t: + frame = grab_frame_at(cap, t) + if frame is not None: + max_val = _match_score(frame, template, cfg) + if max_val > best_score + tie_delta: + best_score = max_val + best_t = t + elif max_val >= best_score - tie_delta and t < best_t: + best_t = t + t += step + + return best_t + + +def refine_in_point_with_sequence( + beat: TrailerBeat, + estimated_in_point_s: float, + cfg: AppConfig, + search_window_s: float | None = None, +) -> tuple[float, float]: + """ + Refine a rough source in-point by comparing several frames across the beat. + + Returns: + (best_in_point_s, sequence_score) + """ + return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s) + + +def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig): + if not scenes: + return None + for idx, scene in enumerate(scenes): + if scene.start_s <= t_sec < scene.end_s: + if ( + scene.end_s - t_sec <= cfg.cv.deep_scan.scene_boundary_epsilon_s + and idx + 1 < len(scenes) + ): + return scenes[idx + 1] + return scene + return None + + +def _source_fps_from_scene(scene) -> float: + duration_s = max(0.0, scene.end_s - scene.start_s) + frame_count = max(0, scene.end_frame - scene.start_frame) + return frame_count / duration_s if duration_s > 0 and frame_count > 0 else 0.0 + + +def _apply_start_preroll(in_point_s: float, source_fps: float, cfg: AppConfig) -> float: + if cfg.cv.deep_scan.start_preroll_frames <= 0: + return in_point_s + fps = source_fps or cfg.export.edl_frame_rate + return max(0.0, in_point_s - (cfg.cv.deep_scan.start_preroll_frames / fps)) + + +def _clamp_to_scene_start(in_point_s: float, scene) -> float: + if scene is None: + return in_point_s + return max(float(scene.start_s), in_point_s) + + +def _add_top_candidate( + candidates: list[tuple[float, float]], + score: float, + t_sec: float, + max_candidates: int, + 
min_distance_s: float, +) -> list[tuple[float, float]]: + """ + Keep diverse coarse candidates as (score, midpoint_time). + + A single best midpoint frame is too brittle: repeated actors, similar color + palettes, cars, forests, and title-card darkness can all create plausible + false positives. Keeping a ranked pool lets the multi-frame sequence pass + choose the temporally consistent match. + """ + for idx, (old_score, old_t) in enumerate(candidates): + if abs(old_t - t_sec) < min_distance_s: + if score > old_score: + candidates[idx] = (score, t_sec) + return sorted(candidates, key=lambda item: item[0], reverse=True)[:max_candidates] + + candidates.append((score, t_sec)) + return sorted(candidates, key=lambda item: item[0], reverse=True)[:max_candidates] + + +def run_global_scan( + beats: Sequence[TrailerBeat], + cfg: AppConfig, + scenes: Sequence | None = None, + seed_in_points: dict[int, Sequence[SeedPoint]] | None = None, +) -> list[MatchResult]: + logger.info('[Global Scan] Preparing templates for %d beats...', len(beats)) + templates = [] + midpoint_templates = [] + beat_valid = [] + + for b in beats: + bf = grab_frame_at_path(cfg.paths.reference_trailer, b.start_s + (b.end_s - b.start_s)/2) + if bf is None: + midpoint_templates.append(None) + templates.append([]) + beat_valid.append(False) + continue + + midpoint_templates.append(_prepare_template(bf, cfg)) + beat_templates = _prepare_beat_templates(b, cfg) + templates.append(beat_templates) + beat_valid.append(bool(beat_templates)) + + top_candidates: list[list[tuple[float, float]]] = [[] for _ in beats] + seed_candidates: list[list[tuple[float, float]]] = [[] for _ in beats] + has_weighted_seeds = False + for idx, beat in enumerate(beats): + for seed in (seed_in_points or {}).get(beat.beat_id, ()): + if isinstance(seed, tuple): + seed_t = float(seed[0]) + seed_score = max( + cfg.cv.deep_scan.coarse_candidate_threshold, + min(0.99, float(seed[1])), + ) + has_weighted_seeds = True + else: + seed_t = 
float(seed) + seed_score = cfg.cv.deep_scan.coarse_candidate_threshold + seed_candidate = ( + seed_score, + max(0.0, seed_t), + ) + seed_candidates[idx].append(seed_candidate) + top_candidates[idx] = _add_top_candidate( + top_candidates[idx], + seed_candidate[0], + seed_candidate[1], + max_candidates=cfg.cv.deep_scan.sequence_candidate_count, + min_distance_s=cfg.cv.deep_scan.sequence_min_distance_s, + ) + if (seed_in_points or {}).get(beat.beat_id): + logger.info( + 'Beat %d: added %d seeded in-point candidates.', + beat.beat_id, + len((seed_in_points or {}).get(beat.beat_id, ())), + ) + + skip_coarse_scan = ( + cfg.vision.enabled + and cfg.cv.deep_scan.skip_coarse_scan_with_weighted_seeds + and has_weighted_seeds + and all(top_candidates[i] for i, valid in enumerate(beat_valid) if valid) + ) + + if skip_coarse_scan: + logger.info('[Global Scan] Weighted vision seeds present; skipping full FFmpeg coarse scan.') + else: + fps = 2.0 + cmd = [ + 'ffmpeg', '-i', str(cfg.paths.source_movie), + '-vf', f'scale={cfg.video.proxy_width}:{cfg.video.proxy_height},fps={fps}', + '-f', 'image2pipe', '-vcodec', 'rawvideo', '-pix_fmt', 'bgr24', '-' + ] + logger.info('[Global Scan] Streaming %s via FFmpeg (%.1f fps) ...', cfg.paths.source_movie.name, fps) + + p = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.DEVNULL) + frame_size = cfg.video.proxy_width * cfg.video.proxy_height * 3 + frame_idx = 0 + start_t = time.time() + + while True: + raw = p.stdout.read(frame_size) + if len(raw) != frame_size: break + + frame = np.frombuffer(raw, dtype=np.uint8).reshape((cfg.video.proxy_height, cfg.video.proxy_width, 3)) + haystack = _prepare_haystack(frame, cfg) + + for i, beat_templates in enumerate(templates): + if not beat_valid[i]: continue + source_t = frame_idx / fps + for beat_offset_s, template in beat_templates: + res = cv2.matchTemplate(haystack, template, cv2.TM_CCOEFF_NORMED) + _, max_val, _, _ = cv2.minMaxLoc(res) + candidate_in_s = source_t - beat_offset_s + if candidate_in_s < 0.0: + 
continue + + top_candidates[i] = _add_top_candidate( + top_candidates[i], + float(max_val), + candidate_in_s, + max_candidates=cfg.cv.deep_scan.sequence_candidate_count, + min_distance_s=cfg.cv.deep_scan.sequence_min_distance_s, + ) + + frame_idx += 1 + if frame_idx % 1000 == 0: + logger.info('[Global Scan] Processed %d frames (%.1fs movie time)...', frame_idx, frame_idx / fps) + + p.stdout.close() + p.wait() + + logger.info('[Global Scan] Finished streaming %d frames in %.1fs.', frame_idx, time.time() - start_t) + + results = [] + source_info = get_video_info(cfg.paths.source_movie) + source_fps = float(source_info['fps']) or 24.0 + + for i, b in enumerate(beats): + if not beat_valid[i]: continue + + candidates = top_candidates[i] + if not candidates: + continue + + score = float(candidates[0][0]) + + if score >= cfg.cv.deep_scan.coarse_candidate_threshold: + matchable_duration_s = estimate_matchable_reference_duration(b, cfg) + logger.info( + 'Beat %d: refining %d temporal candidates (best offset score %.3f, matchable %.2fs / beat %.2fs).', + b.beat_id, + len(candidates), + score, + matchable_duration_s, + b.duration_s, + ) + + best_result: MatchResult | None = None + best_short_result: MatchResult | None = None + best_short_coverage = -1.0 + best_duration_coverage = -1.0 + best_content_score = -1.0 + rejected_short_candidates = 0 + rejected_content_candidates = 0 + scan_cfg = cfg.cv.deep_scan + content_gate = ( + min(scan_cfg.provisional_content_threshold, cfg.vision.content_threshold) + if skip_coarse_scan and has_weighted_seeds + else scan_cfg.provisional_content_threshold + ) + + candidate_pool = candidates[:scan_cfg.content_rerank_candidate_count] + for seed_candidate in seed_candidates[i]: + candidate_pool = _add_top_candidate( + candidate_pool, + seed_candidate[0], + seed_candidate[1], + max_candidates=scan_cfg.content_rerank_candidate_count + len(seed_candidates[i]), + min_distance_s=scan_cfg.sequence_min_distance_s, + ) + if skip_coarse_scan and 
has_weighted_seeds: + dense_candidates = _dense_weighted_seed_candidates( + b, + seed_candidates[i], + cfg, + scenes, + matchable_duration_s, + ) + for dense_candidate in dense_candidates: + candidate_pool = _add_top_candidate( + candidate_pool, + dense_candidate[0], + dense_candidate[1], + max_candidates=( + scan_cfg.content_rerank_candidate_count + + len(seed_candidates[i]) + + len(dense_candidates) + ), + min_distance_s=max(0.04, cfg.vision.local_scan_step_s * 0.5), + ) + reranked_candidates = _rerank_candidates_by_content( + b, + candidate_pool, + cfg, + scenes=scenes, + matchable_duration_s=matchable_duration_s, + ) + refine_limit = ( + min(scan_cfg.max_refine_candidates, cfg.vision.max_refine_candidates) + if skip_coarse_scan and has_weighted_seeds + else scan_cfg.max_refine_candidates + ) + refine_candidates = [ + (coarse_score, in_point_s) + for _, coarse_score, in_point_s in reranked_candidates[:refine_limit] + ] + validation_templates = _prepare_validation_templates(b, cfg) + logger.info( + 'Beat %d: content-reranked top %d / %d candidates.', + b.beat_id, + len(refine_candidates), + len(candidate_pool), + ) + + for coarse_score, coarse_in_s in refine_candidates: + rough_in_s = coarse_in_s + is_weighted_seed_candidate = ( + skip_coarse_scan + and has_weighted_seeds + and coarse_score > scan_cfg.coarse_candidate_threshold + 0.05 + ) + if midpoint_templates[i] is not None and not is_weighted_seed_candidate: + midpoint_t = coarse_in_s + (b.duration_s / 2) + fine_t = refine_timestamp(midpoint_templates[i], midpoint_t, cfg) + rough_in_s = max(0.0, fine_t - (b.duration_s / 2)) + local_align_window_s = ( + min(cfg.vision.local_scan_step_s, cfg.cv.deep_scan.content_align_window_seconds) + if is_weighted_seed_candidate + else None + ) + refined_in_s, sequence_score = refine_in_point_with_sequence( + b, + rough_in_s, + cfg, + search_window_s=local_align_window_s, + ) + scene = _find_scene_for_time(scenes, refined_in_s, cfg) + scene_fps = 
_source_fps_from_scene(scene) if scene is not None else source_fps + adjusted_in_s = _apply_start_preroll(refined_in_s, scene_fps, cfg) + adjusted_in_s = _clamp_to_scene_start(adjusted_in_s, scene) + scene = _find_scene_for_time(scenes, adjusted_in_s, cfg) + usable_duration_s, span_score = estimate_usable_source_duration(b, adjusted_in_s, cfg) + out_s = adjusted_in_s + usable_duration_s + if scene is not None: + out_s = min(out_s, scene.end_s) + duration_s = max(0.0, out_s - adjusted_in_s) + duration_coverage = min(1.0, duration_s / matchable_duration_s) if matchable_duration_s > 0 else 0.0 + with open_video(cfg.paths.source_movie) as validation_cap: + original_content_score = _fixed_content_sequence_score( + validation_cap, + adjusted_in_s, + validation_templates, + cfg, + ) + content_score = original_content_score + content_in_s, align_content_score = align_in_point_by_content( + b, + adjusted_in_s, + cfg, + search_window_s=( + local_align_window_s + if local_align_window_s is not None + else min(0.8, cfg.cv.deep_scan.content_align_window_seconds) + ), + ) + if abs(content_in_s - adjusted_in_s) <= cfg.cv.deep_scan.content_align_window_seconds: + with open_video(cfg.paths.source_movie) as validation_cap: + aligned_content_score = _fixed_content_sequence_score( + validation_cap, + content_in_s, + validation_templates, + cfg, + ) + if aligned_content_score >= original_content_score + 0.01: + adjusted_in_s = content_in_s + content_score = min(align_content_score, aligned_content_score) + scene = _find_scene_for_time(scenes, adjusted_in_s, cfg) + usable_duration_s = max(0.0, duration_s) + out_s = adjusted_in_s + usable_duration_s + if scene is not None: + out_s = min(out_s, scene.end_s) + duration_s = max(0.0, out_s - adjusted_in_s) + duration_coverage = ( + min(1.0, duration_s / matchable_duration_s) + if matchable_duration_s > 0 else 0.0 + ) + + if is_weighted_seed_candidate and scene is not None and content_score >= content_gate: + contiguous_usable_s = 
_contiguous_scene_coverage_duration( + b, + adjusted_in_s, + scenes, + matchable_duration_s, + cfg, + ) + scene_duration_s = min(b.duration_s, contiguous_usable_s) + if scene_duration_s > duration_s: + usable_duration_s = scene_duration_s + out_s = adjusted_in_s + usable_duration_s + duration_s = usable_duration_s + duration_coverage = ( + min(1.0, duration_s / matchable_duration_s) + if matchable_duration_s > 0 else 0.0 + ) + span_score = max(span_score, content_score) + + final_score = ( + sequence_score * scan_cfg.sequence_score_weight + + span_score * scan_cfg.span_score_weight + + coarse_score * scan_cfg.coarse_score_weight + + duration_coverage * scan_cfg.duration_score_weight + ) + final_score = ( + final_score * (1.0 - scan_cfg.content_validation_weight) + + content_score * scan_cfg.content_validation_weight + ) + if is_weighted_seed_candidate: + vision_provisional_score = ( + content_score * 0.55 + + duration_coverage * 0.33 + + coarse_score * 0.12 + ) + final_score = max(final_score, vision_provisional_score) + if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate: + final_score = min(final_score, content_score) + if content_score < content_gate: + logger.debug( + 'Beat %d rejected by content validation in=%.3fs scene=%s content=%.3f min=%.3f', + b.beat_id, + adjusted_in_s, + scene.scene_id if scene is not None else 'none', + content_score, + content_gate, + ) + rejected_content_candidates += 1 + continue + candidate_result = MatchResult( + beat_id=b.beat_id, + scene_id=scene.scene_id if scene is not None else 0, + source_path=cfg.paths.source_movie, + in_point_s=max(0.0, adjusted_in_s), + out_point_s=out_s, + in_point_frame=int(max(0.0, adjusted_in_s) * source_fps), + match_score=final_score, + ) + + if duration_coverage < scan_cfg.min_duration_coverage: + rejected_short_candidates += 1 + logger.debug( + 'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f', + 
b.beat_id, + adjusted_in_s, + scene.scene_id if scene is not None else 'none', + sequence_score, + span_score, + coarse_score, + content_score, + duration_coverage, + final_score, + ) + long_enough_for_review = duration_s >= max(0.5, matchable_duration_s * 0.45) + visually_plausible = ( + sequence_score >= scan_cfg.provisional_match_threshold + or final_score >= scan_cfg.provisional_match_threshold + ) + if long_enough_for_review and visually_plausible: + if ( + best_short_result is None + or candidate_result.match_score + > best_short_result.match_score + scan_cfg.duration_tie_break_score_delta + or ( + candidate_result.match_score + >= best_short_result.match_score - scan_cfg.duration_tie_break_score_delta + and duration_coverage > best_short_coverage + ) + ): + best_short_result = candidate_result + best_short_coverage = duration_coverage + continue + + logger.debug( + 'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f', + b.beat_id, + adjusted_in_s, + scene.scene_id if scene is not None else 'none', + sequence_score, + span_score, + coarse_score, + content_score, + duration_coverage, + final_score, + ) + + clearly_better_score = ( + best_result is None + or candidate_result.match_score + > best_result.match_score + scan_cfg.duration_tie_break_score_delta + ) + similar_score_better_duration = ( + best_result is not None + and candidate_result.match_score + >= best_result.match_score - scan_cfg.duration_tie_break_score_delta + and duration_coverage > best_duration_coverage + 0.03 + ) + similar_vision_score_earlier_phase = ( + is_weighted_seed_candidate + and best_result is not None + and candidate_result.scene_id == best_result.scene_id + and candidate_result.match_score + >= best_result.match_score - cfg.vision.local_scan_tie_break_score_delta + and content_score >= best_content_score - 0.005 + and duration_coverage >= best_duration_coverage - 0.03 + and candidate_result.in_point_s < 
best_result.in_point_s + ) + similar_vision_score_better_phase = ( + is_weighted_seed_candidate + and best_result is not None + and candidate_result.scene_id == best_result.scene_id + and candidate_result.match_score + >= best_result.match_score - cfg.vision.local_scan_tie_break_score_delta + and content_score > best_content_score + 0.008 + and duration_coverage >= best_duration_coverage - 0.03 + ) + + if ( + clearly_better_score + or similar_score_better_duration + or similar_vision_score_earlier_phase + or similar_vision_score_better_phase + ): + best_result = candidate_result + best_duration_coverage = duration_coverage + best_content_score = content_score + + if best_result is None: + if best_short_result is not None: + logger.warning( + 'Beat %d: using short provisional automatic match scene=%d in=%.3fs dur=%.3fs coverage=%.2f score=%.3f', + b.beat_id, + best_short_result.scene_id, + best_short_result.in_point_s, + best_short_result.duration_s, + best_short_coverage, + best_short_result.match_score, + ) + best_result = best_short_result + best_duration_coverage = best_short_coverage + else: + if rejected_content_candidates > 0 and rejected_short_candidates == 0: + logger.warning( + 'Beat %d: NO MATCH after refinement (%d candidates rejected by content validation)', + b.beat_id, + rejected_content_candidates, + ) + else: + logger.warning( + 'Beat %d: NO MATCH after refinement (%d candidates rejected below %.0f%% duration coverage, %d by content validation)', + b.beat_id, + rejected_short_candidates, + scan_cfg.min_duration_coverage * 100.0, + rejected_content_candidates, + ) + continue + is_confirmed = best_result.match_score >= cfg.cv.deep_scan.match_threshold + if best_result.match_score < cfg.cv.deep_scan.provisional_match_threshold: + logger.warning( + 'Beat %d: NO MATCH after refinement (best final score %.3f, provisional threshold %.3f)', + b.beat_id, + best_result.match_score, + cfg.cv.deep_scan.provisional_match_threshold, + ) + continue + if not 
is_confirmed: + logger.warning( + 'Beat %d: provisional automatic match scene=%d in=%.3fs score=%.3f (confirmed threshold %.3f)', + b.beat_id, + best_result.scene_id, + best_result.in_point_s, + best_result.match_score, + cfg.cv.deep_scan.match_threshold, + ) + + logger.info( + 'Beat %d: best automatic match scene=%d in=%.3fs dur=%.3fs coverage=%.2f score=%.3f', + b.beat_id, + best_result.scene_id, + best_result.in_point_s, + best_result.duration_s, + best_duration_coverage, + best_result.match_score, + ) + + results.append(MatchResult( + beat_id=b.beat_id, + scene_id=best_result.scene_id, + source_path=cfg.paths.source_movie, + in_point_s=best_result.in_point_s, + out_point_s=best_result.out_point_s, + in_point_frame=best_result.in_point_frame, + match_score=best_result.match_score, + is_confirmed=is_confirmed, + )) + else: + logger.warning( + 'Beat %d: NO MATCH (best coarse score %.3f, coarse threshold %.3f)', + b.beat_id, + score, + cfg.cv.deep_scan.coarse_candidate_threshold, + ) + + if skip_coarse_scan and not results and cfg.vision.fullscan_fallback: + logger.warning( + '[Global Scan] Weighted vision-seed pass found no valid matches; retrying with full FFmpeg coarse scan.' + ) + retry_cfg = replace( + cfg, + cv=replace( + cfg.cv, + deep_scan=replace(cfg.cv.deep_scan, skip_coarse_scan_with_weighted_seeds=False), + ), + ) + return run_global_scan(beats, retry_cfg, scenes=scenes, seed_in_points=seed_in_points) + + return results diff --git a/src/cv/scene_indexer.py b/src/cv/scene_indexer.py new file mode 100644 index 0000000..10bb47e --- /dev/null +++ b/src/cv/scene_indexer.py @@ -0,0 +1,229 @@ +""" +src/cv/scene_indexer.py — Source-movie scene segmentation + fingerprinting + +Responsibility: + 1. Run PySceneDetect on the source movie → list of raw scene boundaries + 2. For each scene, extract the midpoint frame and fingerprint it + 3. Optionally run Whisper dialogue on each scene (injected as dependency) + 4. 
Persist results to .cache/ as JSON for fast re-runs + +Returns: list[Scene] with luma_hist, sat_hist, phash populated. +""" + +from __future__ import annotations + +import json +import logging +import pickle +from pathlib import Path +from typing import Callable, Sequence + +import numpy as np + +from src.core.config import AppConfig +from src.core.models import Scene +from src.cv.fingerprinting import fingerprint_frame +from src.cv.frame_extractor import grab_midpoint_frame, open_video + +logger = logging.getLogger(__name__) + +# Type alias for an optional dialogue-injection callback +DialogueCallback = Callable[[Scene], Scene] + + +# --------------------------------------------------------------------------- +# Cache helpers +# --------------------------------------------------------------------------- + +def _cache_path(cfg: AppConfig) -> Path: + p = cfg.paths.cache_dir / "scene_index.json" + p.parent.mkdir(parents=True, exist_ok=True) + return p + + +def _scene_to_dict(s: Scene) -> dict: + return { + "scene_id": s.scene_id, + "source_path": str(s.source_path), + "start_s": s.start_s, + "end_s": s.end_s, + "start_frame": s.start_frame, + "end_frame": s.end_frame, + # histograms serialised as hex so JSON can hold them + "luma_hist": s.luma_hist.hex() if s.luma_hist else None, + "sat_hist": s.sat_hist.hex() if s.sat_hist else None, + "phash": s.phash, + } + + +def _scene_from_dict(d: dict) -> Scene: + return Scene( + scene_id=d["scene_id"], + source_path=Path(d["source_path"]), + start_s=d["start_s"], + end_s=d["end_s"], + start_frame=d["start_frame"], + end_frame=d["end_frame"], + luma_hist=bytes.fromhex(d["luma_hist"]) if d.get("luma_hist") else None, + sat_hist= bytes.fromhex(d["sat_hist"]) if d.get("sat_hist") else None, + phash=d.get("phash"), + ) + + +def _save_cache(scenes: list[Scene], cfg: AppConfig) -> None: + data = [_scene_to_dict(s) for s in scenes] + _cache_path(cfg).write_text(json.dumps(data, indent=2), encoding="utf-8") + logger.info("Scene index 
cached → %s (%d scenes)", _cache_path(cfg), len(scenes)) + + +def _load_cache(cfg: AppConfig) -> list[Scene] | None: + p = _cache_path(cfg) + if not p.exists(): + return None + try: + data = json.loads(p.read_text(encoding="utf-8")) + scenes = [_scene_from_dict(d) for d in data] + logger.info("Loaded %d scenes from cache (%s)", len(scenes), p) + return scenes + except Exception as exc: + logger.warning("Cache corrupt, re-indexing: %s", exc) + return None + + +# --------------------------------------------------------------------------- +# PySceneDetect integration +# --------------------------------------------------------------------------- + +def _detect_scenes_pyscenedetect(cfg: AppConfig) -> list[tuple[float, float, int, int]]: + """ + Run PySceneDetect ContentDetector on the source movie. + + Returns: + List of (start_s, end_s, start_frame, end_frame) tuples. + """ + try: + from scenedetect import open_video as sd_open_video, SceneManager + from scenedetect.detectors import ContentDetector + except ImportError: + raise ImportError( + "scenedetect is not installed. 
Run: pip install scenedetect[opencv]" + ) + + video = sd_open_video(str(cfg.paths.source_movie)) + manager = SceneManager() + manager.add_detector( + ContentDetector( + threshold=cfg.scene_detection.content_threshold, + min_scene_len=int( + cfg.scene_detection.min_scene_duration_s + * video.frame_rate + ), + ) + ) + + logger.info("Detecting scenes in %s …", cfg.paths.source_movie.name) + manager.detect_scenes(video=video, show_progress=True) + + raw = manager.get_scene_list() + result: list[tuple[float, float, int, int]] = [] + for start_tc, end_tc in raw: + result.append(( + start_tc.get_seconds(), + end_tc.get_seconds(), + start_tc.get_frames(), + end_tc.get_frames(), + )) + + logger.info("PySceneDetect found %d scenes.", len(result)) + return result + + +# --------------------------------------------------------------------------- +# Fingerprint enrichment +# --------------------------------------------------------------------------- + +def _fingerprint_scenes( + raw_scenes: list[tuple[float, float, int, int]], + cfg: AppConfig, +) -> list[Scene]: + """ + For each raw scene boundary, extract the midpoint frame and fingerprint it. 
+ """ + scenes: list[Scene] = [] + vc_cfg = cfg.cv.vibe_check + + logger.info("Fingerprinting %d scenes …", len(raw_scenes)) + + with open_video(cfg.paths.source_movie) as cap: + for idx, (start_s, end_s, start_frame, end_frame) in enumerate(raw_scenes): + frame = grab_midpoint_frame(cap, start_s, end_s) + + if frame is None: + logger.warning("Scene %d: midpoint frame decode failed, skipping fingerprint.", idx) + scenes.append(Scene( + scene_id=idx, + source_path=cfg.paths.source_movie, + start_s=start_s, end_s=end_s, + start_frame=start_frame, end_frame=end_frame, + )) + continue + + luma_bytes, sat_bytes, phash_hex = fingerprint_frame(frame, vc_cfg) + + scenes.append(Scene( + scene_id=idx, + source_path=cfg.paths.source_movie, + start_s=start_s, end_s=end_s, + start_frame=start_frame, end_frame=end_frame, + luma_hist=luma_bytes, + sat_hist=sat_bytes, + phash=phash_hex, + )) + + if (idx + 1) % 50 == 0: + logger.info(" … %d / %d scenes fingerprinted", idx + 1, len(raw_scenes)) + + return scenes + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def build_scene_index( + cfg: AppConfig, + force_reindex: bool = False, + dialogue_callback: DialogueCallback | None = None, +) -> list[Scene]: + """ + Build (or load from cache) the full scene index for the source movie. + + Steps: + 1. Load from .cache/scene_index.json if available and force_reindex=False. + 2. Otherwise: detect scenes via PySceneDetect → fingerprint → cache. + 3. Optionally enrich each scene with dialogue via dialogue_callback. + + Args: + cfg: Application configuration. + force_reindex: Ignore cache and re-run detection + fingerprinting. + dialogue_callback: Optional function Scene → Scene that adds dialogue. + Injected here so this module stays audio-free. + + Returns: + List of Scene objects with fingerprints populated. 
+ """ + if not force_reindex: + cached = _load_cache(cfg) + if cached is not None: + if dialogue_callback: + cached = [dialogue_callback(s) for s in cached] + return cached + + raw = _detect_scenes_pyscenedetect(cfg) + scenes = _fingerprint_scenes(raw, cfg) + _save_cache(scenes, cfg) + + if dialogue_callback: + scenes = [dialogue_callback(s) for s in scenes] + + return scenes diff --git a/src/cv/vibe_check.py b/src/cv/vibe_check.py new file mode 100644 index 0000000..ed1d1fd --- /dev/null +++ b/src/cv/vibe_check.py @@ -0,0 +1,190 @@ +""" +src/cv/vibe_check.py — Phase 1: Scene-level histogram / pHash filter + +Responsibility: + Given ONE TrailerBeat (with pre-computed fingerprints) and a list of + source Scenes (also fingerprinted), return the Top-K candidates ranked + by a combined histogram + pHash score. + +This module contains ZERO file I/O and ZERO frame decoding — those live +in the pipeline layer. Input = model objects, output = sorted VibeHit list. +""" + +from __future__ import annotations + +import logging +from dataclasses import replace +from typing import Sequence + +import cv2 +import numpy as np + +from src.core.models import Scene, TrailerBeat, VibeHit +from src.cv.fingerprinting import bytes_to_hist, phash_distance + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Scoring +# --------------------------------------------------------------------------- + +# Weight applied to histogram score vs pHash score in the combined metric. +# pHash gets less weight because it's sensitive to text overlays on source. +_HIST_WEIGHT = 0.70 +_PHASH_WEIGHT = 0.30 +_PHASH_MAX_BITS = 64 # maximum possible Hamming distance + + +def _hist_combined_score( + beat: TrailerBeat, + scene: Scene, + hist_method: int, +) -> float: + """ + Average CORREL score of luma + saturation histograms. + + Returns a value in [-1, 1] (CORREL) or [0, 1] depending on method. 
+ Higher is always more similar (we invert BHATTACHARYYA if needed). + """ + if beat.luma_hist is None or scene.luma_hist is None: + return 0.0 + if beat.sat_hist is None or scene.sat_hist is None: + return 0.0 + + luma_score = cv2.compareHist( + bytes_to_hist(beat.luma_hist), + bytes_to_hist(scene.luma_hist), + hist_method, + ) + sat_score = cv2.compareHist( + bytes_to_hist(beat.sat_hist), + bytes_to_hist(scene.sat_hist), + hist_method, + ) + + # Normalise BHATTACHARYYA to [0, 1] similarity (invert distance) + if hist_method == cv2.HISTCMP_BHATTACHARYYA: + luma_score = 1.0 - float(luma_score) + sat_score = 1.0 - float(sat_score) + + return float((luma_score + sat_score) / 2.0) + + +def _phash_score(beat: TrailerBeat, scene: Scene) -> float: + """ + Convert Hamming distance to a [0, 1] similarity score. + + 0 Hamming distance → 1.0 (identical) + 64 Hamming distance → 0.0 (completely different) + """ + if beat.phash is None or scene.phash is None: + return 0.0 + dist = phash_distance(beat.phash, scene.phash) + return 1.0 - (dist / _PHASH_MAX_BITS) + + +def _combined_score( + beat: TrailerBeat, + scene: Scene, + hist_method: int, +) -> float: + """Weighted aggregate of histogram + pHash similarity.""" + hist = _hist_combined_score(beat, scene, hist_method) + phash = _phash_score(beat, scene) + return _HIST_WEIGHT * hist + _PHASH_WEIGHT * phash + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def run_vibe_check( + beat: TrailerBeat, + scenes: Sequence[Scene], + top_k: int, + hist_method: int, + phash_max_distance: int, +) -> list[VibeHit]: + """ + Phase 1: Score all source scenes against one trailer beat and return + the top-K candidates for Deep Scan. + + Args: + beat: The trailer beat to match (must have fingerprints). + scenes: All detected scenes from the source movie. + top_k: Maximum number of candidates to return. 
+ hist_method: cv2.HISTCMP_* constant (e.g. 0 = CORREL). + phash_max_distance: Scenes with pHash Hamming distance > this value + are excluded before ranking (hard filter). + + Returns: + List of VibeHit, sorted by combined_score descending, length ≤ top_k. + Empty list if beat has no fingerprints or no scenes pass the filter. + """ + if beat.luma_hist is None and beat.phash is None: + logger.warning( + "Beat %d has no fingerprints — skipping Vibe Check.", beat.beat_id + ) + return [] + + candidates: list[VibeHit] = [] + + for scene in scenes: + # Hard pHash filter: skip scenes that are too visually distant + if beat.phash and scene.phash: + dist = phash_distance(beat.phash, scene.phash) + if dist > phash_max_distance: + continue # fast rejection — avoids full histogram compare + + hist = _hist_combined_score(beat, scene, hist_method) + phash = _phash_score(beat, scene) + combined = _HIST_WEIGHT * hist + _PHASH_WEIGHT * phash + + candidates.append(VibeHit( + beat_id=beat.beat_id, + scene_id=scene.scene_id, + hist_score=round(hist, 4), + phash_distance=( + phash_distance(beat.phash, scene.phash) + if beat.phash and scene.phash + else _PHASH_MAX_BITS + ), + combined_score=round(combined, 4), + )) + + # Sort by combined score, descending; return top-K + candidates.sort(key=lambda h: h.combined_score, reverse=True) + top = candidates[:top_k] + + logger.info( + "Vibe Check beat=%d: %d scenes scored, %d candidates forwarded to Deep Scan. " + "Best score: %.3f (scene %s)", + beat.beat_id, + len(candidates), + len(top), + top[0].combined_score if top else 0.0, + top[0].scene_id if top else "—", + ) + + return top + + +def batch_vibe_check( + beats: Sequence[TrailerBeat], + scenes: Sequence[Scene], + top_k: int, + hist_method: int, + phash_max_distance: int, +) -> dict[int, list[VibeHit]]: + """ + Run Vibe Check for every beat and return a mapping beat_id → [VibeHit]. + + Convenience wrapper for the pipeline layer. 
+ """ + return { + beat.beat_id: run_vibe_check( + beat, scenes, top_k, hist_method, phash_max_distance + ) + for beat in beats + } diff --git a/src/export/__init__.py b/src/export/__init__.py new file mode 100644 index 0000000..da61106 --- /dev/null +++ b/src/export/__init__.py @@ -0,0 +1 @@ +# src.export package — FCPXML / EDL export diff --git a/src/export/edl_writer.py b/src/export/edl_writer.py new file mode 100644 index 0000000..d593b99 --- /dev/null +++ b/src/export/edl_writer.py @@ -0,0 +1,114 @@ +""" +src/export/edl_writer.py — EditTimeline → CMX 3600 EDL + +Generates a standard CMX 3600 Edit Decision List compatible with +Avid, DaVinci Resolve, Premiere Pro, and most NLEs. + +CMX 3600 format reference: + https://en.wikipedia.org/wiki/Edit_decision_list#CMX_3600 +""" + +from __future__ import annotations + +import logging +from pathlib import Path + +from src.core.config import AppConfig +from src.core.models import EditClip, EditTimeline +from src.export.timecode import seconds_to_smpte + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# EDL line builders +# --------------------------------------------------------------------------- + +def _edl_header(title: str) -> str: + return f"TITLE: {title}\nFCM: NON-DROP FRAME\n" + + +def _edl_event( + event_num: int, + clip: EditClip, + fps: float, +) -> str: + """ + Build one CMX 3600 event block for a single clip. + + Format: + NNN AX V C + * FROM CLIP NAME: ... + * COMMENT: ... 
+ """ + src_in = seconds_to_smpte(clip.match.in_point_s, fps) + source_duration_s = clip.source_timeline_duration_s + src_out = seconds_to_smpte(clip.match.in_point_s + source_duration_s, fps) + rec_in = seconds_to_smpte(clip.timeline_start_s, fps) + rec_out = seconds_to_smpte(clip.timeline_start_s + source_duration_s, fps) + + event_line = f"{event_num:03d} AX V C {src_in} {src_out} {rec_in} {rec_out}" + name_line = f"* FROM CLIP NAME: {clip.match.source_path.name}" + comment_line = ( + f"* BEAT {clip.beat.beat_id:03d} | {clip.beat.beat_type.name} | " + f"score={clip.match.match_score:.3f}" + ) + + return "\n".join([event_line, name_line, comment_line, ""]) + + +def _edl_black_tail_event(event_num: int, clip: EditClip, fps: float) -> str: + rec_in = seconds_to_smpte(clip.timeline_start_s + clip.source_timeline_duration_s, fps) + rec_out = seconds_to_smpte(clip.timeline_end_s, fps) + event_line = f"{event_num:03d} BL V C 00:00:00:00 00:00:00:00 {rec_in} {rec_out}" + comment_line = ( + f"* BEAT {clip.beat.beat_id:03d} TRAILER-ONLY TAIL | " + "add fade/dissolve to black" + ) + return "\n".join([event_line, "* FROM CLIP NAME: BLACK", comment_line, ""]) + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def write_edl( + timeline: EditTimeline, + cfg: AppConfig, + output_path: Path | None = None, +) -> Path: + """ + Write the EditTimeline as a CMX 3600 EDL file. + + Args: + timeline: EditTimeline from build_timeline(). + cfg: Application configuration. + output_path: Override destination. Defaults to + /.edl. + + Returns: + Path to the written .edl file. 
+ """ + if output_path is None: + output_path = cfg.paths.output_dir / f"{timeline.title}.edl" + + output_path.parent.mkdir(parents=True, exist_ok=True) + + fps = timeline.frame_rate + lines = [_edl_header(timeline.title), "\n"] + + event_num = 1 + for clip in sorted(timeline.clips, key=lambda c: c.clip_index): + lines.append(_edl_event(event_num, clip, fps)) + event_num += 1 + if clip.trailer_tail_s > 0: + lines.append("\n") + lines.append(_edl_black_tail_event(event_num, clip, fps)) + event_num += 1 + lines.append("\n") + + edl_text = "\n".join(lines) + output_path.write_text(edl_text, encoding="utf-8") + + logger.info("EDL written → %s (%d events)", output_path, timeline.clip_count) + return output_path diff --git a/src/export/fcpxml_writer.py b/src/export/fcpxml_writer.py new file mode 100644 index 0000000..bba4098 --- /dev/null +++ b/src/export/fcpxml_writer.py @@ -0,0 +1,222 @@ +""" +src/export/fcpxml_writer.py — EditTimeline → Final Cut Pro XML (FCPXML 1.10) + +Generates a standards-compliant FCPXML file that can be imported directly +into Final Cut Pro X, DaVinci Resolve, or Premiere Pro (via FCPXML plugin). 
+ +Spec reference: https://developer.apple.com/documentation/professional_video_applications/fcpxml_reference +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from urllib.parse import quote +from xml.etree import ElementTree as ET +from xml.etree.ElementTree import Element, SubElement + +from src.core.config import AppConfig +from src.core.models import EditClip, EditTimeline +from src.export.timecode import ( + fcpxml_format_name, + fcpxml_frame_duration, + seconds_to_fcpxml, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Asset registry — one per unique source file +# --------------------------------------------------------------------------- + +class _AssetRegistry: + def __init__(self) -> None: + self._assets: dict[Path, str] = {} # path → asset id + self._counter = 2 # r1 reserved for format + + def get_or_create(self, path: Path) -> str: + if path not in self._assets: + rid = f"r{self._counter}" + self._assets[path] = rid + self._counter += 1 + return self._assets[path] + + @property + def items(self) -> dict[Path, str]: + return dict(self._assets) + + +# --------------------------------------------------------------------------- +# Builder +# --------------------------------------------------------------------------- + +def _path_to_url(path: Path) -> str: + """Convert an absolute Path to a file:// URL as required by FCPXML.""" + posix = path.as_posix() + if not posix.startswith("/"): + # Windows drive letter: C:/foo → /C:/foo + posix = "/" + posix + return "file://" + quote(posix, safe="/:@") + + +def build_fcpxml( + timeline: EditTimeline, + cfg: AppConfig, + source_duration_s: float = 7200.0, # 2-hour fallback if not probed +) -> ET.ElementTree: + """ + Build a complete FCPXML ElementTree from an EditTimeline. + + Args: + timeline: Ordered sequence of EditClips. + cfg: Application configuration. 
+ source_duration_s: Duration of the source movie asset (used for + duration attribute). Will be probed + automatically when possible. + + Returns: + xml.etree.ElementTree.ElementTree — call .write() to serialise. + """ + fps = timeline.frame_rate + + # ---- root --------------------------------------------------------------- + root = Element("fcpxml", version=cfg.export.fcpxml_version) + root.set("xmlns", "http://www.apple.com/dt/FCPXML/1_10") + + # ---- resources ---------------------------------------------------------- + resources = SubElement(root, "resources") + + format_id = "r1" + format_name = fcpxml_format_name(fps) + fmt = SubElement(resources, "format", + id=format_id, + name=format_name, + frameDuration=fcpxml_frame_duration(fps), + width="1920", + height="1080", + colorSpace="1-1-1 (Rec. 709)", + ) + + registry = _AssetRegistry() + + # Pre-register all unique source paths so elements come before + # the block (required by FCPXML spec). + for clip in timeline.clips: + registry.get_or_create(clip.match.source_path) + + # Probe actual source duration when possible + _durations: dict[Path, float] = {} + for path in registry.items: + try: + from src.cv.frame_extractor import get_video_info + info = get_video_info(path) + _durations[path] = float(info["duration_s"]) + except Exception: + _durations[path] = source_duration_s + + for path, rid in registry.items.items(): + dur_s = _durations.get(path, source_duration_s) + SubElement(resources, "asset", + id=rid, + name=path.stem, + src=_path_to_url(path), + start="0s", + duration=seconds_to_fcpxml(dur_s, fps), + hasVideo="1", + hasAudio="1", + format=format_id, + ) + + # ---- library / event / project ------------------------------------------ + library = SubElement(root, "library") + event = SubElement(library, "event", name=timeline.title) + project = SubElement(event, "project", name=timeline.title) + sequence = SubElement(project, "sequence", + duration=seconds_to_fcpxml(timeline.total_duration_s, fps), + 
format=format_id, + tcStart="0s", + tcFormat="NDF", + audioLayout="stereo", + audioRate="48k", + ) + spine = SubElement(sequence, "spine") + + # ---- clips -------------------------------------------------------------- + for clip in sorted(timeline.clips, key=lambda c: c.clip_index): + asset_id = registry.get_or_create(clip.match.source_path) + + source_duration_s = clip.source_timeline_duration_s + clip_elem = SubElement(spine, "clip", + name=f"Beat_{clip.beat.beat_id:03d}_{clip.beat.beat_type.name}", + ref=asset_id, + # offset = position on the timeline + offset=seconds_to_fcpxml(clip.timeline_start_s, fps), + # duration = matched source part only; trailer-only tails become gaps. + duration=seconds_to_fcpxml(source_duration_s, fps), + # start = in-point inside the source asset + start=seconds_to_fcpxml(clip.match.in_point_s, fps), + ) + + # Inline audio role + SubElement(clip_elem, "audio", + role="dialogue", + srcCh="1, 2", + outCh="L, R", + ) + + if clip.trailer_tail_s > 0: + gap = SubElement(spine, "gap", + name=f"Beat_{clip.beat.beat_id:03d}_TRAILER_TAIL_BLACK_FADE", + offset=seconds_to_fcpxml(clip.timeline_start_s + source_duration_s, fps), + duration=seconds_to_fcpxml(clip.trailer_tail_s, fps), + start="0s", + ) + SubElement(gap, "marker", + start="0s", + value="Trailer-only tail: add fade/dissolve to black here", + completed="0", + ) + + return ET.ElementTree(root) + + +# --------------------------------------------------------------------------- +# Writer +# --------------------------------------------------------------------------- + +def write_fcpxml( + timeline: EditTimeline, + cfg: AppConfig, + output_path: Path | None = None, +) -> Path: + """ + Serialise the EditTimeline to a .fcpxml file. + + Args: + timeline: EditTimeline from build_timeline(). + cfg: Application configuration. + output_path: Override destination. Defaults to + /.fcpxml. + + Returns: + Path to the written .fcpxml file. 
# ---------------------------------------------------------------------------
# Common frame rates
# ---------------------------------------------------------------------------

# Exact rational form of every supported rate: fps == numerator / denominator.
_FPS_RATIONAL: dict[float, tuple[int, int]] = {
    23.976: (24000, 1001),
    24.0: (24, 1),
    25.0: (25, 1),
    29.97: (30000, 1001),
    30.0: (30, 1),
    50.0: (50, 1),
    59.94: (60000, 1001),
    60.0: (60, 1),
}

# FCPXML format-name suffix for each supported rate (same keys as _FPS_RATIONAL).
_FPS_FORMAT_TAG: dict[float, str] = {
    23.976: "2398",
    24.0: "24",
    25.0: "25",
    29.97: "2997",
    30.0: "30",
    50.0: "50",
    59.94: "5994",
    60.0: "60",
}

_TOLERANCE = 0.01  # fps match tolerance


def _match_common_fps(fps: float) -> float | None:
    """Return the key of the common rate within ``_TOLERANCE`` of *fps*, or None."""
    for ref_fps in _FPS_RATIONAL:
        if abs(fps - ref_fps) < _TOLERANCE:
            return ref_fps
    return None


def _fps_to_rational(fps: float) -> tuple[int, int]:
    """
    Return (numerator, denominator) such that fps == numerator / denominator.

    Common rates are snapped to their exact broadcast fraction (23.976 and
    24000/1001 both map to (24000, 1001)); anything else is approximated with
    a denominator of at most 1001.

    Raises:
        ValueError: If *fps* is not a positive, finite number. Rejecting it
            here avoids a later ZeroDivisionError or an "N/0s" time string.
    """
    if not math.isfinite(fps) or fps <= 0:
        raise ValueError(f"fps must be a positive, finite number, got {fps!r}")

    ref_fps = _match_common_fps(fps)
    if ref_fps is not None:
        return _FPS_RATIONAL[ref_fps]

    # Fallback: closest fraction with a small denominator.
    f = Fraction(fps).limit_denominator(1001)
    return f.numerator, f.denominator


# ---------------------------------------------------------------------------
# Seconds → FCPXML rational string
# ---------------------------------------------------------------------------

def seconds_to_fcpxml(seconds: float, fps: float) -> str:
    """
    Convert *seconds* to an FCPXML rational time string.

    The time is first snapped to the nearest whole frame, then expressed as a
    reduced fraction of seconds so that no float drift reaches the XML.
    Example: 10.0s @23.976fps → 240 frames → 240240/24000 → "1001/100s"

    Args:
        seconds: Time in seconds (float).
        fps:     Project frame rate.

    Returns:
        FCPXML time string, e.g. "1001/100s"; "0s" for any time that rounds
        to frame zero.
    """
    if seconds == 0.0:
        return "0s"

    num, den = _fps_to_rational(fps)  # frames per second = num/den
    # seconds × (num/den) = frames (float); round to nearest frame
    frames = round(seconds * num / den)
    if frames == 0:
        # Sub-frame times must serialise exactly like an explicit zero.
        return "0s"

    # frames ÷ (num/den) = frames × den/num; Fraction reduces it for us.
    total = Fraction(frames * den, num)
    return f"{total.numerator}/{total.denominator}s"


def seconds_to_frame_count(seconds: float, fps: float) -> int:
    """
    Convert seconds to an integer frame count.

    Uses the same exact rational rate as seconds_to_fcpxml() so that EDL and
    FCPXML exports always agree on the frame a given time lands on
    (multiplying by the nominal float, e.g. 23.976, drifts on long sources).
    """
    num, den = _fps_to_rational(fps)
    return round(seconds * num / den)


# ---------------------------------------------------------------------------
# Seconds → SMPTE timecode (for EDL)
# ---------------------------------------------------------------------------

def seconds_to_smpte(seconds: float, fps: float, drop_frame: bool = False) -> str:
    """
    Convert *seconds* to SMPTE timecode string "HH:MM:SS:FF".

    Drop-frame timecode (;) is not implemented — always returns NDF (:).

    Args:
        seconds:    Time in float seconds (expected to be non-negative).
        fps:        Frame rate (23.976, 24, 25, etc.).
        drop_frame: Ignored; placeholder for future DF support.

    Returns:
        "HH:MM:SS:FF" string.
    """
    total_frames = seconds_to_frame_count(seconds, fps)
    # Timecode counts in the nominal integer rate, e.g. 23.976 → 24.
    nominal_fps = max(1, round(fps))

    total_s, ff = divmod(total_frames, nominal_fps)
    total_m, ss = divmod(total_s, 60)
    hh, mm = divmod(total_m, 60)

    return f"{hh:02d}:{mm:02d}:{ss:02d}:{ff:02d}"


# ---------------------------------------------------------------------------
# FCPXML format ID helpers
# ---------------------------------------------------------------------------

def fcpxml_format_name(fps: float, width: int = 1920, height: int = 1080) -> str:
    """
    Return an FCPXML format name string for a given frame rate and resolution.

    The rate is matched with the same tolerance as _fps_to_rational(), so a
    probed rate such as 24000/1001 gets the same tag as the nominal 23.976.

    Example: fps=23.976, 1080p → "FFVideoFormat1080p2398"

    Args:
        fps:    Frame rate.
        width:  Accepted for call-site symmetry; not part of the name.
        height: Vertical resolution used for the "<height>p" part.
    """
    res = f"{height}p"
    ref_fps = _match_common_fps(fps)
    if ref_fps is not None:
        fps_tag = _FPS_FORMAT_TAG[ref_fps]
    else:
        # round(), not int(): 8.2 * 100 is 819.999… in binary floating point.
        fps_tag = str(round(fps * 100))
    return f"FFVideoFormat{res}{fps_tag}"


def fcpxml_frame_duration(fps: float) -> str:
    """
    Return FCPXML frameDuration attribute for a given fps.

    frame duration = 1 frame = 1/fps seconds = den/num seconds
    Example: 23.976fps → num=24000, den=1001 → frame duration = 1001/24000s
    """
    num, den = _fps_to_rational(fps)  # fps = num/den (e.g. 24000/1001)
    # frame duration = den/num seconds
    g = math.gcd(den, num)
    return f"{den // g}/{num // g}s"
+Respond ONLY with a valid JSON array, one object per beat, with keys: + "beat_id" (int) and "beat_type" (one of the strings above). +Do NOT include any explanation or markdown fences.""" + +_USER_TEMPLATE = """Classify the following {n} trailer beats: + +{beats_json}""" + + +def _build_beats_payload(beats: Sequence[TrailerBeat]) -> str: + payload = [] + for b in beats: + dialogue_text = " / ".join(line.text for line in b.dialogue) or "(no dialogue)" + payload.append({ + "beat_id": b.beat_id, + "duration": round(b.duration_s, 2), + "dialogue": dialogue_text, + }) + return json.dumps(payload, ensure_ascii=False, indent=2) + + +# --------------------------------------------------------------------------- +# OpenRouter / OpenAI-compatible HTTP client +# --------------------------------------------------------------------------- + +def _call_llm(prompt_user: str, cfg: AppConfig) -> str: + """ + Send a chat completion request to the configured LLM provider. + + Supports: openrouter, openai, ollama (all use the OpenAI-compatible API). + + Returns: + The raw text content of the first assistant message. + + Raises: + RuntimeError: On HTTP errors or missing API key. + """ + import urllib.request + import urllib.error + + llm = cfg.llm + + if llm.provider in ("openrouter", "openai") and not llm.api_key: + raise RuntimeError( + f"LLM provider is '{llm.provider}' but no API key found. " + "Set OPENROUTER_API_KEY (or OPENAI_API_KEY) in your .env file." 
def _parse_response(raw: str, beats: Sequence[TrailerBeat]) -> dict[int, BeatType]:
    """
    Parse the LLM JSON array response into a beat_id → BeatType mapping.

    Never raises on malformed model output: every beat starts as
    BeatType.UNKNOWN and is only overwritten by a well-formed entry that
    refers to one of *beats*. Malformed entries are skipped individually, so
    one bad object no longer discards the valid ones that follow it.

    Args:
        raw:   Text content returned by the model (None is tolerated).
        beats: The beats that were sent for classification.

    Returns:
        Mapping with exactly one key per beat in *beats*.
    """
    result: dict[int, BeatType] = {b.beat_id: BeatType.UNKNOWN for b in beats}

    # A null message content must degrade to "unparseable", not crash.
    raw_text = raw if isinstance(raw, str) else ""

    # Strip accidental markdown fences
    clean = raw_text.strip()
    if clean.startswith("```"):
        clean = "\n".join(clean.split("\n")[1:])
    if clean.endswith("```"):
        clean = clean[: clean.rfind("```")]
    clean = clean.strip()

    problem: object = None
    parsed: object = None
    try:
        parsed = json.loads(clean)
    except json.JSONDecodeError as exc:
        problem = exc
    else:
        if not isinstance(parsed, list):
            problem = "Expected JSON array at top level."

    if problem is not None:
        logger.warning(
            "LLM response parse error (%s) — all beats → UNKNOWN.\nRaw: %s",
            problem, str(raw)[:300],
        )
        return result

    for item in parsed:
        try:
            bid = int(item["beat_id"])
            name = str(item.get("beat_type", "UNKNOWN")).strip().upper()
        except (AttributeError, KeyError, TypeError, ValueError) as exc:
            # Non-object entries, missing/null/non-numeric ids: skip just this one.
            logger.warning("Skipping malformed LLM beat entry %r (%s).", item, exc)
            continue

        if bid not in result:
            # The model invented an id; it must not leak into the mapping.
            logger.warning("Ignoring LLM entry for unknown beat_id %d.", bid)
            continue

        result[bid] = _BEAT_TYPE_MAP.get(name, BeatType.UNKNOWN)

    return result
+ """ + if not beats: + return list(beats) + + logger.info( + "Classifying %d beats via %s / %s …", + len(beats), cfg.llm.provider, cfg.llm.model, + ) + + payload = _build_beats_payload(beats) + prompt = _USER_TEMPLATE.format(n=len(beats), beats_json=payload) + + try: + raw_response = _call_llm(prompt, cfg) + except Exception as exc: + logger.error("LLM classification failed: %s — keeping BeatType.UNKNOWN.", exc) + return list(beats) + + type_map = _parse_response(raw_response, beats) + + enriched = [replace(b, beat_type=type_map.get(b.beat_id, BeatType.UNKNOWN)) for b in beats] + + classified = sum(1 for b in enriched if b.beat_type != BeatType.UNKNOWN) + logger.info("Beat classification done: %d / %d classified.", classified, len(beats)) + return enriched diff --git a/src/llm/vision_cache.py b/src/llm/vision_cache.py new file mode 100644 index 0000000..0e9c7e1 --- /dev/null +++ b/src/llm/vision_cache.py @@ -0,0 +1,316 @@ +""" +Cached vision descriptions for ambiguous trailer/source matching. + +This module is deliberately conservative: it never writes a final match and it +does not replace CV. It describes a small number of 3-frame beat/scene samples, +caches those descriptions, and returns extra source in-point seeds for the CV +scanner to verify. +""" + +from __future__ import annotations + +import base64 +import json +import logging +import re +import urllib.error +import urllib.request +from dataclasses import asdict +from pathlib import Path +from typing import Sequence + +import cv2 + +from src.core.config import AppConfig +from src.core.models import Scene, TrailerBeat + +logger = logging.getLogger(__name__) + +_CACHE_VERSION = 1 +_STOPWORDS = { + "the", "and", "with", "from", "that", "this", "there", "their", "into", + "scene", "frame", "image", "shot", "video", "visible", "looks", "appears", + "eine", "einer", "einem", "einen", "und", "oder", "mit", "der", "die", "das", +} + +_SYSTEM_PROMPT = """You describe film shots for automatic matching. 
def _load_cache(cfg: AppConfig) -> dict:
    """
    Load the vision-description cache, falling back to an empty one.

    An empty cache is returned (so it gets rebuilt) whenever the file is
    missing, cannot be read or decoded, is not valid JSON, is not a JSON
    object, was written by a different cache version, or lacks an "items"
    object. A damaged cache file therefore never aborts vision seeding.

    Returns:
        Dict of the form {"version": _CACHE_VERSION, "items": {...}}.
    """
    path = _cache_path(cfg)
    empty_cache = {"version": _CACHE_VERSION, "items": {}}
    if not path.exists():
        return empty_cache

    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Vision cache is unreadable; rebuilding: %s", path)
        return empty_cache

    # Valid JSON is not necessarily an object (e.g. "[]" or "null").
    if not isinstance(data, dict):
        logger.warning("Vision cache is unreadable; rebuilding: %s", path)
        return empty_cache
    if data.get("version") != _CACHE_VERSION or not isinstance(data.get("items"), dict):
        return empty_cache
    return data
def _call_vision_model(label: str, image_urls: list[str], cfg: AppConfig) -> str:
    """
    Ask the configured vision model to describe one multi-frame sample.

    Sends a single OpenAI-compatible chat-completion request whose user
    message contains a short text instruction followed by the given images.

    Args:
        label:      Human-readable sample label embedded in the prompt.
        image_urls: Image URLs (data URLs in this module) to attach.
        cfg:        Application configuration; only ``cfg.vision`` is read.

    Returns:
        The stripped text content of the first choice.

    Raises:
        RuntimeError: If the API key is missing for a hosted provider, on an
            HTTP error status, or when the response carries no usable text.
            Without the last check a null ``content`` would be stringified to
            "None" and then cached by the caller as a real description.
    """
    vision = cfg.vision
    if vision.provider in ("openai", "openrouter") and not vision.api_key:
        raise RuntimeError(
            "Vision is enabled but no API key is available. Set VISION_API_KEY, "
            "OPENROUTER_API_KEY, OPENAI_API_KEY, or LLM_API_KEY."
        )

    content: list[dict] = [{
        "type": "text",
        "text": (
            f"Describe this 3-frame sample for matching. Label: {label}. "
            "The frames are start, middle, and end of the same beat/scene."
        ),
    }]
    content.extend({
        "type": "image_url",
        "image_url": {"url": url, "detail": "low"},
    } for url in image_urls)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {vision.api_key}",
    }
    if vision.provider == "openrouter":
        headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
        headers["X-Title"] = "AI Trailer Generator v2"

    body = json.dumps({
        "model": vision.model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": content},
        ],
        "temperature": vision.temperature,
        "max_tokens": vision.max_tokens,
    }).encode("utf-8")

    url = f"{vision.base_url.rstrip('/')}/chat/completions"
    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
    try:
        with urllib.request.urlopen(req, timeout=vision.timeout_seconds) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        body_text = exc.read().decode(errors="replace")
        raise RuntimeError(f"Vision HTTP {exc.code} from {url}:\n{body_text}") from exc

    try:
        message_text = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(f"Vision response from {url} has no message content.") from exc

    description = "" if message_text is None else str(message_text).strip()
    if not description:
        raise RuntimeError(f"Vision response from {url} has empty message content.")
    return description
def build_vision_seed_in_points(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    cfg: AppConfig,
) -> dict[int, list[tuple[float, float]]]:
    """
    Return extra in-point seeds from cached vision descriptions.

    The function is intentionally small-budget: for each beat it describes the
    beat once and only a few top scene-level candidates. Existing descriptions
    are read from cache and cost nothing.

    The cache is written back even when a vision call fails part-way through,
    so descriptions obtained earlier in the run are not lost; the exception
    itself still propagates to the caller.

    Args:
        beats:  Trailer beats to seed.
        scenes: Indexed source scenes used as candidates.
        cfg:    Application configuration (``vision`` and ``cv`` sections).

    Returns:
        Mapping beat_id → sorted list of (in_point, score) seeds. Beats for
        which no scene passed the similarity threshold are absent. Empty when
        vision is disabled or there are no beats or scenes.
    """
    if not cfg.vision.enabled:
        return {}
    if not beats or not scenes:
        return {}

    from src.cv.vibe_check import run_vibe_check

    cache = _load_cache(cfg)
    # One-element list so _describe_sample can decrement the shared budget.
    budget = [cfg.vision.max_new_descriptions_per_run]
    scenes_by_id = {scene.scene_id: scene for scene in scenes}
    seeds: dict[int, list[tuple[float, float]]] = {}

    try:
        for beat in beats:
            beat_desc = _describe_sample(
                kind="beat",
                item_id=beat.beat_id,
                label=f"trailer beat {beat.beat_id}",
                video_path=beat.trailer_path,
                start_s=beat.start_s,
                end_s=beat.end_s,
                cfg=cfg,
                cache=cache,
                budget=budget,
            )
            if not beat_desc:
                continue

            hits = run_vibe_check(
                beat,
                scenes,
                top_k=cfg.vision.scene_candidate_top_k,
                hist_method=cfg.cv.vibe_check.hist_compare_method,
                phash_max_distance=64,
            )

            ranked: list[tuple[float, Scene]] = []
            for hit in hits:
                scene = scenes_by_id.get(hit.scene_id)
                if scene is None:
                    continue
                scene_desc = _describe_sample(
                    kind="scene",
                    item_id=scene.scene_id,
                    label=f"source scene {scene.scene_id}",
                    video_path=scene.source_path,
                    start_s=scene.start_s,
                    end_s=scene.end_s,
                    cfg=cfg,
                    cache=cache,
                    budget=budget,
                )
                if not scene_desc:
                    continue
                score = _text_similarity(beat_desc, scene_desc)
                if score >= cfg.vision.similarity_threshold:
                    ranked.append((score, scene))

            ranked.sort(key=lambda item: item[0], reverse=True)
            points: list[tuple[float, float]] = []
            for score, scene in ranked[:cfg.vision.max_seed_scenes]:
                logger.info(
                    "Beat %d: vision seed scene=%d score=%.3f",
                    beat.beat_id,
                    scene.scene_id,
                    score,
                )
                # Scale the configured seed score by text similarity, then
                # clamp between the coarse-candidate threshold and 0.98.
                weighted_score = max(
                    cfg.cv.deep_scan.coarse_candidate_threshold,
                    min(0.98, cfg.vision.seed_score * (0.75 + min(1.0, score) * 0.25)),
                )
                points.extend(
                    (point, weighted_score)
                    for point in _scene_seed_points(scene, cfg.vision.seed_points_per_scene)
                )

            if points:
                # Collapse seeds that round to the same millisecond, keeping
                # the best score for each.
                merged: dict[float, float] = {}
                for point, weighted_score in points:
                    key = round(max(0.0, point), 3)
                    merged[key] = max(weighted_score, merged.get(key, 0.0))
                seeds[beat.beat_id] = sorted((point, score) for point, score in merged.items())
    finally:
        # Persist whatever was described, even if a later call raised.
        _save_cache(cfg, cache)

    return seeds
# list[TrailerBeat] from trailer analysis + results = run_matching(cfg, beats) +""" + +from __future__ import annotations + +import logging +from typing import Sequence + +from src.core.config import AppConfig +from src.core.models import MatchResult, Scene, TrailerBeat + +logger = logging.getLogger(__name__) +SeedPoint = float | tuple[float, float] + + +def _scene_seed_points(scene: Scene, max_points: int) -> list[float]: + if max_points <= 1 or scene.duration_s <= 0: + return [scene.start_s] + usable_end = max(scene.start_s, scene.end_s - 0.2) + if usable_end <= scene.start_s: + return [scene.start_s] + step = (usable_end - scene.start_s) / max(1, max_points - 1) + return [scene.start_s + step * idx for idx in range(max_points)] + + +def _build_scene_seed_in_points( + beats: Sequence[TrailerBeat], + scenes: Sequence[Scene], + cfg: AppConfig, +) -> dict[int, list[float]]: + from src.cv.vibe_check import run_vibe_check + + scenes_by_id = {scene.scene_id: scene for scene in scenes} + seeds: dict[int, list[float]] = {} + for beat in beats: + hits = run_vibe_check( + beat, + scenes, + top_k=cfg.cv.deep_scan.scene_seed_top_k, + hist_method=cfg.cv.vibe_check.hist_compare_method, + phash_max_distance=64, + ) + points: list[float] = [] + for hit in hits: + scene = scenes_by_id.get(hit.scene_id) + if scene is None: + continue + points.extend(_scene_seed_points(scene, cfg.cv.deep_scan.scene_seed_points_per_scene)) + if points: + seeds[beat.beat_id] = sorted({round(max(0.0, p), 3) for p in points}) + logger.info( + "Beat %d: added %d scene-level seed candidates from %d source scenes.", + beat.beat_id, + len(seeds[beat.beat_id]), + len(hits), + ) + return seeds + + +def _merge_seed_in_points( + *seed_maps: dict[int, Sequence[SeedPoint]] | None, +) -> dict[int, list[SeedPoint]]: + merged: dict[int, dict[float, float | None]] = {} + for seed_map in seed_maps: + if not seed_map: + continue + for beat_id, points in seed_map.items(): + beat_points = merged.setdefault(beat_id, {}) 
+ for point in points: + if isinstance(point, tuple): + t_sec = round(max(0.0, float(point[0])), 3) + score = float(point[1]) + else: + t_sec = round(max(0.0, float(point)), 3) + score = None + old_score = beat_points.get(t_sec) + if old_score is None: + beat_points[t_sec] = score + elif score is not None: + beat_points[t_sec] = max(old_score, score) + + result: dict[int, list[SeedPoint]] = {} + for beat_id, points in merged.items(): + result[beat_id] = [ + (t_sec, score) if score is not None else t_sec + for t_sec, score in sorted(points.items()) + ] + return result + + +# --------------------------------------------------------------------------- +# Beat fingerprinting +# --------------------------------------------------------------------------- + +def fingerprint_beats( + beats: Sequence[TrailerBeat], + cfg: AppConfig, +) -> list[TrailerBeat]: + """ + Enrich every TrailerBeat with its visual fingerprint (histogram + pHash). + + Extracts the midpoint frame from the reference trailer and fingerprints it + using the same Text-Safe Crop parameters as the scene indexer. + + Args: + beats: TrailerBeat list (fingerprints will be None initially). + cfg: Application configuration. + + Returns: + New list of TrailerBeat objects with luma_hist, sat_hist, phash set. 
def run_matching(
    cfg: AppConfig,
    beats: Sequence[TrailerBeat],
    force_reindex: bool = False,
    seed_in_points: dict[int, Sequence[SeedPoint]] | None = None,
) -> list[MatchResult]:
    """
    Execute the full CV matching pipeline.

    Steps: build the scene index, fingerprint the beats, collect in-point
    seeds (caller-supplied, scene-level, and — when enabled — vision-based),
    then hand everything to the global scan.

    Args:
        cfg:            Application configuration (loaded from config.toml).
        beats:          All trailer beats to source (must have trailer_path set).
        force_reindex:  If True, ignore the scene cache and re-run PySceneDetect.
        seed_in_points: Optional extra seeds per beat_id. Each seed is either a
                        bare time or a (time, score) tuple; they are merged
                        with the seeds computed here.

    Returns:
        List of MatchResult as produced by the global scan, one per matched
        beat (unmatched beats are omitted).
    """
    from src.cv.scene_indexer import build_scene_index

    logger.info("=" * 60)
    logger.info("AI Trailer Generator v2 — CV Matching Pipeline")
    logger.info("Source : %s", cfg.paths.source_movie.name)
    logger.info("Trailer: %s", cfg.paths.reference_trailer.name)
    logger.info("Beats  : %d", len(beats))
    logger.info("=" * 60)

    # ------------------------------------------------------------------
    # Phase 0: Scene index
    # ------------------------------------------------------------------
    logger.info("[Phase 0] Building scene index …")
    scenes: list[Scene] = build_scene_index(cfg, force_reindex=force_reindex)
    logger.info("[Phase 0] %d scenes indexed.", len(scenes))

    # ------------------------------------------------------------------
    # Phase 0b: Fingerprint the beats
    # ------------------------------------------------------------------
    logger.info("[Phase 0b] Fingerprinting %d trailer beats …", len(beats))
    beats = fingerprint_beats(beats, cfg)

    # ------------------------------------------------------------------
    # Phase 1 & 2: Global Scan. The scene index and vibe check are only used
    # to produce in-point seeds for the scan, not to filter candidates.
    # ------------------------------------------------------------------
    logger.info("[Phase 1 & 2] Running FFmpeg Global Scan for %d beats ...", len(beats))
    from src.cv.global_scan import run_global_scan

    scene_seed_in_points = _build_scene_seed_in_points(beats, scenes, cfg)
    vision_seed_in_points = {}
    if cfg.vision.enabled:
        # Vision seeding is optional: any failure degrades to CV-only seeds.
        try:
            from src.llm.vision_cache import build_vision_seed_in_points

            vision_seed_in_points = build_vision_seed_in_points(beats, scenes, cfg)
        except Exception as exc:
            logger.error("Vision seeding failed: %s — continuing with CV-only seeds.", exc)
    results = run_global_scan(
        beats,
        cfg,
        scenes=scenes,
        seed_in_points=_merge_seed_in_points(seed_in_points, scene_seed_in_points, vision_seed_in_points),
    )

    logger.info("[Phase 1 & 2] Done. %d / %d beats matched.", len(results), len(beats))
    logger.info("=" * 60)

    return results
frame_rate=cfg.export.edl_frame_rate, + clips=tuple(clips), + ) + + logger.info( + "Timeline built: %d clips, total duration %.2fs", + timeline.clip_count, timeline.total_duration_s, + ) + return timeline diff --git a/src/pipeline/reporter.py b/src/pipeline/reporter.py new file mode 100644 index 0000000..a84610d --- /dev/null +++ b/src/pipeline/reporter.py @@ -0,0 +1,427 @@ +""" +src/pipeline/reporter.py — Visual Match Report Generator + +Generates an HTML file containing side-by-side video clips of: + Left: The original beat from the reference trailer + Right: The matched scene from the source movie + +This allows instant visual verification of the CV pipeline's results. +""" + +from __future__ import annotations + +import logging +import subprocess +from pathlib import Path + +from src.core.config import AppConfig + +logger = logging.getLogger(__name__) + + +def _extract_clip(video_path: Path, start_s: float, duration_s: float, out_path: Path) -> None: + """Use ffmpeg to extract a silent, low-res preview clip.""" + out_path.parent.mkdir(parents=True, exist_ok=True) + + # Fast input seek close to the target, then accurate output seek for + # frame-faithful preview clips. A plain "-ss before -i" can land on a + # nearby keyframe and make the report look several frames out of sync. 
+ preroll_s = 2.0 if start_s >= 2.0 else 0.0 + input_seek_s = max(0.0, start_s - preroll_s) + accurate_seek_s = start_s - input_seek_s + + cmd = [ + "ffmpeg", "-y", "-loglevel", "error", + "-ss", str(input_seek_s), + "-i", str(video_path), + "-ss", str(accurate_seek_s), + "-t", str(duration_s), + "-map", "0:v:0", + "-c:v", "libx264", + "-preset", "ultrafast", + "-crf", "28", + "-vf", "scale=640:-2", # scale down for lightweight report + "-an", # no audio + "-movflags", "+faststart", + str(out_path) + ] + + result = subprocess.run(cmd, capture_output=True) + if result.returncode != 0: + logger.error( + "ffmpeg clip extraction failed for %s:\n%s", + out_path.name, result.stderr.decode(errors="replace") + ) + + +def _extract_clip_with_black_tail( + video_path: Path, + start_s: float, + source_duration_s: float, + total_duration_s: float, + out_path: Path, +) -> None: + """Extract a source preview and append black frames for trailer-only tails.""" + tail_s = max(0.0, total_duration_s - source_duration_s) + if tail_s <= 0.02: + _extract_clip(video_path, start_s, source_duration_s, out_path) + return + + out_path.parent.mkdir(parents=True, exist_ok=True) + source_tmp = out_path.with_name(f"{out_path.stem}_source_tmp.mp4") + tail_tmp = out_path.with_name(f"{out_path.stem}_tail_tmp.mp4") + preroll_s = 2.0 if start_s >= 2.0 else 0.0 + input_seek_s = max(0.0, start_s - preroll_s) + accurate_seek_s = start_s - input_seek_s + + # First render the matched source portion with the same accurate seek path + # as _extract_clip(). Using trim=start=... after an input seek is brittle + # because FFmpeg may preserve non-zero packet timestamps around keyframes. 
def _extract_segmented_clip(
    video_path: Path,
    segments: list,
    total_duration_s: float,
    out_path: Path,
) -> None:
    """Render a beat-length source preview from multiple matched source islands.

    Segments are processed in ascending ``trailer_offset_s`` order. Gaps
    before, between and after them are filled with black 640x360 / 25 fps
    clips, each segment is cut from *video_path* at its ``in_point_s`` for
    ``duration_s`` seconds, and all parts are concatenated into *out_path*.
    Parts of 0.02 s or less are skipped.

    Args:
        video_path: Video file the segments are cut from.
        segments: Objects exposing ``trailer_offset_s``, ``in_point_s`` and
            ``duration_s``. An empty list delegates to
            ``_extract_clip_with_black_tail``.
        total_duration_s: Desired length of the finished preview in seconds.
        out_path: Destination file; an existing file is overwritten.

    ffmpeg failures are logged rather than raised. A source segment that
    cannot be rendered is replaced by black of the same length so later
    segments keep their position. Every intermediate part file is removed
    on all exit paths.
    """
    if not segments:
        _extract_clip_with_black_tail(video_path, 0.0, 0.0, total_duration_s, out_path)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)
    ffmpeg = ["ffmpeg", "-y", "-loglevel", "error"]
    encode = [
        "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
        "-an", "-movflags", "+faststart",
    ]
    parts: list[Path] = []    # successfully rendered parts, in playback order
    scratch: list[Path] = []  # every temp file attempted, for cleanup

    def render_part(kind: str, args: list[str], failure_msg: str) -> bool:
        """Run one ffmpeg render into a temp part; return True on success."""
        tmp = out_path.with_name(f"{out_path.stem}_part_{len(parts):03d}_{kind}.mp4")
        scratch.append(tmp)
        result = subprocess.run([*ffmpeg, *args, *encode, str(tmp)], capture_output=True)
        if result.returncode == 0 and tmp.exists():
            parts.append(tmp)
            return True
        logger.error(failure_msg, result.stderr.decode(errors="replace"))
        return False

    def add_black(duration_s: float) -> None:
        if duration_s <= 0.02:
            return
        render_part(
            "black",
            # Fixed-point: scientific notation is not valid ffmpeg time syntax.
            ["-f", "lavfi", "-i", f"color=c=black:s=640x360:r=25:d={duration_s:.6f}"],
            "ffmpeg black segment render failed:\n%s",
        )

    def add_source(start_s: float, duration_s: float) -> bool:
        """Render one source island; False only when the render failed."""
        if duration_s <= 0.02:
            return True
        # Hybrid seek: fast input seek to at most 2 s before the target,
        # accurate output seek for the remainder, for every start time.
        start_s = max(0.0, start_s)
        input_seek_s = max(0.0, start_s - 2.0)
        accurate_seek_s = start_s - input_seek_s
        return render_part(
            "src",
            [
                "-ss", f"{input_seek_s:.6f}",
                "-i", str(video_path),
                "-ss", f"{accurate_seek_s:.6f}",
                "-t", f"{duration_s:.6f}",
                "-map", "0:v:0",
                "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
            ],
            "ffmpeg source segment render failed:\n%s",
        )

    try:
        cursor = 0.0  # position reached so far on the trailer timeline
        for segment in sorted(segments, key=lambda s: s.trailer_offset_s):
            offset_s = max(0.0, float(segment.trailer_offset_s))
            duration_s = max(0.0, float(segment.duration_s))
            add_black(offset_s - cursor)
            if not add_source(float(segment.in_point_s), duration_s):
                # Keep the timeline length intact so the following segments
                # still line up with the reference clip.
                add_black(duration_s)
            cursor = max(cursor, offset_s + duration_s)

        add_black(total_duration_s - cursor)

        if not parts:
            # A concat with n=0 inputs is not a valid filter graph.
            logger.error("No preview parts could be rendered for %s", out_path.name)
            return

        if len(parts) == 1:
            parts[0].replace(out_path)
            return

        inputs: list[str] = []
        labels: list[str] = []
        for idx, tmp in enumerate(parts):
            inputs.extend(["-i", str(tmp)])
            labels.append(f"[{idx}:v]")
        filter_complex = "".join(labels) + f"concat=n={len(parts)}:v=1:a=0[v]"
        cmd = [
            *ffmpeg,
            *inputs,
            "-filter_complex", filter_complex,
            "-map", "[v]",
            *encode,
            str(out_path),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            logger.error(
                "ffmpeg segmented preview concat failed:\n%s",
                result.stderr.decode(errors="replace"),
            )
    finally:
        for tmp in scratch:
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass
def _build_frame_locked_compare(ref_path: Path, src_path: Path, out_path: Path) -> None:
    """Stack the reference and source previews side by side in one video.

    Both inputs go through the same filter chain (25 fps, fitted into
    640x360 with padding, square pixels, timestamps reset to zero) and are
    then joined with ``hstack``: reference on the left, source on the right.
    The result is encoded without audio to *out_path*. A failing ffmpeg run
    is logged, not raised.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # One shared chain so both halves end up with identical geometry/timing.
    per_input_chain = ",".join(
        (
            "fps=25",
            "scale=640:360:force_original_aspect_ratio=decrease",
            "pad=640:360:(ow-iw)/2:(oh-ih)/2",
            "setsar=1",
            "setpts=PTS-STARTPTS",
        )
    )
    graph = ";".join(
        (
            f"[0:v]{per_input_chain}[ref]",
            f"[1:v]{per_input_chain}[src]",
            "[ref][src]hstack=inputs=2[v]",
        )
    )

    command = ["ffmpeg", "-y", "-loglevel", "error"]
    command += ["-i", str(ref_path), "-i", str(src_path)]
    command += ["-filter_complex", graph, "-map", "[v]"]
    command += ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28"]
    command += ["-an", "-movflags", "+faststart", str(out_path)]

    completed = subprocess.run(command, capture_output=True)
    if completed.returncode == 0:
        return
    logger.error(
        "ffmpeg compare render failed for %s:\n%s",
        out_path.name,
        completed.stderr.decode(errors="replace"),
    )
+ """ + report_dir = cfg.paths.output_dir / "report" + report_dir.mkdir(parents=True, exist_ok=True) + + html_path = report_dir / "match_report.html" + results_by_beat = {r.beat_id: r for r in results} + + logger.info("Generating report clips in %s (this might take a moment) ...", report_dir) + + html = [ + "", + "AI Trailer Match Report", + "", + f"

AI Trailer Generator — Match Report

", + f"
Total Beats: {len(beats)} | Matched: {len(results)}
", + "" + ] + + for beat in beats: + res = results_by_beat.get(beat.beat_id) + + # Extract Reference Clip + ref_mp4 = report_dir / f"beat_{beat.beat_id:03d}_ref.mp4" + _extract_clip(beat.trailer_path, beat.start_s, beat.duration_s, ref_mp4) + + html.append("
") + + # Info Panel + html.append("
") + html.append(f"

Beat {beat.beat_id:03d}

") + html.append(f"

Type: {beat.beat_type.name}

") + html.append(f"

Trailer: {beat.start_s:.2f}s → {beat.end_s:.2f}s

") + + if res: + segments = list(getattr(res, "segments", ()) or []) + source_duration = sum(max(0.0, float(s.duration_s)) for s in segments) + if not segments: + source_duration = max(0.0, res.out_point_s - res.in_point_s) + preview_duration = min(beat.duration_s, source_duration) if source_duration > 0 else beat.duration_s + last_segment_end = max( + (float(s.trailer_offset_s) + float(s.duration_s) for s in segments), + default=preview_duration, + ) + trailer_tail_s = max(0.0, beat.duration_s - last_segment_end) + if getattr(res, "is_confirmed", True): + html.append("

MATCHED

") + else: + html.append("

PROVISIONAL MATCH

") + html.append(f"

Scene ID: {res.scene_id}

") + html.append(f"

Movie In: {res.in_point_s:.2f}s

") + html.append(f"

Source Dur: {source_duration:.2f}s

") + if len(segments) > 1: + html.append(f"

Segments: {len(segments)} matched visual islands

") + if trailer_tail_s > 0: + html.append(f"

Unmatched Tail: {trailer_tail_s:.2f}s placeholder

") + html.append(f"

Score: {res.match_score:.3f}

") + if trailer_tail_s > 0: + html.append("

Some trailer frames are still unmatched; report fills only those gaps with placeholder black.

") + + # Warn if score is low + if res.match_score < 0.80: + html.append("

⚠️ Score below 0.80. Verify visually.

") + + # Extract Source Clip + src_mp4 = report_dir / f"beat_{beat.beat_id:03d}_src.mp4" + compare_mp4 = report_dir / f"beat_{beat.beat_id:03d}_compare.mp4" + if segments: + _extract_segmented_clip(res.source_path, segments, beat.duration_s, src_mp4) + else: + _extract_clip_with_black_tail( + res.source_path, + res.in_point_s, + preview_duration, + beat.duration_s, + src_mp4, + ) + _build_frame_locked_compare(ref_mp4, src_mp4, compare_mp4) + else: + html.append("

NO MATCH

") + src_mp4 = None + compare_mp4 = None + + html.append(f"
python cli.py rematch --beat {beat.beat_id}
") + html.append("
") # /info + + # Video Panel + html.append("
") + if compare_mp4: + html.append(f"

Frame-Locked Compare

") + else: + html.append("
") + html.append(f"

Reference Trailer

") + html.append("

Matched Source

No Match
") + html.append("
") # /video-container + html.append("
") # /videos + html.append("
") # /beat-row + + html.append("") + + html_path.write_text("\n".join(html), encoding="utf-8") + return html_path diff --git a/src/pipeline/trailer_analyzer.py b/src/pipeline/trailer_analyzer.py new file mode 100644 index 0000000..a2e16b9 --- /dev/null +++ b/src/pipeline/trailer_analyzer.py @@ -0,0 +1,175 @@ +""" +src/pipeline/trailer_analyzer.py — Reference trailer → list[TrailerBeat] + +Responsibility: + 1. Run PySceneDetect on the REFERENCE TRAILER (not the source movie) + to detect cut boundaries → raw beat intervals + 2. Fingerprint the midpoint frame of each beat (for Vibe Check) + 3. Transcribe dialogue per beat via Whisper (optional, injected) + 4. Optionally classify BeatType via the LLM dramaturg (injected) + +Returns: list[TrailerBeat] ready to feed into run_matching(). +""" + +from __future__ import annotations + +import logging +from dataclasses import replace +from pathlib import Path +from typing import Callable, Sequence + +from src.core.config import AppConfig +from src.core.models import BeatType, DialogueLine, TrailerBeat +from src.cv.fingerprinting import fingerprint_frame +from src.cv.frame_extractor import grab_midpoint_frame, open_video + +logger = logging.getLogger(__name__) + +# Injection type aliases — keeps this module free of hard audio/LLM imports +TranscribeCallback = Callable[[Path, float, float, float], list[DialogueLine]] +ClassifyCallback = Callable[[list[TrailerBeat]], list[TrailerBeat]] + + +# --------------------------------------------------------------------------- +# Step 1: Scene detection on the reference trailer +# --------------------------------------------------------------------------- + +def _detect_trailer_beats(cfg: AppConfig) -> list[tuple[float, float, int, int]]: + """ + Run PySceneDetect on the reference trailer. + + Returns list of (start_s, end_s, start_frame, end_frame). + Uses the same ContentDetector thresholds as the source movie. 
def _fingerprint_beats(
    raw_beats: list[tuple[float, float, int, int]],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """Build one TrailerBeat per raw interval, fingerprinted where possible.

    The reference trailer is opened once. For every interval the midpoint
    frame is grabbed and passed to ``fingerprint_frame``; its three results
    are stored as ``luma_hist``, ``sat_hist`` and ``phash``. When no frame
    can be grabbed, a warning is logged and the beat is created without
    those three fields. Beat ids follow the order of *raw_beats*.
    """
    vibe_cfg = cfg.cv.vibe_check
    source = cfg.paths.reference_trailer
    fingerprinted: list[TrailerBeat] = []

    with open_video(source) as capture:
        for beat_id, (t_in, t_out, f_in, f_out) in enumerate(raw_beats):
            # Fields shared by the fingerprinted and the bare variant.
            common = {
                "beat_id": beat_id,
                "trailer_path": source,
                "start_s": t_in,
                "end_s": t_out,
                "start_frame": f_in,
                "end_frame": f_out,
            }
            still = grab_midpoint_frame(capture, t_in, t_out)

            if still is not None:
                luma, sat, phash = fingerprint_frame(still, vibe_cfg)
                fingerprinted.append(
                    TrailerBeat(**common, luma_hist=luma, sat_hist=sat, phash=phash)
                )
            else:
                logger.warning("Beat %d: midpoint frame decode failed.", beat_id)
                fingerprinted.append(TrailerBeat(**common))

    return fingerprinted
def analyze_reference_trailer(
    cfg: AppConfig,
    transcribe_callback: TranscribeCallback | None = None,
    classify_callback: ClassifyCallback | None = None,
) -> list[TrailerBeat]:
    """
    Run the complete reference-trailer analysis.

    Args:
        cfg:                 Application configuration.
        transcribe_callback: Optional fn(path, start_s, end_s, offset_s)
                             → list[DialogueLine]. Passed in by the caller
                             so this module needs no faster-whisper import.
        classify_callback:   Optional fn(beats) → beats with BeatType set.
                             Passed in so this module stays LLM-free.

    Returns:
        The detected beats with fingerprints; dialogue and BeatType are
        filled in only when the matching callback is given and succeeds.
        A failing callback is logged and the affected beat(s) are kept
        unchanged.
    """
    # Steps 1 + 2 — cut detection, then fingerprinting
    beats = _fingerprint_beats(_detect_trailer_beats(cfg), cfg)

    # Step 3 — dialogue (optional)
    if transcribe_callback is not None:

        def with_dialogue(beat: TrailerBeat) -> TrailerBeat:
            try:
                lines = transcribe_callback(
                    beat.trailer_path,
                    beat.start_s,
                    beat.end_s,
                    beat.start_s,  # time_offset so timestamps are absolute
                )
                return replace(beat, dialogue=tuple(lines))
            except Exception as exc:
                logger.warning("Beat %d transcription failed: %s", beat.beat_id, exc)
                return beat

        beats = [with_dialogue(beat) for beat in beats]

    # Step 4 — LLM dramaturgy (optional)
    if classify_callback is not None:
        try:
            beats = classify_callback(beats)
        except Exception as exc:
            logger.warning("Beat classification failed: %s — keeping UNKNOWN.", exc)

    spoken = [b for b in beats if b.dialogue]
    typed = [b for b in beats if b.beat_type != BeatType.UNKNOWN]
    logger.info(
        "Trailer analysis complete: %d beats, %d with dialogue, %d classified.",
        len(beats),
        len(spoken),
        len(typed),
    )
    return beats
class TestConfigLoader:
    """Smoke checks on the values load_config() returns for config.toml."""

    def test_loads_without_error(self) -> None:
        assert isinstance(load_config(CONFIG_PATH), AppConfig)

    def test_project_meta(self) -> None:
        loaded = load_config(CONFIG_PATH)
        allowed_levels = ("DEBUG", "INFO", "WARNING", "ERROR")
        assert loaded.version == "2.0.0"
        assert loaded.log_level in allowed_levels

    def test_cv_thresholds_in_range(self) -> None:
        deep_scan = load_config(CONFIG_PATH).cv.deep_scan
        assert 0.0 < deep_scan.match_threshold < 1.0
        assert deep_scan.coarse_step_seconds > 0

    def test_vibe_check_crop_fractions(self) -> None:
        vibe = load_config(CONFIG_PATH).cv.vibe_check
        top, bottom = vibe.crop_top_fraction, vibe.crop_bottom_fraction
        assert 0.0 < top < 1.0
        assert 0.0 < bottom < 1.0
        # Both crops together must leave part of the frame.
        assert top + bottom < 1.0

    def test_missing_config_raises(self, tmp_path: Path) -> None:
        missing = tmp_path / "nonexistent.toml"
        with pytest.raises(FileNotFoundError):
            load_config(missing)

    def test_paths_are_path_objects(self) -> None:
        paths = load_config(CONFIG_PATH).paths
        assert isinstance(paths.source_movie, Path)
        assert isinstance(paths.reference_trailer, Path)
class TestSceneModel:
    """Construction, derived properties and immutability of Scene."""

    def test_duration(self) -> None:
        s = Scene(
            scene_id=0,
            source_path=Path("dummy.mp4"),
            start_s=10.0,
            end_s=25.5,
            start_frame=240,
            end_frame=612,
        )
        assert s.duration_s == pytest.approx(15.5)
        assert s.midpoint_s == pytest.approx(17.75)

    def test_immutable(self) -> None:
        s = Scene(
            scene_id=0, source_path=Path("x.mp4"),
            start_s=0.0, end_s=1.0,
            start_frame=0, end_frame=24,
        )
        # dataclasses.FrozenInstanceError subclasses AttributeError, so this
        # matches the frozen-model failure without accepting any unrelated
        # exception the way pytest.raises(Exception) would.
        with pytest.raises(AttributeError):
            s.scene_id = 99  # type: ignore[misc]
def _make_synthetic_video(path: Path, color_bgr: tuple[int, int, int] = (0, 128, 255)) -> Path:
    """Write a single-colour video of FPS * SECS frames to *path*.

    Args:
        path: Destination file for the ``mp4v``-encoded video.
        color_bgr: Fill colour of every frame, in BGR order.

    Returns:
        *path*, for convenient use in fixtures.

    Raises:
        RuntimeError: If OpenCV cannot open a writer for *path*. Without this
            check every ``write`` would be a silent no-op and the dependent
            tests would fail later with a misleading decode error.
    """
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(str(path), fourcc, float(FPS), (WIDTH, HEIGHT))
    try:
        if not writer.isOpened():
            raise RuntimeError(f"cv2.VideoWriter could not open {path} (mp4v)")
        frame = np.full((HEIGHT, WIDTH, 3), color_bgr, dtype=np.uint8)
        for _ in range(FPS * SECS):
            writer.write(frame)
    finally:
        # Release on every path so the file handle is closed and flushed.
        writer.release()
    return path
class TestGetVideoInfo:
    """get_video_info() must report the synthetic clip's known properties."""

    def test_returns_correct_fps(self, synthetic_video: Path) -> None:
        reported_fps = get_video_info(synthetic_video)["fps"]
        assert reported_fps == pytest.approx(FPS, rel=0.05)

    def test_duration_approx(self, synthetic_video: Path) -> None:
        reported_duration = get_video_info(synthetic_video)["duration_s"]
        assert reported_duration == pytest.approx(SECS, rel=0.1)

    def test_resolution(self, synthetic_video: Path) -> None:
        meta = get_video_info(synthetic_video)
        assert meta["width"] == WIDTH
        assert meta["height"] == HEIGHT
class TestIterFramesStepped:
    """Stepping behaviour of iter_frames_stepped() on the synthetic clip."""

    def test_yields_correct_count(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as capture:
            yielded = list(iter_frames_stepped(capture, 0.0, 1.0, 0.5))
        # Steps at 0.0, 0.5 and 1.0 → three frames
        assert len(yielded) == 3

    def test_timestamps_increasing(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as capture:
            yielded = list(iter_frames_stepped(capture, 0.0, 2.0, 0.5))
        stamps = [stamp for stamp, _frame in yielded]
        assert stamps == sorted(stamps)

    def test_invalid_step_raises(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as capture:
            with pytest.raises(ValueError, match="step_s"):
                # list() forces the generator to actually run.
                list(iter_frames_stepped(capture, 0.0, 1.0, 0.0))
class TestSecondsToFcpxml:
    """Rational-time strings produced by seconds_to_fcpxml()."""

    def test_zero(self) -> None:
        assert seconds_to_fcpxml(0.0, 24.0) == "0s"

    def test_one_second_at_24fps(self) -> None:
        # 24 frames at 24 fps reduce to exactly one second
        assert seconds_to_fcpxml(1.0, 24.0) == "1/1s"

    def test_one_second_at_23976(self) -> None:
        # Only the shape is checked here: a fraction with the "s" suffix
        rendered = seconds_to_fcpxml(1.0, 23.976)
        assert rendered.endswith("s")
        assert "/" in rendered

    def test_ten_seconds_at_25fps(self) -> None:
        # 250 frames at 25 fps reduce to ten seconds
        assert seconds_to_fcpxml(10.0, 25.0) == "10/1s"

    def test_rational_is_reduced(self) -> None:
        # Numerator and denominator must share no common factor (no 24/24s)
        from math import gcd

        numerator, _, denominator = seconds_to_fcpxml(1.0, 24.0).rstrip("s").partition("/")
        assert gcd(int(numerator), int(denominator)) == 1
def test_frame_duration_24fps(self) -> None: + assert fcpxml_frame_duration(24.0) == "1/24s" + + def test_frame_duration_23976(self) -> None: + fd = fcpxml_frame_duration(23.976) + # Should be "1001/24000s" + assert fd == "1001/24000s" + + def test_format_name_1080p_2398(self) -> None: + name = fcpxml_format_name(23.976, 1920, 1080) + assert "1080" in name + assert "2398" in name + + def test_frame_count_roundtrip(self) -> None: + fps = 25.0 + seconds = 10.0 + frames = seconds_to_frame_count(seconds, fps) + assert frames == 250 + + +# --------------------------------------------------------------------------- +# EDL writer (string output) +# --------------------------------------------------------------------------- + +class TestEdlWriter: + def _make_timeline(self) -> "src.core.models.EditTimeline": # type: ignore + from src.core.models import ( + BeatType, EditClip, EditTimeline, MatchResult, TrailerBeat, + ) + + beat = TrailerBeat( + beat_id=0, trailer_path=Path("trailer.mp4"), + start_s=0.0, end_s=5.0, start_frame=0, end_frame=120, + beat_type=BeatType.HOOK, + ) + match = MatchResult( + beat_id=0, scene_id=3, + source_path=Path("movie.mp4"), + in_point_s=30.0, out_point_s=35.0, + in_point_frame=720, match_score=0.88, + ) + clip = EditClip( + clip_index=0, beat=beat, match=match, + timeline_start_s=0.0, timeline_end_s=5.0, + ) + return EditTimeline( + title="TestTrailer", frame_rate=25.0, clips=(clip,) + ) + + def test_edl_contains_title(self, tmp_path: Path) -> None: + from src.core.config import load_config + from src.export.edl_writer import write_edl + + cfg = load_config() + tl = self._make_timeline() + out = write_edl(tl, cfg, output_path=tmp_path / "test.edl") + + text = out.read_text(encoding="utf-8") + assert "TITLE: TestTrailer" in text + + def test_edl_has_event_line(self, tmp_path: Path) -> None: + from src.core.config import load_config + from src.export.edl_writer import write_edl + + cfg = load_config() + tl = self._make_timeline() + out = 
write_edl(tl, cfg, output_path=tmp_path / "test.edl") + + text = out.read_text(encoding="utf-8") + assert "001" in text # event number + assert "AX" in text # reel name + + +# --------------------------------------------------------------------------- +# FCPXML writer (XML structure) +# --------------------------------------------------------------------------- + +class TestFcpxmlWriter: + def _make_timeline(self) -> "src.core.models.EditTimeline": # type: ignore + from src.core.models import ( + BeatType, EditClip, EditTimeline, MatchResult, TrailerBeat, + ) + + beat = TrailerBeat( + beat_id=0, trailer_path=Path("trailer.mp4"), + start_s=0.0, end_s=5.0, start_frame=0, end_frame=120, + beat_type=BeatType.HOOK, + ) + match = MatchResult( + beat_id=0, scene_id=3, + source_path=Path("B:/Proxy/movie.mp4"), + in_point_s=30.0, out_point_s=35.0, + in_point_frame=720, match_score=0.88, + ) + clip = EditClip( + clip_index=0, beat=beat, match=match, + timeline_start_s=0.0, timeline_end_s=5.0, + ) + return EditTimeline( + title="TestTrailer", frame_rate=25.0, clips=(clip,) + ) + + def test_fcpxml_is_valid_xml(self, tmp_path: Path) -> None: + from xml.etree import ElementTree as ET + from src.core.config import load_config + from src.export.fcpxml_writer import write_fcpxml + + cfg = load_config() + tl = self._make_timeline() + out = write_fcpxml(tl, cfg, output_path=tmp_path / "test.fcpxml") + + text = out.read_text(encoding="utf-8") + text_no_doctype = "\n".join( + line for line in text.splitlines() + if not line.strip().startswith(" None: + from xml.etree import ElementTree as ET + from src.core.config import load_config + from src.export.fcpxml_writer import write_fcpxml + + cfg = load_config() + tl = self._make_timeline() + out = write_fcpxml(tl, cfg, output_path=tmp_path / "test.fcpxml") + + text = out.read_text(encoding="utf-8") + text_no_doctype = "\n".join( + line for line in text.splitlines() + if not line.strip().startswith(" np.ndarray: + """256×256 solid blue BGR 
frame.""" + frame = np.zeros((256, 256, 3), dtype=np.uint8) + frame[:, :] = (255, 0, 0) # BGR blue + return frame + + +@pytest.fixture +def solid_red_frame() -> np.ndarray: + """256×256 solid red BGR frame.""" + frame = np.zeros((256, 256, 3), dtype=np.uint8) + frame[:, :] = (0, 0, 255) # BGR red + return frame + + +# --------------------------------------------------------------------------- +# text_safe_crop +# --------------------------------------------------------------------------- + +class TestTextSafeCrop: + def test_removes_correct_rows(self, solid_blue_frame: np.ndarray) -> None: + cropped = text_safe_crop(solid_blue_frame, crop_top=0.15, crop_bottom=0.30) + h = solid_blue_frame.shape[0] # 256 + expected_h = int(h * (1.0 - 0.30)) - int(h * 0.15) + assert cropped.shape[0] == expected_h + + def test_zero_crop_returns_same_size(self, solid_blue_frame: np.ndarray) -> None: + cropped = text_safe_crop(solid_blue_frame, crop_top=0.0, crop_bottom=0.0) + assert cropped.shape == solid_blue_frame.shape + + def test_invalid_top_raises(self, solid_blue_frame: np.ndarray) -> None: + with pytest.raises(ValueError, match="crop_top"): + text_safe_crop(solid_blue_frame, crop_top=1.0, crop_bottom=0.0) + + def test_invalid_bottom_raises(self, solid_blue_frame: np.ndarray) -> None: + with pytest.raises(ValueError, match="crop_bottom"): + text_safe_crop(solid_blue_frame, crop_top=0.0, crop_bottom=-0.1) + + def test_overlapping_crops_raise(self, solid_blue_frame: np.ndarray) -> None: + with pytest.raises(ValueError, match="must be < 1.0"): + text_safe_crop(solid_blue_frame, crop_top=0.6, crop_bottom=0.5) + + +# --------------------------------------------------------------------------- +# Histograms +# --------------------------------------------------------------------------- + +class TestHistograms: + def test_output_shape(self, solid_blue_frame: np.ndarray) -> None: + luma, sat = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60) + assert luma.shape == (50,) + 
assert sat.shape == (60,) + + def test_normalised(self, solid_blue_frame: np.ndarray) -> None: + import numpy as np + luma, sat = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60) + # L2-normalised → norm ≈ 1.0 + assert np.linalg.norm(luma) == pytest.approx(1.0, abs=1e-5) + assert np.linalg.norm(sat) == pytest.approx(1.0, abs=1e-5) + + def test_same_frame_correl_is_one(self, solid_blue_frame: np.ndarray) -> None: + import cv2 + luma, _ = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60) + score = compare_histograms(luma, luma, method=cv2.HISTCMP_CORREL) + assert score == pytest.approx(1.0, abs=1e-5) + + def test_different_frames_correl_lower( + self, + solid_blue_frame: np.ndarray, + solid_red_frame: np.ndarray, + ) -> None: + import cv2 + luma_b, _ = extract_hs_histograms(solid_blue_frame, 50, 60) + luma_r, _ = extract_hs_histograms(solid_red_frame, 50, 60) + score = compare_histograms(luma_b, luma_r, method=cv2.HISTCMP_CORREL) + assert score < 1.0 + + +# --------------------------------------------------------------------------- +# Serialisation round-trip +# --------------------------------------------------------------------------- + +class TestSerialisation: + def test_round_trip(self, solid_blue_frame: np.ndarray) -> None: + luma, _ = extract_hs_histograms(solid_blue_frame, 50, 60) + restored = bytes_to_hist(hist_to_bytes(luma)) + np.testing.assert_array_almost_equal(luma, restored)