From 8e1bcf142fa042815b5b4ec412c00e3ebf89ff9d Mon Sep 17 00:00:00 2001 From: Melbar Date: Sat, 2 May 2026 09:07:41 +0200 Subject: [PATCH] Initial project import --- .env.example | 15 + .gitignore | 44 + README.md | 384 ++++++++ cli.py | 899 ++++++++++++++++++ config.toml | 198 ++++ pyproject.toml | 68 ++ requirements.txt | 37 + setup_venv.ps1 | 89 ++ src/__init__.py | 1 + src/audio/__init__.py | 1 + src/audio/transcriber.py | 182 ++++ src/core/__init__.py | 1 + src/core/config.py | 387 ++++++++ src/core/models.py | 287 ++++++ src/cv/__init__.py | 1 + src/cv/content_align.py | 240 +++++ src/cv/deep_scan.py | 253 +++++ src/cv/fingerprinting.py | 228 +++++ src/cv/frame_extractor.py | 172 ++++ src/cv/global_scan.py | 1509 ++++++++++++++++++++++++++++++ src/cv/scene_indexer.py | 229 +++++ src/cv/vibe_check.py | 190 ++++ src/export/__init__.py | 1 + src/export/edl_writer.py | 114 +++ src/export/fcpxml_writer.py | 222 +++++ src/export/timecode.py | 146 +++ src/llm/__init__.py | 1 + src/llm/dramaturg.py | 202 ++++ src/llm/vision_cache.py | 316 +++++++ src/pipeline/__init__.py | 3 + src/pipeline/matcher.py | 291 ++++++ src/pipeline/reporter.py | 427 +++++++++ src/pipeline/trailer_analyzer.py | 175 ++++ tests/__init__.py | 1 + tests/test_config.py | 144 +++ tests/test_deep_scan.py | 140 +++ tests/test_export.py | 218 +++++ tests/test_fingerprinting.py | 112 +++ 38 files changed, 7928 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 README.md create mode 100644 cli.py create mode 100644 config.toml create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 setup_venv.ps1 create mode 100644 src/__init__.py create mode 100644 src/audio/__init__.py create mode 100644 src/audio/transcriber.py create mode 100644 src/core/__init__.py create mode 100644 src/core/config.py create mode 100644 src/core/models.py create mode 100644 src/cv/__init__.py create mode 100644 src/cv/content_align.py create mode 100644 
src/cv/deep_scan.py create mode 100644 src/cv/fingerprinting.py create mode 100644 src/cv/frame_extractor.py create mode 100644 src/cv/global_scan.py create mode 100644 src/cv/scene_indexer.py create mode 100644 src/cv/vibe_check.py create mode 100644 src/export/__init__.py create mode 100644 src/export/edl_writer.py create mode 100644 src/export/fcpxml_writer.py create mode 100644 src/export/timecode.py create mode 100644 src/llm/__init__.py create mode 100644 src/llm/dramaturg.py create mode 100644 src/llm/vision_cache.py create mode 100644 src/pipeline/__init__.py create mode 100644 src/pipeline/matcher.py create mode 100644 src/pipeline/reporter.py create mode 100644 src/pipeline/trailer_analyzer.py create mode 100644 tests/__init__.py create mode 100644 tests/test_config.py create mode 100644 tests/test_deep_scan.py create mode 100644 tests/test_export.py create mode 100644 tests/test_fingerprinting.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..626846b --- /dev/null +++ b/.env.example @@ -0,0 +1,15 @@ +# ============================================================================= +# AI Trailer Generator v2 — Environment Variables +# ============================================================================= +# Copy this file to .env and fill in your actual keys. +# .env is listed in .gitignore and will NEVER be committed. 
+# ============================================================================= + +# OpenRouter API key (required when [llm] provider = "openrouter") +OPENROUTER_API_KEY=sk-or-v1-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +# OpenAI API key (required when [llm] provider = "openai") +# OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +# Universal fallback (used if provider-specific key is not set) +# LLM_API_KEY= diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d3e1e3e --- /dev/null +++ b/.gitignore @@ -0,0 +1,44 @@ +# --------------------------------------------------------------------------- +# AI Trailer Generator v2 — .gitignore +# --------------------------------------------------------------------------- + +# Python +__pycache__/ +*.py[cod] +*.pyo +*.pyd +*.egg-info/ +dist/ +build/ +*.whl +.venv/ +venv/ +.mypy_cache/ +.ruff_cache/ +.pytest_cache/ + +# Project-generated artefacts (potentially huge) +.cache/ +output/ +proxy/ +*.mp4 +*.mov +*.mxf +*.wav +*.mp3 +*.jpg +*.jpeg +*.png + +# IDE +.vscode/ +.idea/ +*.swp + +# OS +.DS_Store +Thumbs.db + +# Secrets / local overrides +.env +config.local.toml diff --git a/README.md b/README.md new file mode 100644 index 0000000..5323691 --- /dev/null +++ b/README.md @@ -0,0 +1,384 @@ +# AI Trailer Generator v2 + +**Frame-accurate trailer reconstruction via pure Computer Vision** + +> Gibt einen Reference Trailer und den dazugehörigen Quellfilm hinein — bekommt eine fertige FCPXML/EDL heraus, die den Trailer Frame-genau aus dem Quellfilm nachbaut. + +--- + +## Das Kernprinzip + +Standardmäßig kein LLM für visuelles Matching. Optional kann ein Vision-Layer +gecachte 3-Frame-Beschreibungen als zusätzliche Suchanker liefern; der finale +Match bleibt aber CV-verifiziert. 
+ +| Phase | Was passiert | Technologie | +|-------|-------------|-------------| +| **0 — Prep** | Reference Trailer analysieren & Beats extrahieren | PySceneDetect + OpenCV | +| **1 — Global Scan**| Gesamten Quellfilm via FFmpeg-Stream (2 FPS) gegen alle Beats scannen | FFmpeg Pipe + Luma-Histogramm | +| **1b — Optional Vision Seeds** | Unsichere Top-K Szenen mit 3-Frame-Beschreibungen cachen | OpenAI-kompatibles Vision-LLM | +| **2 — Refine** | Beste Treffer auf Frame-Ebene präzisieren | OpenCV `matchTemplate` | +| **3 — Dramaturgie** | Narrative BeatType-Klassifikation aus Dialog-Text | OpenRouter LLM | +| **4 — Export** | Timeline → FCPXML 1.10 oder CMX 3600 EDL | xml.etree + eigener Timecode-Layer | + +**Text-Safe Crop:** Obere 15% und untere 30% des Frames werden vor jedem Vergleich ausgeblendet, um Title Cards, Logos und Letterbox zu ignorieren. + +--- + +## Voraussetzungen + +- Python **3.11+** +- [ffmpeg](https://ffmpeg.org/download.html) im PATH (für Whisper Audio-Extraktion) +- CUDA-fähige GPU empfohlen (für faster-whisper; CPU funktioniert auch) + +--- + +## Setup + +### 1. Virtual Environment erstellen & aktivieren + +```powershell +# Im Projektordner +python -m venv .venv +.\.venv\Scripts\Activate.ps1 + +# Falls ExecutionPolicy blockiert: +# Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser +``` + +### 2. Abhängigkeiten installieren + +```powershell +pip install -r requirements.txt +``` + +### 3. API-Key konfigurieren + +```powershell +# .env aus dem Template kopieren +Copy-Item .env.example .env + +# Dann .env öffnen und den echten Key eintragen: +# OPENROUTER_API_KEY=sk-or-v1-... +``` + +### 4. 
Videodateien eintragen + +`config.toml` öffnen und die Pfade anpassen: + +```toml +[paths] +source_movie = "B:/Proxy/DeinFilm_FTR.mp4" +reference_trailer = "F:/Encodings/DeinFilm_Trailer.mp4" +``` + +--- + +## Verwendung + +```powershell +# Vollständige Pipeline (analyze → match → report → export) +python cli.py run + +# Ohne Whisper-Transkription (schneller) +python cli.py run --no-audio + +# Ohne LLM-Klassifikation +python cli.py run --no-audio --no-llm + +# Schrittweise +python cli.py analyze # Reference Trailer → Beats erkennen +python cli.py match # Globaler FFmpeg Scan (Szenen-unabhängig) +python cli.py report # HTML Report mit Video-Vergleich bauen +python cli.py export --format both # FCPXML + EDL ausgeben + +# Gezielt nur einen Beat bearbeiten (empfohlen für erste Iterationen) +python cli.py match --beat 5 +python cli.py match --beat 5 --vision # optionale gecachte Vision-Seeds +python cli.py report --beat 5 +python cli.py export --beat 5 --format both + +# Fehlerhafte Matches korrigieren +python cli.py rematch --beat 5 --threshold 0.50 # Schwelle anpassen (Globaler Scan wird für diesen Beat wiederholt) +python cli.py rematch --beat 5 --refine # Cached Match per lokalem Bildinhalt-Offset nachschärfen +``` + +Der HTML-Report regeneriert seine Preview-Clips bei jedem Lauf mit genauer +FFmpeg-Nachsuche und synchronisiert die beiden Video-Player pro Beat. Dadurch +ist der Report zur Frame-Prüfung geeignet und zeigt keine alten gecachten +Preview-Clips. +Source-Previews bekommen bei Trailer-only-Tails denselben schwarzen Tail wie der +Export, damit der Browser nicht einen zu kurzen Source-Clip gegen den längeren +Referenzbeat weiterspult oder loopt. +Zur Synchronprüfung rendert der Report ein einzelnes Frame-Locked-Compare-Video +mit Referenz und Source in demselben MP4-Stream. Dieses Compare-Video ist +maßgeblich, weil zwei getrennte Browser-Videoelemente nie zuverlässig +framegenau synchron bleiben. 
+ +Wenn ein Trailer-Beat am Ende eine Blende, Schwarzfläche oder Textkarte enthält, +die im Source-Film nicht als normaler Shot vorhanden ist, endet der Source-Match +am letzten stabil passenden Frame. Exportierte Timelines behalten trotzdem die +volle Beat-Länge und fügen danach automatisch einen schwarzen Trailer-Tail mit +Marker für Fade/Dissolve ein. + +Gezielte Ein-Beat-Matches nutzen zusätzlich vorhandene automatische Nachbarbeats +aus dem Cache als zeitliche Suchanker. Das hilft bei aufeinanderfolgenden Shots, +ohne manuelle Szenen oder Timecodes zu kuratieren. +Bei `match --beat N` wird ein alter Cache-Treffer für genau diesen Beat entfernt +und nur ein neu gefundener automatischer Treffer wieder eingetragen. Ein +fehlgeschlagener neuer Lauf kann dadurch keinen alten falschen Report-Treffer +stehen lassen. + +Der globale Bildvergleich arbeitet auf kontrast-normalisierten Luma- und +Kantenfeatures statt auf rohen Farb-Pixeln. Dadurch bleiben Schwarzweiß- oder +anders gegradete Trailerbilder mit dem Source-Material vergleichbar, während +unähnliche Farbshots schlechter ranken. +Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen +groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets +verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller +als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls. +Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese +Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst +den Inpoint bestimmt. +`rematch --refine` nutzt denselben lokalen FFmpeg/Pillow-Aligner und schreibt +den korrigierten Inpoint direkt zurück in `.cache/match_results.json`. + +Zusätzlich werden aus den besten szenenweiten Luma/Histogramm-Kandidaten +mehrere Inpoint-Suchanker erzeugt. 
Diese Scene-Seeds verwenden keine harte +pHash-Sperre, weil pHash bei stark anders gegradeten Trailerbildern echte +Matches zu früh ausschließen kann. +Optional kann `python cli.py match --beat N --vision` einen Vision-Layer +zuschalten. Dann werden pro Trailer-Beat und pro wenigen Scene-Level-Kandidaten +je drei Frames (Anfang, Mitte, Ende) von einem visionfähigen OpenAI-kompatiblen +Modell beschrieben. Die Beschreibungen liegen in +`.cache/vision_descriptions.json` und werden wiederverwendet. Vision erzeugt +nur zusätzliche Suchanker; der eigentliche Match muss weiterhin durch CV, +Content-Reranking, Timing und Duration-Coverage bestätigt werden. +Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen +FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine +Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete +Vision-Szenen echte Treffer nicht verdrängen. Für schnelle Experimente kann +`skip_coarse_scan_with_weighted_seeds = true` gesetzt werden. +Gewichtete Vision-Seeds werden nicht zuerst durch den alten Midpoint-Template +Refine verschoben; sie gehen direkt in die lokale Content-Alignment-Prüfung. +Das schützt wiederholte Gesprächseinstellungen, bei denen ähnliche Momente +mehrfach in derselben Szene vorkommen. +Innerhalb der automatisch von Vision vorgeschlagenen Szenen läuft zusätzlich +eine dichte lokale Bildsequenzsuche. Sie misst den Phasenversatz in kleinen +Zeitschritten direkt am Bildinhalt und bevorzugt Kandidaten mit genügend +Restdauer in derselben Source-Szene. Das ist kein manueller Override: Vision +grenzt nur Suchbereiche ein, die Auswahl bleibt Content-, Timing- und +Coverage-getrieben. +Nach einem dichten Vision-Treffer darf der spätere lokale Aligner nur noch im +Bereich dieses Scan-Schritts nachjustieren. So kann ein korrekt gefundener +Bewegungsmoment nicht wieder um viele Frames in eine ähnlich aussehende Phase +derselben Szene verschoben werden. 
+Wenn mehrere Vision-Kandidaten in derselben Source-Szene ähnlich gut scoren +und die Beat-Dauer abdecken, bevorzugt der Matcher die frühere Phase. Das +verhindert, dass ein späterer, minimal stärkerer Standbildtreffer die +Bewegungsphase des Trailers sichtbar überholt. +Enthält ein Trailerbeat selbst einen harten Umschnitt, werden Kandidaten an +angrenzenden Source-Szenengrenzen zusätzlich als zusammenhängender Multi-Shot- +Span geprüft. Ein Match darf dann über eine Source-Szenengrenze laufen, aber +nur wenn die relative Source-Grenze zeitlich zu einem erkannten Trailer-Umschnitt +passt. So kann ein Beat aus Frage/Antwort-Shots vollständig erfasst werden, +ohne Szenen willkürlich zusammenzukleben. +Auch der lokale Content-Aligner darf einen Inpoint nur noch übernehmen, wenn +die feste Whole-Frame-/Spatial-Validation dadurch besser wird. +Vor dem teuren Frame-Refine wird der gesamte Kandidatenpool mit einer schnellen +festen Inhaltsprüfung neu sortiert. Dadurch können korrekte Treffer aus +wiederholten Einstellungen einer Szene nach oben kommen, auch wenn ein freier +Template-Peak an anderer Stelle numerisch stärker war. Suchanker bleiben im +Pool erhalten, dürfen aber erst nach der Inhaltsprüfung nach oben rücken. Wenn +ein Kandidat visuell plausibel ist, aber wegen Trailerblende oder kurzem +Source-Span die normale Coverage knapp verfehlt, wird er als provisional Match +behalten statt als `NO MATCH` verworfen. +Dieses Reranking berücksichtigt zusätzlich die verbleibende Szenenlänge ab dem +Kandidaten-Inpoint. Dadurch werden zu späte ähnliche Gesprächsphasen innerhalb +derselben Szene nicht mehr vor frühere, tragfähigere Phasen sortiert. +Das Inhalts-Reranking nutzt bewusst nur wenige repräsentative Referenzframes und +eine begrenzte Kandidatenzahl. So bleiben wiederholte Szenen auffindbar, ohne +dass der Lauf durch tausende Random-Seeks minutenlang festhängt. 
+Confirmed Matches werden zusätzlich durch eine feste nahezu-Whole-Frame-Prüfung +aus Luma, Kanten, Farbhistogramm und räumlichen 4x4-Farbhistogrammen gedeckelt. +Dadurch kann ein freier Template-Hit mit ähnlicher Fenster-/Gesichtsstruktur +nicht mehr als sicherer Match gelten, wenn die Gesamtkomposition oder die +Bewegungsphase sichtbar eine andere Szene ist. +Für gewichtete Vision-Kandidaten gibt es zusätzlich eine eigene Provisional- +Bewertung aus Content-Score, Restdauer und Seed-Stärke. Dadurch können echte, +aber durch Trailer-Grading/Crop numerisch schwache Treffer im Report landen, +ohne als confirmed Match durchzugehen. +Die Cache-Normalisierung für Report/Export verwendet dieselbe niedrigere +Content-Untergrenze für nicht bestätigte Vision-Provisional-Treffer, damit ein +gerade gefundener automatischer Match nicht beim Report-Aufbau wieder +weggefiltert wird. +Sie übernimmt auch die Multi-Shot-Coverage-Regel: gecachte Treffer, die passend +zu internen Trailer-Umschnitten über angrenzende Source-Szenen laufen, werden +nicht mehr auf die erste Source-Szene zurückgekürzt. +Gezielte Einzel-Beat-Matches gewichten außerdem die automatisch aus Nachbarbeats +abgeleiteten Continuity-Seeds. Wenn ein Beat direkt an einen bereits passenden +Vorgänger anschließt, kann ein späterer ähnlich aussehender Moment derselben +Dialogszene den erwarteten Anschluss nicht mehr nur wegen eines höheren +Standbildscores verdrängen. +Diese Continuity-Seeds sind aber nur Suchanker: in derselben Szene darf ein +späterer Inpoint gewinnen, wenn die mehrframeige Content-Prüfung die +Bewegungsphase klar besser trifft. Dadurch bleiben Anschlussmatches stabil, +ohne Hand-/Kopfbewegungen auf einen falschen Zeitpunkt festzunageln. +Continuity- und Vision-Seeds allein schalten den globalen FFmpeg-Scan +standardmäßig nicht ab. Sie sind Suchanker, keine Beweise; der volle CV-Scan +bleibt aktiv, damit semantisch plausible, aber falsche Vision-Treffer echte +Bildmatches nicht verdrängen. 
+Lange Trailerbeats werden nicht mehr automatisch über ihre gesamte Beat-Länge +gegen einen einzigen Source-Clip validiert. Sobald nach einem sichtbaren +Source-Abschnitt eine anhaltende Schwarzblende oder Titel-/Credit-Insel beginnt, +endet der matchbare Referenzbereich dort; zwei aufeinanderfolgende dunkle +Samples reichen dafür. Spätere Text-/Creditbilder im selben Beat gehen damit +nicht mehr in Reranking, Validation oder Span-Schätzung ein. +Zusätzlich werden sehr dunkle, kontrastarme oder noch nicht sauber +auf-/abgeblendete Referenzframes aus Score, Inhalts-Reranking, +Phasen-Alignment und Motion-Templates herausgenommen. Blenden sollen bestimmen, +wie der Clip später exportiert wird, aber nicht, ob der Bildinhalt als Match +gilt. +Treffer unter `provisional_content_threshold` werden gar nicht mehr gespeichert +oder aus alten Cache-Ergebnissen übernommen. Das verhindert, dass offensichtlich +falsche Szenen im Report als Match-Kandidat weiterleben. + +### Log-Level + +```powershell +python cli.py run --log-level DEBUG +``` + +--- + +## Projektstruktur + +``` +ai_trailer_2026/ +│ +├── config.toml ← Alle Parameter (kein Hardcoding im Code) +├── .env ← API-Keys (NICHT commiten) +├── cli.py ← Einstiegspunkt +│ +├── src/ +│ ├── core/ +│ │ ├── config.py load_config() → AppConfig (frozen dataclasses) +│ │ └── models.py Scene, TrailerBeat, VibeHit, MatchResult, EditTimeline +│ ├── cv/ +│ │ ├── fingerprinting.py Text-Safe Crop · HS-Histogramme · pHash +│ │ ├── vibe_check.py Phase 1: Histogram+pHash Filter +│ │ ├── scene_indexer.py PySceneDetect → Fingerprint → JSON-Cache +│ │ ├── frame_extractor.py VideoCapture-Wrapper +│ │ └── deep_scan.py Phase 2: Coarse+Refine Template-Matching +│ ├── audio/ +│ │ └── transcriber.py faster-whisper Transkription +│ ├── llm/ +│ │ ├── dramaturg.py OpenRouter → BeatType (Dialog/Dramaturgie) +│ │ └── vision_cache.py optionale gecachte 3-Frame Vision-Seeds +│ ├── pipeline/ +│ │ ├── trailer_analyzer.py Reference-Trailer → TrailerBeat[] +│ 
│ └── matcher.py Orchestrierung + EditTimeline-Builder +│ └── export/ +│ ├── timecode.py Sekunden ↔ FCPXML-Rational ↔ SMPTE +│ ├── fcpxml_writer.py FCPXML 1.10 +│ └── edl_writer.py CMX 3600 EDL +│ +├── output/ ← FCPXML/EDL Output (gitignored) +├── .cache/ ← Szenen-Index + Match-Ergebnisse (gitignored) +└── tests/ 52 Unit-Tests (pytest) +``` + +--- + +## Cache-Verhalten + +Damit nicht bei jedem Lauf der gesamte Quellfilm neu analysiert werden muss: + +| Datei | Inhalt | Neu bauen mit | +|-------|--------|---------------| +| `.cache/scene_index.json` | Alle Quellfilm-Szenen + Fingerprints | `--force-reindex` | +| `.cache/trailer_beats.json` | Erkannte Trailer-Beats | `python cli.py analyze` erneut | +| `.cache/match_results.json` | CV-Matching-Ergebnisse | `python cli.py match` erneut | +| `.cache/vision_descriptions.json` | Optionale 3-Frame Vision-Beschreibungen für Beats/Szenen | löschen oder anderes Vision-Modell konfigurieren | + +--- + +## Tests + +```powershell +pytest tests/ -v +``` + +Alle Tests laufen ohne echte Videodateien (synthetische Frames via numpy/OpenCV). + +--- + +## Konfiguration (Auszug) + +Alle Werte in `config.toml` — keine hardgecodeten Konstanten im Code. 
+ +```toml +[cv.vibe_check] +top_k_candidates = 10 # Top-K Kandidaten für Deep Scan +phash_max_distance = 12 # Hamming-Distanz Schwelle (0–64) +crop_top_fraction = 0.15 # Obere 15% ausblenden (Logos) +crop_bottom_fraction = 0.30 # Untere 30% ausblenden (Letterbox/Subs) + +[cv.deep_scan] +coarse_step_seconds = 0.5 # Scan-Schrittgröße (Coarse Pass) +match_threshold = 0.65 # Mindestscore für bestätigte automatische Matches +provisional_match_threshold = 0.45 # Niedrigere automatische Kandidaten im Report zeigen +coarse_candidate_threshold = 0.50 # Niedrigeres Gate vor Multi-Frame-Refine +refine_window_seconds = 0.6 # Suchfenster für framegenaue Inpoint-Feinjustage +refine_step_seconds = 0.04 # ~1 Frame bei 25fps (Refine Pass) +content_align_window_seconds = 0.48 # Lokales Suchfenster um einen groben Treffer +content_align_sample_step_s = 0.28 # Referenzframes für direkten Bildinhalt-Offset +content_validation_weight = 0.35 # Gewicht der festen Whole-Frame-/Spatial-Endprüfung +provisional_content_threshold = 0.42 # Untergrenze für Report-/Cache-Kandidaten +start_tie_break_score_delta = 0.015 # Bei fast gleichen Scores früheren Inpoint wählen +start_preroll_frames = 0 # Kein pauschaler Start-Ausgleich; Offset kommt aus Bildinhalt +sequence_candidate_count = 240 # Breiter Kandidatenpool vor Inhalts-Reranking +max_refine_candidates = 6 # Teurer Frame-Refine läuft nur auf den besten Inhaltskandidaten +scene_seed_top_k = 30 # Scene-Level-Kandidaten als zusätzliche Suchanker +scene_seed_points_per_scene = 6 # Inpoint-Samples pro Scene-Level-Kandidat +content_rerank_candidate_count = 100 # Grobe Kandidaten vor Inhalts-Reranking +skip_coarse_scan_with_weighted_seeds = false # Vision-Seeds nur als Hinweise; Vollscan bleibt robust +sequence_score_weight = 0.55 # Gewicht für mehrere zeitliche Vergleichsframes +span_score_weight = 0.15 # Gewicht für Stabilität bis zum Beat-Ende +coarse_score_weight = 0.10 # Gewicht des groben Midpoint-Treffers +duration_score_weight = 0.20 # 
Gewicht für nutzbare Länge des Source-Treffers +duration_tie_break_score_delta = 0.03 # Bei ähnlichem Score längeren Treffer bevorzugen +min_duration_coverage = 0.65 # Treffer muss mindestens 65% des matchbaren Referenzanteils tragen +continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0] # Suchanker um gematchte Nachbarbeats +span_sample_step_s = 0.08 # Schrittweite für End-/Drift-Erkennung +trim_tail_frames = 4 # Sicherheitsabstand gegen kurze Blitzer am Ende +scene_boundary_epsilon_s = 0.12 # Szenengrenzen-Toleranz gegen 1-2 Frame Cut-Drift +scoreable_luma_mean_min = 24.0 # Zu dunkle/Fade-Frames nicht scoren +scoreable_luma_p90_min = 58.0 # Helle Bildanteile müssen sichtbar genug sein +scoreable_contrast_min = 24.0 # Kontrastarme Blenden/Titelinseln ignorieren + +[vision] +enabled = false # Kostenkontrolle: per CLI mit --vision aktivierbar +model = "google/gemma-4-31b-it" # Muss ein visionfähiges OpenAI-kompatibles Modell sein +scene_candidate_top_k = 8 # Nur wenige Top-Szenen pro Beat beschreiben +max_new_descriptions_per_run = 12 # API-Kosten pro Lauf begrenzen +max_seed_scenes = 3 # Nur beste Vision-Szenen als Suchanker weitergeben +seed_points_per_scene = 12 # Inpoint-Samples pro Vision-Szene +seed_score = 0.88 # Vision-Seeds bekommen mehr Priorität als normale Scene-Seeds +max_refine_candidates = 6 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene +local_scan_step_s = 0.12 # Dichte lokale Bildsuche in Vision-Szenen +local_scan_max_points_per_scene = 180 # Laufzeitgrenze pro Source-Szene +local_scan_top_candidates = 18 # Beste lokale Kandidaten gehen ins Refinement +local_scan_tie_break_score_delta = 0.08 # Ähnliche Vision-Treffer: frühere Phase bevorzugen +multi_shot_cut_corr_threshold = 0.20 # Interne Trailer-Umschnitte erkennen +multi_shot_boundary_tolerance_s = 0.20 # Source-Grenze muss zum Trailer-Cut passen +fullscan_fallback = false # Nur relevant, wenn skip_coarse_scan_with_weighted_seeds=true ist +content_threshold = 0.22 # 
Lockeres Content-Gate nur für gewichtete Vision-Seeds +similarity_threshold = 0.18 # Mindest-Textähnlichkeit für Vision-Seeds +``` + +--- + +## Lizenz + +Internes Tool — nicht für den öffentlichen Vertrieb. diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..5105d5f --- /dev/null +++ b/cli.py @@ -0,0 +1,899 @@ +""" +cli.py — AI Trailer Generator v2 — Command-Line Interface + +Usage: + python cli.py analyze [--config CONFIG] [--no-audio] [--no-llm] + python cli.py match [--config CONFIG] [--force-reindex] + python cli.py rematch --beat N [--threshold F] [--refine] + python cli.py report [--config CONFIG] + python cli.py run [--config CONFIG] [--force-reindex] [--no-audio] [--no-llm] + python cli.py export [--config CONFIG] [--format fcpxml|edl|both] + +On --no-audio / --no-llm: + These flags do NOT affect matching quality. + Whisper and the LLM only assign narrative labels (HOOK/SETUP/CLIMAX) + to beats in the export metadata. The CV pipeline is identical either way. + Use them for fast iterations: they skip large model downloads. + +All heavy imports are deferred so --help is instant. 
def _setup_logging(level: str = "INFO") -> None:
    """Configure root logging on stdout and make the console UTF-8 capable.

    Args:
        level: Name of a ``logging`` level, case-insensitive. Anything that
            does not resolve to a numeric level falls back to ``INFO``.
    """
    # Force UTF-8 for Windows console emoji printing.  stdout may be absent,
    # may spell its encoding differently ("UTF-8", "utf8"), or may be a
    # replacement stream without ``reconfigure``; none of these cases should
    # abort start-up, so the switch is attempted only when it is possible.
    stream = sys.stdout
    encoding = (getattr(stream, "encoding", None) or "").lower().replace("_", "-")
    reconfigure = getattr(stream, "reconfigure", None)
    if encoding not in ("utf-8", "utf8") and callable(reconfigure):
        reconfigure(encoding="utf-8")

    # ``getattr`` can also hit non-level module attributes (e.g. the string
    # ``BASIC_FORMAT``); only accept real numeric levels.
    resolved_level = getattr(logging, level.upper(), logging.INFO)
    if not isinstance(resolved_level, int):
        resolved_level = logging.INFO

    logging.basicConfig(
        format="%(asctime)s %(levelname)-8s %(name)s — %(message)s",
        datefmt="%H:%M:%S",
        level=resolved_level,
        stream=sys.stdout,
    )
    # The "PIL" logger is pinned to WARNING independently of `level`.
    logging.getLogger("PIL").setLevel(logging.WARNING)
def _scene_fps_light(scene: dict, cfg) -> float:
    """Return the frame rate implied by a cached scene entry.

    The rate is the scene's frame span divided by its time span, both read
    from the ``start_*`` / ``end_*`` keys of `scene`. When either span is not
    strictly positive, ``cfg.export.edl_frame_rate`` is returned instead.
    """
    span_s = float(scene["end_s"]) - float(scene["start_s"])
    span_frames = int(scene["end_frame"]) - int(scene["start_frame"])

    # Both spans must be strictly positive for the ratio to be meaningful.
    if span_s > 0 and span_frames > 0:
        return span_frames / span_s
    return cfg.export.edl_frame_rate
def _normalize_cached_results(beats: list, results: list, cfg) -> list:
    """
    Re-apply current generic timing rules to cached results.

    This keeps old automatic cache entries from preserving obsolete scene-boundary
    or tail-trim behavior without introducing manual per-beat truth.

    For every cached result, in order:

    1. Drop it when ``match_score`` is below
       ``cfg.cv.deep_scan.provisional_match_threshold``.
    2. Reconcile the scene found at ``in_point_s`` with the scene named by
       ``scene_id``.
    3. Re-derive in-point, out-point and in-point frame when the scene
       changed, the in-point precedes the scene, the span is very short, or
       the out-point exceeds the allowed duration.
    4. Drop it when duration coverage is below
       ``cfg.cv.deep_scan.min_duration_coverage``.
    5. Best effort: validate by image content; drop below the content gate,
       or demote a confirmed match that scores below ``match_threshold``.

    Args:
        beats: Trailer beats; each must expose ``beat_id`` and ``duration_s``.
        results: Cached match results (dataclass instances, since
            ``dataclasses.replace`` is applied to them).
        cfg: Application config; the ``paths``, ``cv.deep_scan``, ``vision``
            and ``export`` sections are read here or by the helpers called.

    Returns:
        A new list with the surviving (possibly adjusted) results. When no
        scene index is cached, `results` is returned unchanged.
    """
    from dataclasses import replace

    log = logging.getLogger(__name__)

    scenes = _load_scene_cache_light(cfg)
    if not scenes:
        return results

    beats_by_id = {b.beat_id: b for b in beats}
    normalized = []
    for result in results:
        beat = beats_by_id.get(result.beat_id)
        if result.match_score < cfg.cv.deep_scan.provisional_match_threshold:
            continue

        scene = _scene_for_time_light(scenes, result.in_point_s, cfg)
        declared_scene = _scene_by_id_light(scenes, result.scene_id)

        # If the automatic matcher selected a scene but its in-point sits just
        # before that scene's detected start, treat this as scene-boundary drift
        # and clamp to the declared scene. This is generic: no beat IDs, no
        # manual timestamps, just consistent scene/time reconciliation.
        if declared_scene is not None:
            declared_start = float(declared_scene["start_s"])
            declared_end = float(declared_scene["end_s"])
            declared_fps = _scene_fps_light(declared_scene, cfg)
            boundary_tolerance_s = (
                cfg.cv.deep_scan.scene_boundary_epsilon_s
                + cfg.cv.deep_scan.start_preroll_frames / declared_fps
            )
            if declared_start - boundary_tolerance_s <= result.in_point_s < declared_end:
                scene = declared_scene

        # Without a known beat or a containing scene there is nothing to
        # re-time against; keep the cached entry as it is.
        if beat is None or scene is None:
            normalized.append(result)
            continue

        fps = _scene_fps_light(scene, cfg)
        adjusted_in_s = result.in_point_s
        scene_changed = int(scene["scene_id"]) != result.scene_id
        starts_before_scene = result.in_point_s < float(scene["start_s"])
        # NOTE(review): 0.12 s is a literal "very short span" cut-off here and
        # below; confirm whether it should come from config.
        if scene_changed or starts_before_scene or result.duration_s <= 0.12:
            adjusted_in_s = max(0.0, result.in_point_s - (cfg.cv.deep_scan.start_preroll_frames / fps))
            adjusted_in_s = max(float(scene["start_s"]), adjusted_in_s)
            # The clamped in-point may land in another scene; re-resolve it.
            scene = _scene_for_time_light(scenes, adjusted_in_s, cfg) or scene
            fps = _scene_fps_light(scene, cfg)

        matchable_duration_s = beat.duration_s
        try:
            from src.cv.global_scan import estimate_matchable_reference_duration
            matchable_duration_s = estimate_matchable_reference_duration(beat, cfg)
        except Exception:
            # Best effort: keep the full beat duration, but leave a trace
            # instead of hiding the failure completely.
            log.debug(
                "Beat %s: matchable-duration estimate failed; using full beat duration",
                result.beat_id,
                exc_info=True,
            )

        tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / fps)
        single_scene_duration_s = max(0.0, min(beat.duration_s, float(scene["end_s"]) - adjusted_in_s) - tail_s)
        contiguous_duration_s = _contiguous_duration_light(
            beat,
            adjusted_in_s,
            scenes,
            cfg,
            matchable_duration_s,
        )
        max_duration_s = max(single_scene_duration_s, min(beat.duration_s, contiguous_duration_s))

        normalized_result = result
        if (
            scene_changed
            or starts_before_scene
            or result.duration_s <= 0.12
            # One frame of slack before an out-point counts as overrunning.
            or result.out_point_s > adjusted_in_s + max_duration_s + (1.0 / fps)
        ):
            normalized_result = replace(
                result,
                scene_id=int(scene["scene_id"]),
                in_point_s=adjusted_in_s,
                out_point_s=adjusted_in_s + max_duration_s,
                in_point_frame=int(adjusted_in_s * fps),
            )

        coverage = (
            max(0.0, normalized_result.duration_s) / matchable_duration_s
            if matchable_duration_s > 0 else 0.0
        )
        if coverage < cfg.cv.deep_scan.min_duration_coverage:
            continue

        try:
            from src.cv.content_align import align_cached_match_by_content
            _, content_score = align_cached_match_by_content(
                beat,
                normalized_result.in_point_s,
                cfg,
                search_window_s=min(0.8, cfg.cv.deep_scan.content_align_window_seconds),
                fps=12.5,
            )
            # Unconfirmed results are gated by the lower of the two thresholds.
            content_gate = (
                cfg.cv.deep_scan.provisional_content_threshold
                if normalized_result.is_confirmed
                else min(cfg.cv.deep_scan.provisional_content_threshold, cfg.vision.content_threshold)
            )
            if content_score < content_gate:
                continue
            if content_score < cfg.cv.deep_scan.match_threshold and normalized_result.is_confirmed:
                # Demote: cap the score by the content score and unconfirm.
                normalized_result = replace(
                    normalized_result,
                    match_score=min(normalized_result.match_score, content_score),
                    is_confirmed=False,
                )
        except Exception:
            # Best effort: the result is kept without content validation, but
            # the failure is logged instead of being swallowed silently.
            log.debug(
                "Beat %s: cached content validation failed; keeping result unvalidated",
                result.beat_id,
                exc_info=True,
            )

        normalized.append(normalized_result)

    return normalized
def cmd_analyze(args: argparse.Namespace, cfg) -> list:
    """Analyze the reference trailer and persist the detected beats.

    The transcription callback is skipped when ``args.no_audio`` is set and
    the classification callback is skipped when ``args.no_llm`` is set.  The
    beats are written to ``<cache_dir>/trailer_beats.json`` so downstream
    commands can reload them; histogram bytes are stored as hex strings.

    Returns:
        The list of beats produced by ``analyze_reference_trailer``.
    """
    from src.pipeline.trailer_analyzer import analyze_reference_trailer

    transcribe_cb = None if args.no_audio else _build_transcribe_callback(cfg)
    classify_cb = None if args.no_llm else _build_classify_callback(cfg)
    beats = analyze_reference_trailer(
        cfg,
        transcribe_callback=transcribe_cb,
        classify_callback=classify_cb,
    )

    def _hex_or_none(blob):
        # Falsy histograms (None or empty bytes) are stored as JSON null.
        return blob.hex() if blob else None

    def _beat_record(beat) -> dict:
        # Key order is kept stable because it determines the JSON layout.
        return {
            "beat_id": beat.beat_id,
            "start_s": beat.start_s,
            "end_s": beat.end_s,
            "start_frame": beat.start_frame,
            "end_frame": beat.end_frame,
            "beat_type": beat.beat_type.name,
            "dialogue": [
                {"start_s": line.start_s, "end_s": line.end_s, "text": line.text}
                for line in beat.dialogue
            ],
            "phash": beat.phash,
            "luma_hist": _hex_or_none(beat.luma_hist),
            "sat_hist": _hex_or_none(beat.sat_hist),
        }

    # Persist beats for downstream commands (including histogram bytes as hex)
    beats_cache = cfg.paths.cache_dir / "trailer_beats.json"
    beats_cache.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(
        [_beat_record(beat) for beat in beats],
        indent=2,
        ensure_ascii=False,
    )
    beats_cache.write_text(payload, encoding="utf-8")
    print(f"\n\u2705 {len(beats)} beats analyzed \u2192 {beats_cache}")
    return beats
Run 'analyze' first.") + + raw = json.loads(p.read_text(encoding="utf-8")) + beats = [] + for d in raw: + dialogue = tuple( + DialogueLine(start_s=x["start_s"], end_s=x["end_s"], text=x["text"]) + for x in d.get("dialogue", []) + ) + beats.append(TrailerBeat( + beat_id=d["beat_id"], + trailer_path=cfg.paths.reference_trailer, + start_s=d["start_s"], + end_s=d["end_s"], + start_frame=d["start_frame"], + end_frame=d["end_frame"], + beat_type=BeatType[d.get("beat_type", "UNKNOWN")], + dialogue=dialogue, + phash=d.get("phash"), + luma_hist=bytes.fromhex(d["luma_hist"]) if d.get("luma_hist") else None, + sat_hist= bytes.fromhex(d["sat_hist"]) if d.get("sat_hist") else None, + )) + return beats + + +def _select_beats(beats: list, beat_id: int | None) -> list: + """Return all beats or exactly one requested beat.""" + if beat_id is None: + return beats + selected = [b for b in beats if b.beat_id == beat_id] + if not selected: + raise ValueError(f"Beat {beat_id} not found. Run 'analyze' first.") + return selected + + +def _select_results(results: list, beat_ids: set[int] | None) -> list: + """Return all results or only results for the requested beats.""" + if beat_ids is None: + return results + return [r for r in results if r.beat_id in beat_ids] + + +def _find_scene_for_in_point(cfg, in_point_s: float): + from src.cv.scene_indexer import build_scene_index + + scenes = build_scene_index(cfg) + for idx, scene in enumerate(scenes): + if scene.start_s <= in_point_s < scene.end_s: + if ( + scene.end_s - in_point_s <= cfg.cv.deep_scan.scene_boundary_epsilon_s + and idx + 1 < len(scenes) + ): + return scenes[idx + 1] + return scene + return None + + +def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]: + """Find visible source-matchable islands inside a trailer beat.""" + from src.cv.frame_extractor import grab_frame_at_path + from src.cv.global_scan import _is_scoreable_reference_frame + + step_s = max(0.08, cfg.cv.deep_scan.span_sample_step_s) + 
def _attach_visual_segments(results: list, beats: list, cfg) -> list:
    """Attach automatic sub-shot matches for multi-island trailer beats.

    For every result whose beat is known, the beat is split into scoreable
    islands via ``_reference_scoreable_segments``:

    * zero or one island: the result gets a single segment that mirrors the
      result itself (trailer offset 0.0);
    * several islands: the first island reuses the result's own match, and
      each further island is matched on its own with ``run_global_scan``.
      Islands that produce no match are left out.

    Results whose ``beat_id`` is not in *beats* are passed through unchanged.

    Returns:
        A new list with one entry per input result, in the same order.
    """
    from dataclasses import replace
    from src.core.models import MatchResult, MatchSegment
    from src.cv.global_scan import run_global_scan

    def _make_segment(source, offset_s: float, length_s: float, out_s: float) -> MatchSegment:
        # Scene, in-point, score and confirmation always come from `source`.
        return MatchSegment(
            trailer_offset_s=offset_s,
            duration_s=length_s,
            scene_id=source.scene_id,
            in_point_s=source.in_point_s,
            out_point_s=out_s,
            match_score=source.match_score,
            is_confirmed=source.is_confirmed,
        )

    beat_lookup = {beat.beat_id: beat for beat in beats}
    enriched: list[MatchResult] = []
    for match in results:
        beat = beat_lookup.get(match.beat_id)
        if beat is None:
            enriched.append(match)
            continue

        islands = _reference_scoreable_segments(beat, cfg)
        match_length_s = max(0.0, match.duration_s)

        if len(islands) <= 1:
            whole = _make_segment(match, 0.0, match_length_s, match.out_point_s)
            enriched.append(replace(match, segments=(whole,)))
            continue

        # First island: reuse the primary match, capped to the island length.
        (lead_start, lead_end), *later_islands = islands
        lead_length_s = min(match_length_s, max(0.0, lead_end - lead_start))
        parts: list[MatchSegment] = [
            _make_segment(match, lead_start, lead_length_s, match.in_point_s + lead_length_s)
        ]

        # Remaining islands: scan each one as a stand-alone beat.
        # NOTE(review): only start_s/end_s are shifted on the sub-beat; the
        # other beat fields are carried over as-is — confirm the scan
        # tolerates that.
        for island_start, island_end in later_islands:
            island_beat = replace(
                beat,
                start_s=beat.start_s + island_start,
                end_s=beat.start_s + island_end,
            )
            hits = run_global_scan([island_beat], cfg, seed_in_points=None)
            if not hits:
                continue
            best = hits[0]
            island_length_s = min(
                max(0.0, island_end - island_start),
                max(0.0, best.duration_s),
            )
            parts.append(
                _make_segment(best, island_start, island_length_s, best.in_point_s + island_length_s)
            )

        enriched.append(replace(match, segments=tuple(parts)))
    return enriched
def _update_result(new_result, results: list) -> list:
    """Return *results* with *new_result* swapped in for its beat.

    Every existing entry with the same ``beat_id`` is dropped, *new_result*
    is added, and the outcome is ordered by ``beat_id``.  The input list is
    not modified.
    """
    target_id = new_result.beat_id
    merged = [existing for existing in results if existing.beat_id != target_id]
    merged.append(new_result)
    merged.sort(key=lambda entry: entry.beat_id)
    return merged
def cmd_rematch(args: argparse.Namespace, cfg) -> None:
    """
    Re-run automatic matching for ONE beat.

        python cli.py rematch --beat 5                    # re-scan CV for beat 5
        python cli.py rematch --beat 5 --threshold 0.40   # relax threshold
        python cli.py rematch --beat 5 --refine           # re-align the cached in-point

    Two modes:

    * ``--refine``: keep the cached match but re-align its in-point with
      ``align_cached_match_by_content``.  The refined candidate is rejected
      (nothing is saved) when it no longer covers at least
      ``min_duration_coverage`` of the beat.
    * default: run ``run_global_scan`` for the beat, seeded from neighbouring
      matches, optionally with ``match_threshold`` overridden by
      ``--threshold``.

    On success the beat's entry in the results cache is replaced and saved.
    All failures are reported on stdout; the function returns ``None``.
    """

    beat_id = args.beat
    beats = _load_beats(cfg)
    results = _load_results(cfg) if _results_cache_path(cfg).exists() else []

    beat = next((b for b in beats if b.beat_id == beat_id), None)
    if beat is None:
        print(f"\u274c Beat {beat_id} not found. Run 'analyze' first.")
        return

    # ---- Refine an already acceptable cached match -------------------------
    if args.refine:
        current = next((r for r in results if r.beat_id == beat_id), None)
        if current is None:
            print(f"❌ Beat {beat_id} has no cached match to refine. Run 'match --beat {beat_id}' first.")
            return

        from src.cv.content_align import align_cached_match_by_content
        refined_in_s, sequence_score = align_cached_match_by_content(
            beat,
            current.in_point_s,
            cfg,
            search_window_s=args.refine_window,
        )
        # Clamp once, up front: the scene lookup, the out-point and the
        # coverage check must all use the same in-point that gets stored.
        # Clamping only at construction time left the out-point and coverage
        # computed from a (possibly negative) unclamped value.
        refined_in_s = max(0.0, refined_in_s)

        usable_duration_s = max(0.0, current.out_point_s - current.in_point_s)
        scene_data = _scene_for_time_light(_load_scene_cache_light(cfg), refined_in_s, cfg)
        out_point_s = refined_in_s + usable_duration_s
        if scene_data is not None:
            # Never let the refined span run past the end of its scene.
            out_point_s = min(out_point_s, float(scene_data["end_s"]))
        matchable_duration_s = beat.duration_s
        duration_coverage = (
            max(0.0, out_point_s - refined_in_s) / matchable_duration_s
            if matchable_duration_s > 0 else 0.0
        )
        if duration_coverage < cfg.cv.deep_scan.min_duration_coverage:
            print(
                f"❌ Beat {beat_id} refined candidate rejected: "
                f"duration coverage {duration_coverage:.0%} < "
                f"{cfg.cv.deep_scan.min_duration_coverage:.0%}"
            )
            return

        # Best effort: fall back to the EDL frame rate when the source FPS
        # cannot be read (or is reported as 0).
        try:
            from src.cv.frame_extractor import get_video_info
            fps = float(get_video_info(cfg.paths.source_movie)["fps"]) or cfg.export.edl_frame_rate
        except Exception:
            fps = cfg.export.edl_frame_rate

        from src.core.models import MatchResult
        refined = MatchResult(
            beat_id=beat_id,
            scene_id=int(scene_data["scene_id"]) if scene_data is not None else current.scene_id,
            source_path=current.source_path,
            in_point_s=refined_in_s,
            out_point_s=out_point_s,
            in_point_frame=int(refined_in_s * fps),
            match_score=sequence_score,
            match_location=current.match_location,
            is_confirmed=sequence_score >= cfg.cv.deep_scan.match_threshold,
        )
        results = _update_result(refined, results)
        _save_results(results, cfg)
        print(
            f"✅ Beat {beat_id} refined → "
            f"in={refined.in_point_s:.3f}s, out={refined.out_point_s:.3f}s, "
            f"sequence_score={refined.match_score:.3f}"
        )
        return

    # ---- Re-run CV with optional threshold override ------------------------
    from dataclasses import replace as dc_replace
    run_cfg = cfg
    if args.threshold is not None:
        run_cfg = dc_replace(
            cfg,
            cv=dc_replace(
                cfg.cv,
                deep_scan=dc_replace(cfg.cv.deep_scan, match_threshold=args.threshold),
            ),
        )
        print(f"ℹ️ threshold overridden to {args.threshold} for beat {beat_id}")

    from src.cv.global_scan import run_global_scan
    seed_in_points = _continuity_seed_in_points(beat_id, beats, results, run_cfg)
    matches = run_global_scan([beat], run_cfg, seed_in_points=seed_in_points)

    if not matches:
        print(f"❌ Beat {beat_id}: no match. Try --threshold 0.40.")
        return

    match = matches[0]
    results = _update_result(match, results)
    # The override only applies to this scan; saving uses the original cfg.
    _save_results(results, cfg)
    print(f"✅ Beat {beat_id} rematched → (in={match.in_point_s:.3f}s, score={match.match_score:.3f})")
def cmd_export(args: argparse.Namespace, cfg) -> None:
    """Export cached match results as an FCPXML and/or EDL timeline.

    The format comes from ``args.format`` and falls back to
    ``cfg.export.output_format``.  With ``--beat`` only that beat is exported
    and the output file is named ``<trailer stem>_beat_NNN``; otherwise the
    timeline title is used as the file stem.  Files are written below
    ``cfg.paths.output_dir``.

    Prints an error and returns without exporting when a single beat was
    requested but has no cached match.

    Raises:
        ValueError: Propagated from ``_select_beats`` when the requested beat
            does not exist.
    """
    from src.export.edl_writer import write_edl
    from src.export.fcpxml_writer import write_fcpxml
    from src.pipeline.matcher import build_timeline

    beat_id = getattr(args, "beat", None)

    # Read and parse the beat cache once; the same list serves both the beat
    # selection and the normalization of cached results.
    all_beats = _load_beats(cfg)
    beats = _select_beats(all_beats, beat_id)
    beat_ids = {b.beat_id for b in beats} if beat_id is not None else None
    results = _select_results(
        _normalize_cached_results(all_beats, _load_results(cfg), cfg),
        beat_ids,
    )
    if beat_id is not None and not results:
        print(f"❌ Beat {beat_id} has no cached match. Run 'match --beat {beat_id}' first.")
        return
    timeline = build_timeline(beats, results, cfg)

    fmt = args.format or cfg.export.output_format
    out_stem = (
        f"{cfg.paths.reference_trailer.stem}_beat_{beat_id:03d}"
        if beat_id is not None
        else timeline.title
    )

    if fmt in ("fcpxml", "both"):
        out = write_fcpxml(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.fcpxml")
        print(f"✅ FCPXML → {out}")

    if fmt in ("edl", "both"):
        out = write_edl(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.edl")
        print(f"✅ EDL → {out}")
default=Path("config.toml"), + metavar="CONFIG", help="Path to config.toml (default: ./config.toml)", + ) + parser.add_argument( + "--log-level", default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + help="Logging verbosity (default: INFO)", + ) + + sub = parser.add_subparsers(dest="command", required=True) + + # analyze + p_analyze = sub.add_parser("analyze", help="Detect trailer beats + fingerprint") + p_analyze.add_argument("--no-audio", action="store_true", + help="Skip Whisper (only affects beat labels, not matching)") + p_analyze.add_argument("--no-llm", action="store_true", + help="Skip LLM classification (only affects beat labels)") + + # match + p_match = sub.add_parser("match", help="Run 2-phase CV matching") + p_match.add_argument("--force-reindex", action="store_true", + help="Ignore scene cache and re-run PySceneDetect") + p_match.add_argument("--beat", type=int, + help="Match only one beat and merge it into the cached results") + p_match.add_argument("--vision", action="store_true", + help="Enable cached vision descriptions for extra automatic search seeds") + p_match.add_argument("--no-vision", action="store_true", + help="Disable vision seeding even if [vision].enabled is true") + + # rematch + p_rematch = sub.add_parser("rematch", help="Re-run or override matching for one beat") + p_rematch.add_argument("--beat", type=int, required=True, help="Beat ID to rematch") + p_rematch.add_argument("--threshold", type=float, default=None, help="Override match_threshold") + p_rematch.add_argument("--refine", action="store_true", + help="Refine the cached match by measuring a local image-content offset") + p_rematch.add_argument("--refine-window", type=float, default=None, + help="Seconds to search around the cached in-point when using --refine") + + # report + p_report = sub.add_parser("report", help="Generate HTML visual comparison report") + p_report.add_argument("--beat", type=int, help="Report only one beat") + + # export + p_export = 
def main() -> None:
    """CLI entry point: parse arguments, load the config, run the sub-command.

    Steps, in order: make the console UTF-8 safe, parse ``sys.argv``, set up
    logging at the requested level, load the config file named by
    ``--config``, then call the handler registered for the chosen
    sub-command with ``(args, cfg)``.
    """
    _ensure_utf8_console()
    args = _build_parser().parse_args()

    _setup_logging(args.log_level)

    from src.core.config import load_config
    cfg = load_config(args.config)

    # Sub-command name -> handler; argparse guarantees the name is one of these
    # because the subparsers are declared with required=True.
    handlers = {
        "analyze": cmd_analyze,
        "match": cmd_match,
        "rematch": cmd_rematch,
        "report": cmd_report,
        "export": cmd_export,
        "run": cmd_run,
    }
    handlers[args.command](args, cfg)
+# ============================================================================= + +[project] +name = "AI Trailer Generator v2" +version = "2.0.0" +log_level = "INFO" # DEBUG | INFO | WARNING | ERROR + +# ----------------------------------------------------------------------------- +# [paths] — External video sources (read-only access) +# ----------------------------------------------------------------------------- +[paths] +source_movie = "B:/Proxy/BehindTheRedDoor_FTR_1080P_2398_Fixed.mp4" +reference_trailer = "F:/Encodings/BehindTheRedDoor_Trailer_REFERENCE.mp4" + +# Output destinations (inside project sandbox) +output_dir = "output" +cache_dir = ".cache" +proxy_dir = "proxy" + +# ----------------------------------------------------------------------------- +# [video] — Decode / proxy settings +# ----------------------------------------------------------------------------- +[video] +# Target FPS for internal frame extraction (0 = use source FPS) +extract_fps = 1.0 +# Proxy resolution for template matching (width x height) +proxy_width = 640 +proxy_height = 360 + +# ----------------------------------------------------------------------------- +# [cv] — Computer Vision engine parameters +# Phase 1 — "Vibe Check" (histogram / perceptual hash scene-level filter) +# Phase 2 — "Deep Scan" (template matching frame-level precision) +# ----------------------------------------------------------------------------- +[cv] + +[cv.vibe_check] +# Number of top candidate scenes to forward to Deep Scan +top_k_candidates = 100 + +# Histogram comparison method: +# CORREL=0 | CHISQR=1 | INTERSECT=2 | BHATTACHARYYA=3 +hist_compare_method = 0 + +# Histogram bins per channel (hue, saturation) +hist_bins_hue = 50 +hist_bins_saturation = 60 + +# pHash similarity threshold (lower = stricter; 0–64 range) +# NOTE: 12 is for near-duplicate detection. Cross-video matching +# (trailer vs source movie with different grading/compression) +# needs 25–35. 
Start at 32 and tighten if you get false positives. +phash_max_distance = 32 + +# ---- Text-Safe Crop ------------------------------------------------------- +# Fraction of frame height to EXCLUDE from the top (e.g. logos, title cards) +crop_top_fraction = 0.15 +# Fraction of frame height to EXCLUDE from the bottom (e.g. letterbox, subs) +crop_bottom_fraction = 0.30 + +[cv.deep_scan] +# Step size in SECONDS between sampled frames during the coarse scan pass +coarse_step_seconds = 0.5 + +# Minimum template match score (0.0–1.0) to accept a candidate as a hit +match_threshold = 0.65 + +# Store/report lower-confidence automatic candidates for visual review instead +# of dropping them as "NO MATCH". Confirmed exports can still use match_threshold. +provisional_match_threshold = 0.45 + +# Lower gate for entering temporal multi-frame refinement. The final decision +# still uses sequence/span scoring; this only avoids rejecting real matches +# because one midpoint frame is weak. +coarse_candidate_threshold = 0.50 + +# Candidate ranking weights. Duration coverage matters when the same visual +# shot appears multiple times: prefer the occurrence that can cover the beat. 
+sequence_score_weight = 0.55 +span_score_weight = 0.15 +coarse_score_weight = 0.10 +duration_score_weight = 0.20 +duration_tie_break_score_delta = 0.03 +min_duration_coverage = 0.65 +continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0] +scene_seed_top_k = 30 +scene_seed_points_per_scene = 6 +content_rerank_candidate_count = 100 +skip_coarse_scan_with_weighted_seeds = false + +# cv2.matchTemplate method: +# TM_CCOEFF_NORMED=5 (recommended), TM_CCORR_NORMED=3 +match_method = 5 + +# If a coarse hit is found, refine by scanning ± this many seconds +refine_window_seconds = 0.6 +refine_step_seconds = 0.04 # ≈ 1 frame at 25 fps +content_align_window_seconds = 0.48 +content_align_sample_step_s = 0.28 +content_validation_weight = 0.35 +provisional_content_threshold = 0.42 + +# When several adjacent frame offsets score almost the same, prefer the earlier +# one. This avoids matches that are visually correct but start a few frames late. +start_tie_break_score_delta = 0.015 +start_preroll_frames = 0 + +# Automatic temporal verification after a coarse image hit. +# More candidates reduces false positives from visually similar shots. +sequence_candidate_count = 240 +sequence_min_distance_s = 1.0 +max_refine_candidates = 6 + +# Match-span detection: trim when the source starts drifting into a different shot. +span_sample_step_s = 0.08 +trim_tail_frames = 4 + +# If a refined in-point lands this close to a detected scene end, treat it as +# the next scene. Scene detectors often place cuts a frame or two around the +# visible boundary. 
+scene_boundary_epsilon_s = 0.12 +scoreable_luma_mean_min = 24.0 +scoreable_luma_p90_min = 58.0 +scoreable_contrast_min = 24.0 + +# ----------------------------------------------------------------------------- +# [scene_detection] — PySceneDetect parameters (used to segment source movie) +# ----------------------------------------------------------------------------- +[scene_detection] +# Threshold for ContentDetector (lower = more sensitive) +content_threshold = 27.0 +# Minimum scene duration in seconds +min_scene_duration_s = 1.5 + +# ----------------------------------------------------------------------------- +# [whisper] — Dialogue / audio analysis +# ----------------------------------------------------------------------------- +[whisper] +model = "large-v3" +language = "ar" +device = "cuda" # cuda | cpu +compute_type = "float16" # float16 | int8 | float32 + +# ----------------------------------------------------------------------------- +# [llm] — Used ONLY for thematic segmentation / dramaturgy +# ----------------------------------------------------------------------------- +[llm] +provider = "openrouter" +base_url = "https://openrouter.ai/api/v1" +model = "google/gemma-4-31b-it" +timeout_seconds = 120 +temperature = 0.3 +max_tokens = 4096 + +# ----------------------------------------------------------------------------- +# [vision] — Optional cached visual descriptions for ambiguous matching +# ----------------------------------------------------------------------------- +[vision] +# Disabled by default to avoid surprise API cost. Enable when you want the +# matcher to ask a vision-capable model for cached 3-frame scene descriptions. 
+enabled = false +provider = "openrouter" +base_url = "https://openrouter.ai/api/v1" +model = "google/gemma-4-31b-it" +timeout_seconds = 90 +temperature = 0.0 +max_tokens = 350 + +# Cost controls: per beat, only the top scene-level candidates are described, +# and cached descriptions in .cache/vision_descriptions.json are reused. +scene_candidate_top_k = 8 +max_new_descriptions_per_run = 12 +max_seed_scenes = 3 +seed_points_per_scene = 12 +seed_score = 0.88 +max_refine_candidates = 6 +local_scan_step_s = 0.12 +local_scan_max_points_per_scene = 180 +local_scan_top_candidates = 18 +local_scan_tie_break_score_delta = 0.08 +multi_shot_cut_corr_threshold = 0.20 +multi_shot_boundary_tolerance_s = 0.20 +fullscan_fallback = false +content_threshold = 0.22 +similarity_threshold = 0.18 + +# ----------------------------------------------------------------------------- +# [export] — FCPXML / EDL export settings +# ----------------------------------------------------------------------------- +[export] +fcpxml_version = "1.10" +edl_frame_rate = 23.976 # fps used in EDL timecode generation +output_format = "fcpxml" # fcpxml | edl | both diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5e831ff --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,68 @@ +[build-system] +requires = ["setuptools>=69", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "ai-trailer-2026" +version = "2.0.0" +description = "Frame-accurate trailer reconstruction via pure Computer Vision" +requires-python = ">=3.11" + +dependencies = [ + # Computer Vision + "opencv-python>=4.9", + "imagehash>=4.3", + "numpy>=1.26", + "Pillow>=10.0", + + # Scene detection + "scenedetect[opencv]>=0.6", + + # Audio / transcription + "faster-whisper>=1.0", + + # Config / secrets + # tomllib — built-in stdlib (Python 3.11+), no install needed + "python-dotenv>=1.0", # loads .env into os.environ + + # Export + "lxml>=5.0", # FCPXML generation +] + 
+[project.optional-dependencies] +dev = [ + "pytest>=8.0", + "pytest-cov", + "mypy>=1.9", + "ruff>=0.4", +] + +[tool.setuptools.packages.find] +where = ["."] +include = ["src*"] + +# --------------------------------------------------------------------------- +# Ruff (linter + formatter) +# --------------------------------------------------------------------------- +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = ["E", "F", "I", "UP", "B", "C4", "ANN"] +ignore = ["ANN101", "ANN102"] + +# --------------------------------------------------------------------------- +# Mypy +# --------------------------------------------------------------------------- +[tool.mypy] +python_version = "3.11" +strict = true +ignore_missing_imports = true + +# --------------------------------------------------------------------------- +# Pytest +# --------------------------------------------------------------------------- +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-v --tb=short" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8c67a95 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,37 @@ +# AI Trailer Generator v2 — Python Dependencies +# Generated from: pip freeze (Python 3.11, Windows) +# Install with: pip install -r requirements.txt +# +# NOTE: faster-whisper and scenedetect may pull in torch/cuda extras +# depending on your platform. See README for CUDA setup. 
# Resolve-ProjectPython
# Returns the path of a python.exe that can be used to bootstrap the venv.
# Lookup order:
#   1. whatever `python` resolves to on PATH (Get-Command),
#   2. the per-user Python 3.11 install directory,
#   3. the per-user WindowsApps python.exe.
# Throws when none of these exist. The interpreter version is NOT checked
# here; the script does that itself right after calling this function.
function Resolve-ProjectPython {
    # SilentlyContinue: a missing `python` must fall through to the fallbacks
    # below instead of aborting under $ErrorActionPreference = "Stop".
    $cmd = Get-Command python -ErrorAction SilentlyContinue
    if ($cmd) {
        return $cmd.Source
    }

    # Well-known per-user locations, tried in order.
    $candidates = @(
        "$env:LOCALAPPDATA\Programs\Python\Python311\python.exe",
        "$env:LOCALAPPDATA\Microsoft\WindowsApps\python.exe"
    )

    foreach ($candidate in $candidates) {
        if ($candidate -and (Test-Path $candidate)) {
            return $candidate
        }
    }

    throw "Python 3.11+ not found. Install Python 3.11+ or add it to PATH."
}
# Native executables (python, pip) do not raise PowerShell errors when they
# fail, so $ErrorActionPreference = "Stop" alone lets the script continue and
# print "Setup complete!" after a broken step. Call this right after every
# native command to turn a non-zero exit code into a terminating error.
function Assert-NativeSuccess {
    param([string]$Step)
    if ($LASTEXITCODE -ne 0) {
        throw "$Step failed (exit code $LASTEXITCODE)."
    }
}

# ---- 1. Check Python version ------------------------------------------------
$PROJECT_PYTHON = Resolve-ProjectPython
$pythonVersion = & $PROJECT_PYTHON --version 2>&1
Write-Host "Python: $pythonVersion"
if ($pythonVersion -notmatch "3\.(1[1-9]|[2-9]\d)") {
    Write-Error "Python 3.11+ required. Found: $pythonVersion"
    exit 1
}

# ---- 2. Create venv ---------------------------------------------------------
if (Test-Path $VENV_DIR) {
    Write-Host "Virtual environment already exists at '$VENV_DIR'. Skipping creation." -ForegroundColor Yellow
} else {
    Write-Host "Creating virtual environment in '$VENV_DIR' ..." -ForegroundColor Green
    & $PROJECT_PYTHON -m venv $VENV_DIR
    Assert-NativeSuccess "Creating the virtual environment"
    Write-Host "Done." -ForegroundColor Green
}

# ---- 3. Activate venv -------------------------------------------------------
$activate = Join-Path $VENV_DIR "Scripts\Activate.ps1"
Write-Host "Activating virtual environment ..."
. $activate
# Always call the venv interpreter by path so the steps below do not depend
# on the activation having changed PATH.
$VENV_PYTHON = Join-Path $VENV_DIR "Scripts\python.exe"

# ---- 4. Upgrade pip ---------------------------------------------------------
Write-Host "Upgrading pip ..." -ForegroundColor Green
& $VENV_PYTHON -m pip install --upgrade pip --quiet
Assert-NativeSuccess "Upgrading pip"

# ---- 5. Install dependencies ------------------------------------------------
Write-Host "Installing dependencies from requirements.txt ..." -ForegroundColor Green
& $VENV_PYTHON -m pip install -r requirements.txt
Assert-NativeSuccess "Installing dependencies"

# ---- 6. Copy .env if missing ------------------------------------------------
if (-not (Test-Path ".env")) {
    if (Test-Path ".env.example") {
        Copy-Item ".env.example" ".env"
        Write-Host ""
        Write-Host " .env created from .env.example." -ForegroundColor Yellow
        Write-Host " >>> Open .env and fill in your OPENROUTER_API_KEY! <<<" -ForegroundColor Red
    }
}

# ---- 7. Done ----------------------------------------------------------------
Write-Host ""
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host " Setup complete!" -ForegroundColor Green
Write-Host ""
Write-Host " Activate the venv with:"
Write-Host " .\.venv\Scripts\Activate.ps1" -ForegroundColor White
Write-Host ""
Write-Host " Then run the pipeline:"
Write-Host " python cli.py run --no-audio --no-llm" -ForegroundColor White
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host ""
def transcribe_video(
    video_path: Path,
    cfg: AppConfig,
    start_s: float | None = None,
    end_s: float | None = None,
    time_offset_s: float = 0.0,
) -> list[DialogueLine]:
    """
    Transcribe dialogue from *video_path* using faster-whisper.

    The requested window is first extracted to a temporary mono 16 kHz WAV
    (via ``_extract_audio_segment``) and that file is fed to the model.

    Args:
        video_path:    Path to source or trailer video.
        cfg:           Application configuration (whisper section).
        start_s:       Clip start in video-file seconds (None = beginning).
        end_s:         Clip end in video-file seconds (None = end of file).
        time_offset_s: Added to every transcript timestamp so that beat-level
                       transcripts align with absolute movie time.

    Returns:
        List of DialogueLine ordered by start time.

    Raises:
        ImportError:  If faster-whisper is not installed.
        RuntimeError: If the ffmpeg audio extraction fails.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "faster-whisper not installed. Run: pip install faster-whisper"
        ) from err

    w = cfg.whisper

    logger.info(
        "Transcribing %s [%.1f–%s] with %s on %s …",
        video_path.name,
        start_s or 0.0,
        # Compare against None: an explicit end of 0.0 is not "end of file".
        f"{end_s:.1f}s" if end_s is not None else "end",
        w.model,
        w.device,
    )

    with tempfile.TemporaryDirectory() as tmp:
        wav = Path(tmp) / "audio.wav"
        _extract_audio_segment(video_path, start_s, end_s, wav)

        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
        segments, _ = model.transcribe(
            str(wav),
            language=w.language or None,  # empty string -> let Whisper auto-detect
            beam_size=5,
        )

        # faster-whisper yields segments lazily; consume them while the
        # temporary WAV still exists.
        lines: list[DialogueLine] = [
            DialogueLine(
                start_s=seg.start + time_offset_s,
                end_s=seg.end + time_offset_s,
                text=seg.text.strip(),
            )
            for seg in segments
        ]

    logger.info("Transcription done: %d segments.", len(lines))
    return lines
+ """ + from dataclasses import replace + from src.core.models import Scene + + enriched: list[Scene] = [] + for scene in scenes: + matched = tuple( + line for line in all_dialogue + if scene.start_s <= (line.start_s + line.end_s) / 2.0 < scene.end_s + ) + enriched.append(replace(scene, dialogue=matched)) + + total_assigned = sum(len(s.dialogue) for s in enriched) + logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched)) + return enriched diff --git a/src/core/__init__.py b/src/core/__init__.py new file mode 100644 index 0000000..61e4b74 --- /dev/null +++ b/src/core/__init__.py @@ -0,0 +1 @@ +# src.core package diff --git a/src/core/config.py b/src/core/config.py new file mode 100644 index 0000000..3e3f798 --- /dev/null +++ b/src/core/config.py @@ -0,0 +1,387 @@ +""" +src/core/config.py — Configuration loader for AI Trailer Generator v2 + +Loads config.toml and exposes typed, nested dataclasses. +All CV thresholds, paths, and model settings are sourced exclusively here. +API keys are NEVER stored in config.toml; they are loaded from .env. 
@dataclass(frozen=True)
class VibeCheckConfig:
    """Settings taken from the ``[cv.vibe_check]`` table of config.toml.

    load_config() reads every field from that table without a fallback, so a
    missing key raises KeyError; values are coerced with int() / float().
    """
    top_k_candidates: int
    hist_compare_method: int      # NOTE(review): presumably a cv2.HISTCMP_* constant — confirm
    hist_bins_hue: int
    hist_bins_saturation: int
    phash_max_distance: int       # NOTE(review): presumably a Hamming-distance cutoff — confirm
    crop_top_fraction: float
    crop_bottom_fraction: float
@dataclass(frozen=True)
class LLMConfig:
    """Connection and sampling settings for the text LLM.

    All fields except ``api_key`` come from the ``[llm]`` table of
    config.toml; ``api_key`` is resolved from environment variables by
    load_config() and defaults to an empty string.
    """
    provider: Literal["ollama", "openai", "openrouter"]
    base_url: str
    model: str
    timeout_seconds: int
    temperature: float
    max_tokens: int
    # Loaded from .env — NEVER committed to version control.
    # repr=False keeps the secret out of repr(), and therefore out of log
    # lines and tracebacks that print the config object. Equality and hashing
    # still take the key into account.
    api_key: str = field(default="", repr=False)
def load_config(
    config_path: Path = _DEFAULT_CONFIG_PATH,
    env_path: Path = _DEFAULT_ENV_PATH,
) -> AppConfig:
    """
    Read config.toml (plus .env) and build the immutable AppConfig tree.

    Secrets never come from the TOML file: API keys are taken from the
    process environment, which is first topped up from *env_path* when
    python-dotenv is installed (already-set variables are not overridden).

    Args:
        config_path: TOML file to parse. Defaults to ``config.toml`` two
                     directories above this module.
        env_path:    dotenv file to load. Defaults to ``.env`` in that same
                     directory.

    Returns:
        A fully populated, frozen AppConfig.

    Raises:
        FileNotFoundError: If *config_path* does not exist.
        KeyError / TypeError / ValueError: If a required key is missing or a
            value cannot be coerced to its target type.
    """
    # Populate os.environ from .env before any key is read from it below.
    if _HAS_DOTENV:
        _load_dotenv(dotenv_path=env_path, override=False)

    if not config_path.exists():
        raise FileNotFoundError(
            f"Config file not found: {config_path}\n"
            "Copy config.toml.example to config.toml and adjust your paths."
        )

    with config_path.open("rb") as toml_file:
        raw: dict = tomllib.load(toml_file)

    # Top-level tables; only [vision] is optional.
    project = raw["project"]
    paths_tbl = raw["paths"]
    video_tbl = raw["video"]
    cv_tbl = raw["cv"]
    scene_tbl = raw["scene_detection"]
    whisper_tbl = raw["whisper"]
    llm_tbl = raw["llm"]
    vision_tbl = raw.get("vision", {})
    export_tbl = raw["export"]

    base_dir = config_path.parent

    def _resolve(p: str) -> Path:
        # Relative entries are anchored at the config file's directory so the
        # project can be moved around; absolute entries are kept untouched.
        candidate = Path(p)
        if candidate.is_absolute():
            return candidate
        return (base_dir / candidate).resolve()

    paths = PathsConfig(
        source_movie=_resolve(paths_tbl["source_movie"]),
        reference_trailer=_resolve(paths_tbl["reference_trailer"]),
        output_dir=_resolve(paths_tbl["output_dir"]),
        cache_dir=_resolve(paths_tbl["cache_dir"]),
        proxy_dir=_resolve(paths_tbl["proxy_dir"]),
    )

    video = VideoConfig(
        extract_fps=float(video_tbl["extract_fps"]),
        proxy_width=int(video_tbl["proxy_width"]),
        proxy_height=int(video_tbl["proxy_height"]),
    )

    vibe = cv_tbl["vibe_check"]
    vibe_check = VibeCheckConfig(
        top_k_candidates=int(vibe["top_k_candidates"]),
        hist_compare_method=int(vibe["hist_compare_method"]),
        hist_bins_hue=int(vibe["hist_bins_hue"]),
        hist_bins_saturation=int(vibe["hist_bins_saturation"]),
        phash_max_distance=int(vibe["phash_max_distance"]),
        crop_top_fraction=float(vibe["crop_top_fraction"]),
        crop_bottom_fraction=float(vibe["crop_bottom_fraction"]),
    )

    # Required deep-scan keys are indexed directly; everything read through
    # .get() is optional and falls back to the default given here.
    deep = cv_tbl["deep_scan"]
    deep_scan = DeepScanConfig(
        coarse_step_seconds=float(deep["coarse_step_seconds"]),
        match_threshold=float(deep["match_threshold"]),
        provisional_match_threshold=float(deep.get("provisional_match_threshold", 0.45)),
        coarse_candidate_threshold=float(
            deep.get("coarse_candidate_threshold", deep["match_threshold"])
        ),
        sequence_score_weight=float(deep.get("sequence_score_weight", 0.55)),
        span_score_weight=float(deep.get("span_score_weight", 0.15)),
        coarse_score_weight=float(deep.get("coarse_score_weight", 0.10)),
        duration_score_weight=float(deep.get("duration_score_weight", 0.20)),
        duration_tie_break_score_delta=float(deep.get("duration_tie_break_score_delta", 0.03)),
        min_duration_coverage=float(deep.get("min_duration_coverage", 0.65)),
        continuity_seed_offsets_s=tuple(
            float(offset)
            for offset in deep.get(
                "continuity_seed_offsets_s",
                [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0],
            )
        ),
        scene_seed_top_k=int(deep.get("scene_seed_top_k", 30)),
        scene_seed_points_per_scene=int(deep.get("scene_seed_points_per_scene", 6)),
        content_rerank_candidate_count=int(deep.get("content_rerank_candidate_count", 100)),
        skip_coarse_scan_with_weighted_seeds=bool(
            deep.get("skip_coarse_scan_with_weighted_seeds", False)
        ),
        max_refine_candidates=int(deep.get("max_refine_candidates", 6)),
        match_method=int(deep["match_method"]),
        refine_window_seconds=float(deep.get("refine_window_seconds", 0.6)),
        refine_step_seconds=float(deep["refine_step_seconds"]),
        content_align_window_seconds=float(deep.get("content_align_window_seconds", 0.48)),
        content_align_sample_step_s=float(deep.get("content_align_sample_step_s", 0.28)),
        content_validation_weight=float(deep.get("content_validation_weight", 0.35)),
        provisional_content_threshold=float(deep.get("provisional_content_threshold", 0.42)),
        start_tie_break_score_delta=float(deep.get("start_tie_break_score_delta", 0.015)),
        start_preroll_frames=int(deep.get("start_preroll_frames", 0)),
        sequence_candidate_count=int(deep.get("sequence_candidate_count", 240)),
        sequence_min_distance_s=float(deep.get("sequence_min_distance_s", 1.0)),
        span_sample_step_s=float(deep.get("span_sample_step_s", 0.08)),
        trim_tail_frames=int(deep.get("trim_tail_frames", 2)),
        scene_boundary_epsilon_s=float(deep.get("scene_boundary_epsilon_s", 0.12)),
        scoreable_luma_mean_min=float(deep.get("scoreable_luma_mean_min", 24.0)),
        scoreable_luma_p90_min=float(deep.get("scoreable_luma_p90_min", 58.0)),
        scoreable_contrast_min=float(deep.get("scoreable_contrast_min", 24.0)),
    )

    scene_detection = SceneDetectionConfig(
        content_threshold=float(scene_tbl["content_threshold"]),
        min_scene_duration_s=float(scene_tbl["min_scene_duration_s"]),
    )

    whisper = WhisperConfig(
        model=whisper_tbl["model"],
        language=whisper_tbl["language"],
        device=whisper_tbl["device"],
        compute_type=whisper_tbl["compute_type"],
    )

    # LLM key lookup, first non-empty value wins:
    #   OPENROUTER_API_KEY  (provider = openrouter)
    #   OPENAI_API_KEY      (provider = openai)
    #   LLM_API_KEY         (any provider, fallback)
    llm_provider = llm_tbl["provider"]
    if llm_provider == "openrouter":
        provider_key = os.environ.get("OPENROUTER_API_KEY", "")
    elif llm_provider == "openai":
        provider_key = os.environ.get("OPENAI_API_KEY", "")
    else:
        provider_key = ""
    llm_api_key = provider_key or os.environ.get("LLM_API_KEY", "")

    llm = LLMConfig(
        provider=llm_provider,
        base_url=llm_tbl["base_url"],
        model=llm_tbl["model"],
        timeout_seconds=int(llm_tbl["timeout_seconds"]),
        temperature=float(llm_tbl["temperature"]),
        max_tokens=int(llm_tbl["max_tokens"]),
        api_key=llm_api_key,
    )

    # Vision inherits the LLM provider only when that provider is one the
    # vision section supports; otherwise it falls back to openrouter.
    inherited_provider = (
        llm_provider if llm_provider in ("openai", "openrouter") else "openrouter"
    )
    vision_provider = vision_tbl.get("provider", inherited_provider)
    vision_key_var = (
        "OPENROUTER_API_KEY" if vision_provider == "openrouter" else "OPENAI_API_KEY"
    )
    vision_api_key = (
        os.environ.get(vision_key_var, "")
        or os.environ.get("VISION_API_KEY", "")
        or os.environ.get("LLM_API_KEY", "")
    )

    # Endpoint, model and timeout default to the [llm] values; every other
    # vision setting has its own literal default.
    vision = VisionConfig(
        enabled=bool(vision_tbl.get("enabled", False)),
        provider=vision_provider,
        base_url=str(vision_tbl.get("base_url", llm.base_url)),
        model=str(vision_tbl.get("model", llm.model)),
        timeout_seconds=int(vision_tbl.get("timeout_seconds", llm.timeout_seconds)),
        temperature=float(vision_tbl.get("temperature", 0.0)),
        max_tokens=int(vision_tbl.get("max_tokens", 350)),
        scene_candidate_top_k=int(vision_tbl.get("scene_candidate_top_k", 8)),
        max_new_descriptions_per_run=int(vision_tbl.get("max_new_descriptions_per_run", 12)),
        max_seed_scenes=int(vision_tbl.get("max_seed_scenes", 3)),
        seed_points_per_scene=int(vision_tbl.get("seed_points_per_scene", 12)),
        seed_score=float(vision_tbl.get("seed_score", 0.88)),
        max_refine_candidates=int(vision_tbl.get("max_refine_candidates", 6)),
        local_scan_step_s=float(vision_tbl.get("local_scan_step_s", 0.12)),
        local_scan_max_points_per_scene=int(
            vision_tbl.get("local_scan_max_points_per_scene", 180)
        ),
        local_scan_top_candidates=int(vision_tbl.get("local_scan_top_candidates", 18)),
        local_scan_tie_break_score_delta=float(
            vision_tbl.get("local_scan_tie_break_score_delta", 0.08)
        ),
        multi_shot_cut_corr_threshold=float(
            vision_tbl.get("multi_shot_cut_corr_threshold", 0.20)
        ),
        multi_shot_boundary_tolerance_s=float(
            vision_tbl.get("multi_shot_boundary_tolerance_s", 0.20)
        ),
        fullscan_fallback=bool(vision_tbl.get("fullscan_fallback", False)),
        content_threshold=float(vision_tbl.get("content_threshold", 0.22)),
        similarity_threshold=float(vision_tbl.get("similarity_threshold", 0.18)),
        api_key=vision_api_key,
    )

    export = ExportConfig(
        fcpxml_version=str(export_tbl["fcpxml_version"]),
        edl_frame_rate=float(export_tbl["edl_frame_rate"]),
        output_format=export_tbl["output_format"],
    )

    return AppConfig(
        project_name=project["name"],
        version=project["version"],
        log_level=project["log_level"],
        paths=paths,
        video=video,
        cv=CVConfig(vibe_check=vibe_check, deep_scan=deep_scan),
        scene_detection=scene_detection,
        whisper=whisper,
        llm=llm,
        vision=vision,
        export=export,
    )
@dataclass(frozen=True)
class Scene:
    """
    A single detected scene of the source movie.

    Created from PySceneDetect output; Whisper dialogue and (optionally)
    perceptual fingerprints from the Vibe Check phase are attached later.
    """
    scene_id: int       # zero-based index in source movie
    source_path: Path   # absolute path to the source video file
    start_s: float      # scene start in seconds
    end_s: float        # scene end in seconds
    start_frame: int    # first frame number
    end_frame: int      # last frame number

    # Filled in by Vibe Check fingerprinting
    luma_hist: bytes | None = None   # serialised np.ndarray (pickle)
    sat_hist: bytes | None = None
    phash: str | None = None         # 64-bit hex string

    # Filled in by the Whisper pass
    dialogue: tuple[DialogueLine, ...] = ()

    @property
    def duration_s(self) -> float:
        """Scene length in seconds."""
        return self.end_s - self.start_s

    @property
    def midpoint_s(self) -> float:
        """Timestamp halfway between start and end, in seconds."""
        half = self.duration_s / 2.0
        return self.start_s + half

    def __repr__(self) -> str:
        span = f"{self.start_s:.2f}s–{self.end_s:.2f}s"
        return f"Scene(id={self.scene_id}, {span}, dur={self.duration_s:.2f}s)"
@dataclass(frozen=True)
class MatchResult:
    """
    Confirmed outcome of Phase 2 (Deep Scan — template matching).

    There is one MatchResult per TrailerBeat, holding the best
    frame-accurate hit located in the source movie.
    """
    beat_id: int          # which trailer beat was matched
    scene_id: int         # which source scene contains the match
    source_path: Path     # absolute path to source video

    # Frame-accurate in-point / out-point in the SOURCE movie
    in_point_s: float     # matched frame onset in source seconds
    out_point_s: float    # computed out-point (in_point + beat duration)
    in_point_frame: int   # matched frame number in source movie

    # Match quality
    match_score: float    # cv2.matchTemplate peak value [0.0, 1.0]
    # (x, y) pixel location of the best match within the source frame
    match_location: tuple[int, int] = (0, 0)

    # Provenance
    vibe_hit: VibeHit | None = None   # the candidate that led here
    is_confirmed: bool = True
    segments: tuple[MatchSegment, ...] = ()

    @property
    def duration_s(self) -> float:
        """Length of the matched source span in seconds."""
        return self.out_point_s - self.in_point_s

    def __repr__(self) -> str:
        route = f"beat={self.beat_id} → scene={self.scene_id}"
        quality = f"in={self.in_point_s:.3f}s, score={self.match_score:.3f}"
        return f"MatchResult({route}, {quality})"
@dataclass(frozen=True)
class EditTimeline:
    """
    Ordered run of EditClips that makes up the finished trailer.

    This is the object handed to the export layer (FCPXML / EDL writer).
    """
    title: str
    frame_rate: float               # e.g. 23.976
    clips: tuple[EditClip, ...]     # ordered by clip_index

    @property
    def total_duration_s(self) -> float:
        """Latest ``timeline_end_s`` over all clips; 0.0 for an empty timeline."""
        return max((clip.timeline_end_s for clip in self.clips), default=0.0)

    @property
    def clip_count(self) -> int:
        """Number of clips on the timeline."""
        return len(self.clips)
def _trim_dark_borders(image: Image.Image) -> Image.Image:
    """
    Crop away encoded black matte / pillarbox borders before content scoring.

    A row or column counts as "active" when its 90th-percentile luma exceeds
    18. An axis is only trimmed when enough active lines exist (at least 8 and
    at least 35% of that axis); otherwise the full extent is kept. If the
    resulting box would be smaller than 35% of the frame on either axis the
    original image is returned untouched.
    """
    luma = np.asarray(image.convert("L"), dtype=np.float32)
    if luma.size == 0:
        return image
    height, width = luma.shape[:2]

    def _active_span(signal: np.ndarray, extent: int) -> tuple[int, int]:
        # Bounds of the lit region along one axis, padded by 2 px before and
        # 3 px after (the upper bound is exclusive), clamped to the frame.
        lit = np.where(signal > 18.0)[0]
        if lit.size < max(8, int(extent * 0.35)):
            return 0, extent
        return max(0, int(lit[0]) - 2), min(extent, int(lit[-1]) + 3)

    left, right = _active_span(np.percentile(luma, 90, axis=0), width)
    top, bottom = _active_span(np.percentile(luma, 90, axis=1), height)

    too_narrow = right - left < int(width * 0.35)
    too_short = bottom - top < int(height * 0.35)
    if too_narrow or too_short:
        return image
    return image.crop((left, top, right, bottom))
(arr.std() + 1e-6) + + +def _luma_feature(path: Path, cfg: AppConfig) -> np.ndarray: + image = ImageOps.equalize(_cropped_image(path, cfg)).resize((160, 80)) + arr = np.asarray(image, dtype=np.float32) + return (arr - arr.mean()) / (arr.std() + 1e-6) + + +def _hist_feature(path: Path, cfg: AppConfig) -> np.ndarray: + image = _trim_dark_borders(Image.open(path).convert("RGB")) + w, h = image.size + top = int(h * 0.05) + bottom = int(h * 0.95) + arr = np.asarray(image.crop((0, top, w, bottom)).resize((160, 80)), dtype=np.float32) + hist_parts = [] + for channel in range(3): + hist, _ = np.histogram(arr[:, :, channel], bins=32, range=(0, 255)) + hist = hist.astype(np.float32) + hist_parts.append(hist / (hist.sum() + 1e-6)) + return np.concatenate(hist_parts) + + +def _spatial_hist_feature(path: Path, cfg: AppConfig) -> np.ndarray: + image = _trim_dark_borders(Image.open(path).convert("RGB")) + w, h = image.size + top = int(h * 0.05) + bottom = int(h * 0.95) + arr = np.asarray(image.crop((0, top, w, bottom)).resize((160, 80)), dtype=np.float32) + cells = [] + grid_y = 4 + grid_x = 4 + cell_h = arr.shape[0] // grid_y + cell_w = arr.shape[1] // grid_x + for gy in range(grid_y): + for gx in range(grid_x): + cell = arr[gy * cell_h:(gy + 1) * cell_h, gx * cell_w:(gx + 1) * cell_w, :] + for channel in range(3): + hist, _ = np.histogram(cell[:, :, channel], bins=16, range=(0, 255)) + hist = hist.astype(np.float32) + cells.append(hist / (hist.sum() + 1e-6)) + return np.concatenate(cells) + + +def _is_dark(path: Path, cfg: AppConfig) -> bool: + image = _trim_dark_borders(Image.open(path).convert("L")) + w, h = image.size + top = int(h * 0.05) + bottom = int(h * 0.95) + arr = np.asarray(image.crop((0, top, w, bottom)), dtype=np.float32) + return float(arr.mean()) < 28.0 and float(np.percentile(arr, 90)) < 58.0 + + +def _corr(a: np.ndarray, b: np.ndarray) -> float: + return float((a * b).mean()) + + +def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float: + return 
def align_cached_match_by_content(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
    fps: float = 25.0,
) -> tuple[float, float]:
    """
    Measure the local source offset directly from rendered frame content.

    This is intentionally independent from the global OpenCV matcher: it only
    needs FFmpeg, Pillow and numpy, and it scans a small window around an
    already plausible candidate.

    Both the trailer beat and a padded window of the source movie are rendered
    to PNG frames in a scratch directory. Every possible start frame inside the
    source window is then scored against a sample of reference frames.

    Args:
        beat: Trailer beat whose frames serve as the reference.
        estimated_in_point_s: Candidate in-point in the source movie (seconds).
        cfg: Application configuration (paths and deep-scan settings).
        search_window_s: Seconds searched on either side of the estimate;
            defaults to ``cfg.cv.deep_scan.content_align_window_seconds``.
        fps: Rate at which both clips are rendered to frames.

    Returns:
        ``(in_point_s, score)``. If either extraction produces no frames the
        untouched estimate and ``0.0`` are returned. If no candidate collects
        enough paired scores, the estimate snapped to the frame grid and
        ``0.0`` are returned.

    Raises:
        RuntimeError: Propagated from frame extraction when ffmpeg fails.
    """
    deep_scan_cfg = cfg.cv.deep_scan
    window_s = (
        search_window_s
        if search_window_s is not None
        else deep_scan_cfg.content_align_window_seconds
    )
    sample_step_s = max(1.0 / fps, deep_scan_cfg.content_align_sample_step_s)
    source_start_s = max(0.0, estimated_in_point_s - window_s)
    source_duration_s = beat.duration_s + (2.0 * window_s) + 0.5

    tmp = cfg.paths.output_dir / "align_tmp" / f"beat_{beat.beat_id:03d}"
    shutil.rmtree(tmp, ignore_errors=True)
    tmp.mkdir(parents=True, exist_ok=True)
    try:
        ref_dir = tmp / "ref"
        src_dir = tmp / "src"
        _extract_frames(beat.trailer_path, beat.start_s, beat.duration_s, fps, ref_dir, "ref")
        _extract_frames(cfg.paths.source_movie, source_start_s, source_duration_s, fps, src_dir, "src")

        ref_frames = sorted(ref_dir.glob("ref_*.png"))
        src_frames = sorted(src_dir.glob("src_*.png"))
        if not ref_frames or not src_frames:
            return estimated_in_point_s, 0.0

        sample_frame_step = max(1, int(round(sample_step_s * fps)))
        # The last ~0.24 s of the beat is excluded from matching.
        min_matchable_frames = max(1, len(ref_frames) - int(round(0.24 * fps)))
        all_offsets = range(0, min_matchable_frames, sample_frame_step)

        # Prefer non-dark reference frames. Only the frame offsets are kept:
        # scoring re-reads the frames by path, so pre-computing edge features
        # here would be discarded work.
        template_offsets: list[int] = [
            idx for idx in all_offsets if not _is_dark(ref_frames[idx], cfg)
        ]
        if len(template_offsets) < 3:
            # Too few bright samples: fall back to every sampled frame.
            template_offsets = list(all_offsets)

        # A candidate needs at least this many paired scores to be considered.
        # NOTE: beats yielding fewer than 3 sampled offsets can therefore never
        # produce a score and always fall through to the estimate.
        min_required_scores = max(3, math.ceil(len(template_offsets) * 0.65))

        src_count = len(src_frames)
        search_end_frame = max(0, src_count - min_matchable_frames)
        estimated_frame = int(round((estimated_in_point_s - source_start_s) * fps))
        best_frame = estimated_frame
        best_score = -1.0

        for candidate_frame in range(search_end_frame + 1):
            scores: list[float] = []
            for offset_frame in template_offsets:
                src_idx = candidate_frame + offset_frame
                if src_idx >= src_count:
                    # Offsets are ascending, so later ones overrun as well.
                    break
                scores.append(
                    _paired_frame_score(ref_frames[offset_frame], src_frames[src_idx], cfg)
                )
            if len(scores) < min_required_scores:
                continue

            # Blend mean and worst-case so a single bad frame pulls the
            # candidate down.
            avg_score = sum(scores) / len(scores)
            min_score = min(scores)
            score = (avg_score * 0.68) + (min_score * 0.32)
            if score > best_score + 0.003:
                best_score = score
                best_frame = candidate_frame
            elif (
                score >= best_score - 0.003
                and abs(candidate_frame - estimated_frame) < abs(best_frame - estimated_frame)
            ):
                # Near-tie: prefer the candidate closer to the estimate while
                # keeping the reference score unchanged.
                best_frame = candidate_frame

        return source_start_s + (best_frame / fps), max(0.0, best_score)
    finally:
        shutil.rmtree(tmp, ignore_errors=True)
+""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Sequence + +import cv2 +import numpy as np + +from src.core.config import AppConfig +from src.core.models import MatchResult, Scene, TrailerBeat, VibeHit +from src.cv.fingerprinting import text_safe_crop +from src.cv.frame_extractor import ( + grab_frame_at, + grab_frame_at_path, + iter_frames_stepped, + open_video, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Template preparation +# --------------------------------------------------------------------------- + +def _prepare_template( + trailer_beat: TrailerBeat, + cfg: AppConfig, + proxy_w: int, + proxy_h: int, +) -> np.ndarray | None: + """ + Extract, crop, and resize the representative frame from the trailer beat. + + This frame becomes the cv2.matchTemplate "needle". + """ + vc = cfg.cv.vibe_check + ds = cfg.cv.deep_scan + + beat_frame = grab_frame_at_path( + trailer_beat.trailer_path, + trailer_beat.midpoint_s, + ) + if beat_frame is None: + logger.warning("Beat %d: cannot decode midpoint frame.", trailer_beat.beat_id) + return None + + cropped = text_safe_crop(beat_frame, vc.crop_top_fraction, vc.crop_bottom_fraction) + resized = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA) + + # Crop the template by 10% on all sides to allow sliding window (translation invariance) + # when matching against the source movie, which might have slight pan/scan shifts. 
def _match_frame(
    source_frame: np.ndarray,
    template: np.ndarray,
    method: int,
    proxy_w: int,
    proxy_h: int,
    crop_top: float,
    crop_bottom: float,
) -> tuple[float, tuple[int, int]]:
    """
    Locate *template* inside one source frame with cv2.matchTemplate.

    The frame is text-safe cropped and resized to the proxy resolution first,
    so the (smaller) template can slide inside it.

    Returns:
        (score, (x, y)): the maximum of the response map and its location.
        For normalised correlation methods such as TM_CCOEFF_NORMED the score
        lies in [-1, 1].

    NOTE(review): the maximum is always used, which assumes a "higher is
    better" method; TM_SQDIFF variants would need the minimum instead.
    """
    safe_region = text_safe_crop(source_frame, crop_top, crop_bottom)
    proxy = cv2.resize(safe_region, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA)

    # Slide the slightly smaller template across the full proxy frame.
    response = cv2.matchTemplate(proxy, template, method)
    extrema = cv2.minMaxLoc(response)  # (min_val, max_val, min_loc, max_loc)
    peak_value = extrema[1]
    peak_x, peak_y = extrema[3]
    return float(peak_value), (int(peak_x), int(peak_y))
def run_deep_scan(
    beat: TrailerBeat,
    candidates: Sequence[VibeHit],
    scenes_by_id: dict[int, Scene],
    cfg: AppConfig,
) -> MatchResult | None:
    """
    Phase 2 Deep Scan: iterate over Vibe Check candidates and template-match.

    Args:
        beat:         The trailer beat to source.
        candidates:   Ranked VibeHit list from Phase 1 (best first).
        scenes_by_id: Lookup dict: scene_id → Scene.
        cfg:          Application configuration.

    Returns:
        The highest-scoring MatchResult among the scanned candidates, or None
        when the template cannot be built or no scene produced a hit. Scanning
        stops early at the first candidate scoring >= 0.90.
    """
    proxy_w = cfg.video.proxy_width
    proxy_h = cfg.video.proxy_height

    template = _prepare_template(beat, cfg, proxy_w, proxy_h)
    if template is None:
        return None

    # Function-scope import kept from the original, but executed once instead
    # of once per hit.
    from src.cv.frame_extractor import get_video_info

    # get_video_info() opens the video file; cache the frame rate per source
    # so that many hits in the same movie do not reopen it every time.
    fps_by_source: dict[Path, float] = {}
    best_result: MatchResult | None = None

    for vibe_hit in candidates:
        scene = scenes_by_id.get(vibe_hit.scene_id)
        if scene is None:
            logger.warning("VibeHit references unknown scene_id=%d", vibe_hit.scene_id)
            continue

        hit = scan_scene(beat, scene, template, cfg)
        if hit is None:
            continue

        in_point_s, match_score, match_loc = hit

        # Frame number: approximate via FPS (refined later if needed).
        fps = fps_by_source.get(scene.source_path)
        if fps is None:
            # A reported rate of 0 falls back to 24 fps.
            fps = float(get_video_info(scene.source_path)["fps"]) or 24.0
            fps_by_source[scene.source_path] = fps
        in_point_frame = int(in_point_s * fps)

        candidate_result = MatchResult(
            beat_id=beat.beat_id,
            scene_id=scene.scene_id,
            source_path=scene.source_path,
            in_point_s=in_point_s,
            out_point_s=in_point_s + beat.duration_s,
            in_point_frame=in_point_frame,
            match_score=match_score,
            match_location=match_loc,
            vibe_hit=vibe_hit,
        )

        if best_result is None or match_score > best_result.match_score:
            best_result = candidate_result

        # Early exit: if score is very high, no need to check other candidates.
        if match_score >= 0.90:
            logger.info(
                "Beat %d: early-exit match (score=%.3f) in scene %d @%.3fs",
                beat.beat_id, match_score, scene.scene_id, in_point_s,
            )
            break

    if best_result is not None:
        logger.info("Beat %d → MATCH scene=%d score=%.3f in=%.3fs",
                    beat.beat_id, best_result.scene_id,
                    best_result.match_score, best_result.in_point_s)
    else:
        logger.warning("Beat %d → NO MATCH found in %d candidates.",
                       beat.beat_id, len(candidates))

    return best_result
def text_safe_crop(
    frame: np.ndarray,
    crop_top: float,
    crop_bottom: float,
) -> np.ndarray:
    """
    Drop a band from the top and from the bottom of a frame.

    Title cards and logos (top) and letterbox bars or subtitles (bottom) are
    cut away before any colour analysis so they cannot cause false positives.

    Args:
        frame:       BGR or greyscale frame as (H, W[, C]) ndarray.
        crop_top:    Fraction [0, 1) of the height removed from the top.
        crop_bottom: Fraction [0, 1) of the height removed from the bottom.

    Returns:
        A row slice of *frame* (a view, not a copy).

    Raises:
        ValueError: If either fraction is outside [0, 1) or together they
            would remove the whole frame.
    """
    if not 0.0 <= crop_top < 1.0:
        raise ValueError(f"crop_top must be in [0, 1); got {crop_top}")
    if not 0.0 <= crop_bottom < 1.0:
        raise ValueError(f"crop_bottom must be in [0, 1); got {crop_bottom}")
    if not crop_top + crop_bottom < 1.0:
        raise ValueError(
            f"crop_top ({crop_top}) + crop_bottom ({crop_bottom}) must be < 1.0"
        )

    rows = frame.shape[0]
    kept_rows = slice(int(rows * crop_top), int(rows * (1.0 - crop_bottom)))
    return frame[kept_rows]
+ luma_hist = cv2.calcHist( + [luma], [0], None, [bins_luma], [0, 256] + ).flatten().astype(np.float32) + + sat_hist = cv2.calcHist( + [hsv], [1], None, [bins_sat], [0, 256] + ).flatten().astype(np.float32) + + # L2-normalise so scene size doesn't affect scores + cv2.normalize(luma_hist, luma_hist, alpha=1.0, norm_type=cv2.NORM_L2) + cv2.normalize(sat_hist, sat_hist, alpha=1.0, norm_type=cv2.NORM_L2) + + return luma_hist, sat_hist + + +def compare_histograms( + hist_a: np.ndarray, + hist_b: np.ndarray, + method: int, +) -> float: + """ + Compare two histograms using cv2.compareHist. + + Args: + hist_a, hist_b: 1-D float32 ndarrays of identical shape. + method: cv2.HISTCMP_* constant (e.g. cv2.HISTCMP_CORREL = 0). + + Returns: + Raw score from cv2.compareHist (range depends on method). + For CORREL: [-1, 1], higher = more similar. + For BHATTACHARYYA: [0, 1], lower = more similar. + """ + return float(cv2.compareHist(hist_a, hist_b, method)) + + +# --------------------------------------------------------------------------- +# Perceptual Hash +# --------------------------------------------------------------------------- + +def compute_phash(frame_bgr: np.ndarray, hash_size: int = 8) -> str: + """ + Compute a perceptual hash (pHash) of a BGR frame. + + pHash is rotation- and scale-invariant; it catches visual similarity + even when resolution differs between trailer proxy and source movie. + + Args: + frame_bgr: BGR frame (H, W, 3) uint8. + hash_size: DCT block size; 8 → 64-bit hash (default). + + Returns: + Hex string representation of the 64-bit hash (e.g. "f8e0e0e0..."). + + Raises: + RuntimeError: If imagehash is not installed. + """ + if not _HAS_IMAGEHASH: + raise RuntimeError( + "imagehash is not installed. 
def bytes_to_hist(data: bytes) -> np.ndarray:
    """
    Rebuild a histogram array from its pickled byte representation.

    Unpickling can execute arbitrary code, so this must only be given bytes
    this application wrote itself, never data from an untrusted source.
    """
    restored = pickle.loads(data)  # noqa: S301 (trusted internal cache only)
    return restored
+ + Returns: + (luma_hist_bytes, sat_hist_bytes, phash_hex) + """ + cropped = text_safe_crop(frame_bgr, cfg.crop_top_fraction, cfg.crop_bottom_fraction) + luma_hist, sat_hist = extract_hs_histograms(cropped, cfg.hist_bins_hue, cfg.hist_bins_saturation) + phash_hex = compute_phash(cropped) + + return hist_to_bytes(luma_hist), hist_to_bytes(sat_hist), phash_hex diff --git a/src/cv/frame_extractor.py b/src/cv/frame_extractor.py new file mode 100644 index 0000000..5cedd19 --- /dev/null +++ b/src/cv/frame_extractor.py @@ -0,0 +1,172 @@ +""" +src/cv/frame_extractor.py — Low-level video frame access + +Responsibility: + Provide a thin, testable wrapper around cv2.VideoCapture for: + - seeking to an exact timestamp and returning one BGR frame + - iterating frames with a configurable step size + - extracting the "representative" middle frame of a Scene / TrailerBeat + +No fingerprinting, no matching — only raw frame delivery. +""" + +from __future__ import annotations + +import logging +from contextlib import contextmanager +from pathlib import Path +from typing import Generator, Iterator + +import cv2 +import numpy as np + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Context-managed VideoCapture +# --------------------------------------------------------------------------- + +@contextmanager +def open_video(path: Path) -> Generator[cv2.VideoCapture, None, None]: + """ + Context manager that opens a VideoCapture and guarantees release. + + Args: + path: Absolute path to the video file. + + Raises: + FileNotFoundError: If the file does not exist. + RuntimeError: If OpenCV cannot open the file. 
def get_video_info(path: Path) -> dict[str, float | int]:
    """
    Read basic stream metadata; the file is closed again before returning.

    Returns:
        dict with keys: fps, frame_count, duration_s, width, height.
        ``duration_s`` is 0.0 when the reported frame rate is not positive.
    """
    with open_video(path) as capture:
        fps = capture.get(cv2.CAP_PROP_FPS)
        frame_count, width, height = (
            int(capture.get(prop))
            for prop in (
                cv2.CAP_PROP_FRAME_COUNT,
                cv2.CAP_PROP_FRAME_WIDTH,
                cv2.CAP_PROP_FRAME_HEIGHT,
            )
        )

    if fps > 0:
        duration_s = frame_count / fps
    else:
        duration_s = 0.0

    return dict(
        fps=fps,
        frame_count=frame_count,
        duration_s=duration_s,
        width=width,
        height=height,
    )
def iter_frames_stepped(
    cap: cv2.VideoCapture,
    start_s: float,
    end_s: float,
    step_s: float,
) -> Iterator[tuple[float, np.ndarray]]:
    """
    Sample frames at a fixed time step across [start_s, end_s] (inclusive).

    Timestamps whose frame cannot be decoded are skipped without error.
    Because this is a generator, the step validation below only runs once
    iteration starts.

    Args:
        cap:     Open VideoCapture.
        start_s: Scan window start in seconds.
        end_s:   Scan window end in seconds.
        step_s:  Step between samples in seconds.

    Yields:
        (timestamp_s, bgr_frame)

    Raises:
        ValueError: If *step_s* is not strictly positive.
    """
    if step_s <= 0:
        raise ValueError(f"step_s must be > 0; got {step_s}")

    cursor = start_s
    while cursor <= end_s:
        if (frame := grab_frame_at(cap, cursor)) is not None:
            yield cursor, frame
        # Rounded to microseconds to curb float accumulation drift.
        # NOTE(review): a step below ~5e-7 s would round away and never advance.
        cursor = round(cursor + step_s, 6)
def _fixed_position_score(frame: np.ndarray, template: np.ndarray, cfg: AppConfig) -> float:
    """
    Score *frame* against *template* at a fixed position (no sliding).

    The frame is converted to the same centre-cropped matching feature as the
    template and resized to the template's shape when necessary, so the
    normalised correlation yields a single value.

    Delegates to ``_fixed_feature`` and ``_corr_same_size`` rather than
    repeating their logic. This also gives the same NaN guard as the sibling
    helper: an undefined correlation is reported as 0.0 instead of NaN.
    """
    fixed = _fixed_feature(frame, template.shape, cfg)
    return _corr_same_size(fixed, template)
h = frame.shape[0] + return frame[int(h * 0.05):int(h * 0.95), :] + + +def _trim_dark_borders(frame: np.ndarray) -> np.ndarray: + """ + Remove encoded black matte/pillarbox borders before fixed-position checks. + + The reference trailer can contain vertical black bars while the source movie + does not. Whole-frame spatial validation should compare picture content, not + container matte. + """ + if frame.size == 0: + return frame + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + h, w = gray.shape[:2] + col_signal = np.percentile(gray, 90, axis=0) + row_signal = np.percentile(gray, 90, axis=1) + active_cols = np.where(col_signal > 18.0)[0] + active_rows = np.where(row_signal > 18.0)[0] + if active_cols.size >= max(8, int(w * 0.35)): + x0 = max(0, int(active_cols[0]) - 2) + x1 = min(w, int(active_cols[-1]) + 3) + else: + x0, x1 = 0, w + if active_rows.size >= max(8, int(h * 0.35)): + y0 = max(0, int(active_rows[0]) - 2) + y1 = min(h, int(active_rows[-1]) + 3) + else: + y0, y1 = 0, h + if x1 - x0 < int(w * 0.35) or y1 - y0 < int(h * 0.35): + return frame + return frame[y0:y1, x0:x1] + + +def _fixed_luma_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: + cropped = _validation_crop(frame) + gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) + gray = cv2.equalizeHist(gray) + resized = cv2.resize(gray, (160, 80), interpolation=cv2.INTER_AREA).astype(np.float32) + return (resized - float(np.mean(resized))) / (float(np.std(resized)) + 1e-6) + + +def _fixed_edge_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: + cropped = _validation_crop(frame) + gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) + gray = cv2.equalizeHist(gray) + edges = cv2.Canny(gray, 60, 140) + resized = cv2.resize(edges, (160, 80), interpolation=cv2.INTER_AREA).astype(np.float32) + return (resized - float(np.mean(resized))) / (float(np.std(resized)) + 1e-6) + + +def _fixed_hist_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: + cropped = _validation_crop(frame) + resized = 
def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float:
    """
    Return the ratio of summed element-wise minima to summed element-wise maxima.

    Arrays of different shapes are treated as non-comparable and score 0.0.
    A small epsilon in the denominator avoids division by zero.
    """
    if a.shape == b.shape:
        overlap = np.minimum(a, b).sum()
        union = np.maximum(a, b).sum()
        return float(overlap / (union + 1e-6))
    return 0.0
def _prepare_rerank_templates(
    beat: TrailerBeat,
    cfg: AppConfig,
) -> list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]]:
    """
    Build the fixed-content reference features used to rerank candidates.

    Reference frames are sampled at the `_beat_offsets` positions inside the
    matchable part of the beat. Frames that cannot be read, or that are not
    scoreable, are left out.

    Returns:
        (offset_s, features) pairs in sampling order, where features is the
        4-tuple produced by `_fixed_content_features`.
    """
    usable_s = estimate_matchable_reference_duration(beat, cfg)
    sampled: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]] = []
    for rel_s in _beat_offsets(usable_s):
        ref_frame = grab_frame_at_path(beat.trailer_path, beat.start_s + rel_s)
        if ref_frame is None:
            continue
        if not _is_scoreable_reference_frame(ref_frame, cfg):
            continue
        sampled.append((rel_s, _fixed_content_features(ref_frame, cfg)))
    return sampled
def _scene_fps_estimate(scene, cfg: AppConfig) -> float:
    """
    Estimate a scene's frame rate from its frame span and time span.

    Falls back to `cfg.export.edl_frame_rate` when the scene has no positive
    duration or no positive frame count.
    """
    span_s = float(scene.end_s) - float(scene.start_s)
    frames = int(scene.end_frame) - int(scene.start_frame)
    if span_s > 0.0 and frames > 0:
        return frames / span_s
    return cfg.export.edl_frame_rate
def _rerank_candidates_by_content(
    beat: TrailerBeat,
    candidates: list[tuple[float, float]],
    cfg: AppConfig,
    scenes: Sequence | None = None,
    matchable_duration_s: float | None = None,
) -> list[tuple[float, float, float]]:
    """
    Reorder coarse candidates by fixed-content similarity and scene coverage.

    Args:
        beat: Trailer beat the candidates belong to.
        candidates: (coarse_score, in_point_s) pairs.
        cfg: Application configuration (source movie path comes from here).
        scenes: Optional source scene list; enables the coverage term.
        matchable_duration_s: Optional matchable beat duration; the coverage
            term is only applied when this is positive and scenes are given.

    Returns:
        (rank_score, coarse_score, in_point_s) tuples sorted by descending
        rank_score. When no rerank templates can be built, each candidate's
        coarse score doubles as its rank score and the input order is kept.
    """
    ref_templates = _prepare_rerank_templates(beat, cfg)
    if not ref_templates:
        return [(score, score, t_sec) for score, t_sec in candidates]

    # The coverage term is the same for every candidate, so decide once.
    coverage_applies = (
        scenes is not None
        and matchable_duration_s is not None
        and matchable_duration_s > 0
    )

    ranked: list[tuple[float, float, float]] = []
    with open_video(cfg.paths.source_movie) as cap:
        for coarse_score, t_sec in candidates:
            content_score = _fixed_content_sequence_score(cap, t_sec, ref_templates, cfg)
            if coverage_applies:
                covered_s = _contiguous_scene_coverage_duration(
                    beat,
                    t_sec,
                    scenes,
                    matchable_duration_s,
                    cfg,
                )
                coverage_score = min(1.0, covered_s / matchable_duration_s)
            else:
                coverage_score = 1.0
            rank_score = (
                content_score * 0.62
                + coarse_score * 0.18
                + coverage_score * 0.20
            )
            ranked.append((rank_score, coarse_score, t_sec))

    ranked.sort(key=lambda entry: entry[0], reverse=True)
    return ranked
def _beat_offsets(duration_s: float) -> list[float]:
    """
    Return sample offsets spread across a beat, always starting at 0.

    Beats under 1 s get three samples, beats under 2.5 s get five, and longer
    beats get six. No sample is placed at the very end of the beat.
    """
    if duration_s < 1.0:
        # The leading edge is a literal 0.0 here rather than a scaled value.
        return [0.0, *(duration_s * frac for frac in (0.35, 0.70))]

    if duration_s < 2.5:
        fractions = (0.00, 0.15, 0.35, 0.55, 0.78)
    else:
        fractions = (0.00, 0.12, 0.30, 0.50, 0.70, 0.88)
    return [duration_s * frac for frac in fractions]
def estimate_matchable_reference_duration(
    beat: TrailerBeat,
    cfg: AppConfig,
    sample_step_s: float | None = None,
) -> float:
    """
    Estimate the part of a trailer beat that should be source-matchable.

    Trailer beats often include trailing black/title/credit frames that do not
    exist in the source movie. Those frames should not force the source match to
    cover the full beat duration.

    Args:
        beat: Trailer beat to inspect (path, start and duration are read).
        cfg: Application configuration.
        sample_step_s: Spacing between sampled reference frames in seconds.
            Defaults to `cfg.cv.deep_scan.span_sample_step_s`. Non-positive
            values are replaced by a 0.04 s step.

    Returns:
        The matchable duration in seconds. This is the full beat duration when
        no frame could be sampled or no dark run follows visible content;
        otherwise the start of the first dark run after visible content plus
        one sampling step, capped at the beat duration.
    """
    step_s = sample_step_s if sample_step_s is not None else cfg.cv.deep_scan.span_sample_step_s
    if step_s <= 0.0:
        # A zero or negative step never advances `t`, so the sampling loop
        # below would spin forever. Fall back to the same 0.04 s floor that
        # the dense local scan applies to its own step.
        step_s = 0.04

    samples: list[tuple[float, bool]] = []
    t = 0.0
    while t <= beat.duration_s:
        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
        if frame is not None:
            samples.append((t, _is_dark_reference_frame(frame, cfg)))
        # Rounding keeps accumulated float error out of the sample offsets.
        t = round(t + step_s, 6)

    if not samples:
        return beat.duration_s

    dark_run_start: float | None = None
    saw_visible = False
    min_dark_break_s = max(0.24, step_s * 2.0)
    for offset_s, is_dark in samples:
        if not is_dark:
            saw_visible = True
            dark_run_start = None
            continue

        # Leading darkness (before any visible frame) is ignored; only a dark
        # run that follows picture content can end the matchable span.
        if saw_visible:
            if dark_run_start is None:
                dark_run_start = offset_s
            if offset_s - dark_run_start >= min_dark_break_s:
                break

    if dark_run_start is None:
        return beat.duration_s

    # Keep a small buffer before the first sustained dark/title break so the
    # source clip does not visibly end before the trailer begins its fade/card.
    # Long beats can contain later credit/title islands; those should not force
    # one source clip to validate unrelated images.
    return max(step_s, min(beat.duration_s, dark_run_start + step_s))
def _content_alignment_templates(
    beat: TrailerBeat,
    cfg: AppConfig,
) -> list[tuple[float, np.ndarray]]:
    """
    Build densely sampled reference templates for local in-point alignment.

    Offsets start at 0 and advance by the content-align step (never finer than
    one EDL frame) up to one step before the end of the matchable span; that
    final position is added explicitly when the stepping does not land on it.
    Unreadable and non-scoreable frames are skipped. If nothing usable
    remains, the sparser `_prepare_beat_templates` set is returned instead.
    """
    matchable_s = estimate_matchable_reference_duration(
        beat,
        cfg,
        sample_step_s=cfg.cv.deep_scan.content_align_sample_step_s,
    )
    step_s = max(1.0 / cfg.export.edl_frame_rate, cfg.cv.deep_scan.content_align_sample_step_s)
    last_offset_s = max(0.0, min(beat.duration_s, matchable_s) - step_s)

    offsets = [0.0]
    cursor = step_s
    while cursor <= last_offset_s:
        offsets.append(round(cursor, 6))
        cursor = round(cursor + step_s, 6)
    needs_tail_sample = matchable_s > step_s and offsets[-1] < last_offset_s
    if needs_tail_sample:
        offsets.append(round(last_offset_s, 6))

    templates: list[tuple[float, np.ndarray]] = []
    for offset_s in offsets:
        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s)
        if frame is None or not _is_scoreable_reference_frame(frame, cfg):
            continue
        templates.append((offset_s, _prepare_template(frame, cfg)))

    return templates or _prepare_beat_templates(beat, cfg)
def align_in_point_by_content(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
) -> tuple[float, float]:
    """
    Find the frame offset directly from image content around a rough match.

    This is deliberately local: once a candidate shot is plausible, scanning a
    small window around it with many reference frames is faster and more robust
    than repeating a global scan or applying a fixed frame preroll.

    Args:
        beat: Trailer beat whose reference frames are matched.
        estimated_in_point_s: Rough source in-point (seconds) to search around.
        cfg: Application configuration.
        search_window_s: Half-width of the search window in seconds. Defaults
            to `cfg.cv.deep_scan.content_align_window_seconds`.

    Returns:
        (best_in_point_s, best_score). When no alignment templates exist the
        estimate is returned unchanged with a score of 0.0. The score is
        clamped to be non-negative.
    """
    templates = _content_alignment_templates(beat, cfg)
    if not templates:
        return estimated_in_point_s, 0.0

    with open_video(cfg.paths.source_movie) as cap:
        # Fall back to the EDL frame rate when the capture reports 0 fps.
        fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate
        frame_step_s = 1.0 / fps
        window_s = (
            search_window_s
            if search_window_s is not None
            else cfg.cv.deep_scan.content_align_window_seconds
        )
        # The window is symmetric around the estimate but never starts before 0.
        start_s = max(0.0, estimated_in_point_s - window_s)
        end_s = estimated_in_point_s + window_s
        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta

        best_in = estimated_in_point_s
        best_score = -1.0
        t = start_s
        while t <= end_s:
            score = _content_alignment_score(cap, t, templates, cfg)
            if score > best_score + tie_delta:
                # Clearly better than anything seen so far: take it outright.
                best_score = score
                best_in = t
            elif score >= best_score - tie_delta and abs(t - estimated_in_point_s) < abs(best_in - estimated_in_point_s):
                # Near-tie: prefer the position closer to the rough estimate.
                # best_score is deliberately left at the true maximum, so the
                # returned score can be slightly above the score at best_in.
                best_in = t
            # Step one source frame; rounding limits float drift.
            t = round(t + frame_step_s, 6)

    return best_in, max(0.0, best_score)
def refine_timestamp(template: np.ndarray, t_sec: float, cfg: AppConfig) -> float:
    """
    Refine a coarse source timestamp by frame-stepping around it.

    Every source frame within +/-0.5 s of `t_sec` is scored against `template`
    with `_match_score`. A later frame only replaces the current best when it
    beats it by more than `cfg.cv.deep_scan.start_tie_break_score_delta`.

    Args:
        template: Reference feature image to match.
        t_sec: Coarse timestamp in the source movie, in seconds.
        cfg: Application configuration (source movie path, tie-break delta,
            fallback frame rate).

    Returns:
        The best-scoring timestamp in seconds, or `t_sec` when no frame in the
        window could be read.
    """
    best_score = -1.0
    best_t = t_sec
    tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta

    with open_video(cfg.paths.source_movie) as cap:
        # The capture can report 0 fps when the backend cannot provide it;
        # dividing by that would raise ZeroDivisionError. Use the same EDL
        # frame-rate fallback as the other source-stepping functions here.
        fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate
        step = 1.0 / fps
        start_t = max(0.0, t_sec - 0.5)
        end_t = t_sec + 0.5

        t = start_t
        while t <= end_t:
            frame = grab_frame_at(cap, t)
            if frame is not None:
                max_val = _match_score(frame, template, cfg)
                if max_val > best_score + tie_delta:
                    best_score = max_val
                    best_t = t
                elif max_val >= best_score - tie_delta and t < best_t:
                    # Near-tie: keep the earlier timestamp.
                    best_t = t
            t += step

    return best_t
min_distance_s: float, +) -> list[tuple[float, float]]: + """ + Keep diverse coarse candidates as (score, midpoint_time). + + A single best midpoint frame is too brittle: repeated actors, similar color + palettes, cars, forests, and title-card darkness can all create plausible + false positives. Keeping a ranked pool lets the multi-frame sequence pass + choose the temporally consistent match. + """ + for idx, (old_score, old_t) in enumerate(candidates): + if abs(old_t - t_sec) < min_distance_s: + if score > old_score: + candidates[idx] = (score, t_sec) + return sorted(candidates, key=lambda item: item[0], reverse=True)[:max_candidates] + + candidates.append((score, t_sec)) + return sorted(candidates, key=lambda item: item[0], reverse=True)[:max_candidates] + + +def run_global_scan( + beats: Sequence[TrailerBeat], + cfg: AppConfig, + scenes: Sequence | None = None, + seed_in_points: dict[int, Sequence[SeedPoint]] | None = None, +) -> list[MatchResult]: + logger.info('[Global Scan] Preparing templates for %d beats...', len(beats)) + templates = [] + midpoint_templates = [] + beat_valid = [] + + for b in beats: + bf = grab_frame_at_path(cfg.paths.reference_trailer, b.start_s + (b.end_s - b.start_s)/2) + if bf is None: + midpoint_templates.append(None) + templates.append([]) + beat_valid.append(False) + continue + + midpoint_templates.append(_prepare_template(bf, cfg)) + beat_templates = _prepare_beat_templates(b, cfg) + templates.append(beat_templates) + beat_valid.append(bool(beat_templates)) + + top_candidates: list[list[tuple[float, float]]] = [[] for _ in beats] + seed_candidates: list[list[tuple[float, float]]] = [[] for _ in beats] + has_weighted_seeds = False + for idx, beat in enumerate(beats): + for seed in (seed_in_points or {}).get(beat.beat_id, ()): + if isinstance(seed, tuple): + seed_t = float(seed[0]) + seed_score = max( + cfg.cv.deep_scan.coarse_candidate_threshold, + min(0.99, float(seed[1])), + ) + has_weighted_seeds = True + else: + seed_t = 
float(seed) + seed_score = cfg.cv.deep_scan.coarse_candidate_threshold + seed_candidate = ( + seed_score, + max(0.0, seed_t), + ) + seed_candidates[idx].append(seed_candidate) + top_candidates[idx] = _add_top_candidate( + top_candidates[idx], + seed_candidate[0], + seed_candidate[1], + max_candidates=cfg.cv.deep_scan.sequence_candidate_count, + min_distance_s=cfg.cv.deep_scan.sequence_min_distance_s, + ) + if (seed_in_points or {}).get(beat.beat_id): + logger.info( + 'Beat %d: added %d seeded in-point candidates.', + beat.beat_id, + len((seed_in_points or {}).get(beat.beat_id, ())), + ) + + skip_coarse_scan = ( + cfg.vision.enabled + and cfg.cv.deep_scan.skip_coarse_scan_with_weighted_seeds + and has_weighted_seeds + and all(top_candidates[i] for i, valid in enumerate(beat_valid) if valid) + ) + + if skip_coarse_scan: + logger.info('[Global Scan] Weighted vision seeds present; skipping full FFmpeg coarse scan.') + else: + fps = 2.0 + cmd = [ + 'ffmpeg', '-i', str(cfg.paths.source_movie), + '-vf', f'scale={cfg.video.proxy_width}:{cfg.video.proxy_height},fps={fps}', + '-f', 'image2pipe', '-vcodec', 'rawvideo', '-pix_fmt', 'bgr24', '-' + ] + logger.info('[Global Scan] Streaming %s via FFmpeg (%.1f fps) ...', cfg.paths.source_movie.name, fps) + + p = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.DEVNULL) + frame_size = cfg.video.proxy_width * cfg.video.proxy_height * 3 + frame_idx = 0 + start_t = time.time() + + while True: + raw = p.stdout.read(frame_size) + if len(raw) != frame_size: break + + frame = np.frombuffer(raw, dtype=np.uint8).reshape((cfg.video.proxy_height, cfg.video.proxy_width, 3)) + haystack = _prepare_haystack(frame, cfg) + + for i, beat_templates in enumerate(templates): + if not beat_valid[i]: continue + source_t = frame_idx / fps + for beat_offset_s, template in beat_templates: + res = cv2.matchTemplate(haystack, template, cv2.TM_CCOEFF_NORMED) + _, max_val, _, _ = cv2.minMaxLoc(res) + candidate_in_s = source_t - beat_offset_s + if candidate_in_s < 0.0: + 
continue + + top_candidates[i] = _add_top_candidate( + top_candidates[i], + float(max_val), + candidate_in_s, + max_candidates=cfg.cv.deep_scan.sequence_candidate_count, + min_distance_s=cfg.cv.deep_scan.sequence_min_distance_s, + ) + + frame_idx += 1 + if frame_idx % 1000 == 0: + logger.info('[Global Scan] Processed %d frames (%.1fs movie time)...', frame_idx, frame_idx / fps) + + p.stdout.close() + p.wait() + + logger.info('[Global Scan] Finished streaming %d frames in %.1fs.', frame_idx, time.time() - start_t) + + results = [] + source_info = get_video_info(cfg.paths.source_movie) + source_fps = float(source_info['fps']) or 24.0 + + for i, b in enumerate(beats): + if not beat_valid[i]: continue + + candidates = top_candidates[i] + if not candidates: + continue + + score = float(candidates[0][0]) + + if score >= cfg.cv.deep_scan.coarse_candidate_threshold: + matchable_duration_s = estimate_matchable_reference_duration(b, cfg) + logger.info( + 'Beat %d: refining %d temporal candidates (best offset score %.3f, matchable %.2fs / beat %.2fs).', + b.beat_id, + len(candidates), + score, + matchable_duration_s, + b.duration_s, + ) + + best_result: MatchResult | None = None + best_short_result: MatchResult | None = None + best_short_coverage = -1.0 + best_duration_coverage = -1.0 + best_content_score = -1.0 + rejected_short_candidates = 0 + rejected_content_candidates = 0 + scan_cfg = cfg.cv.deep_scan + content_gate = ( + min(scan_cfg.provisional_content_threshold, cfg.vision.content_threshold) + if skip_coarse_scan and has_weighted_seeds + else scan_cfg.provisional_content_threshold + ) + + candidate_pool = candidates[:scan_cfg.content_rerank_candidate_count] + for seed_candidate in seed_candidates[i]: + candidate_pool = _add_top_candidate( + candidate_pool, + seed_candidate[0], + seed_candidate[1], + max_candidates=scan_cfg.content_rerank_candidate_count + len(seed_candidates[i]), + min_distance_s=scan_cfg.sequence_min_distance_s, + ) + if skip_coarse_scan and 
has_weighted_seeds: + dense_candidates = _dense_weighted_seed_candidates( + b, + seed_candidates[i], + cfg, + scenes, + matchable_duration_s, + ) + for dense_candidate in dense_candidates: + candidate_pool = _add_top_candidate( + candidate_pool, + dense_candidate[0], + dense_candidate[1], + max_candidates=( + scan_cfg.content_rerank_candidate_count + + len(seed_candidates[i]) + + len(dense_candidates) + ), + min_distance_s=max(0.04, cfg.vision.local_scan_step_s * 0.5), + ) + reranked_candidates = _rerank_candidates_by_content( + b, + candidate_pool, + cfg, + scenes=scenes, + matchable_duration_s=matchable_duration_s, + ) + refine_limit = ( + min(scan_cfg.max_refine_candidates, cfg.vision.max_refine_candidates) + if skip_coarse_scan and has_weighted_seeds + else scan_cfg.max_refine_candidates + ) + refine_candidates = [ + (coarse_score, in_point_s) + for _, coarse_score, in_point_s in reranked_candidates[:refine_limit] + ] + validation_templates = _prepare_validation_templates(b, cfg) + logger.info( + 'Beat %d: content-reranked top %d / %d candidates.', + b.beat_id, + len(refine_candidates), + len(candidate_pool), + ) + + for coarse_score, coarse_in_s in refine_candidates: + rough_in_s = coarse_in_s + is_weighted_seed_candidate = ( + skip_coarse_scan + and has_weighted_seeds + and coarse_score > scan_cfg.coarse_candidate_threshold + 0.05 + ) + if midpoint_templates[i] is not None and not is_weighted_seed_candidate: + midpoint_t = coarse_in_s + (b.duration_s / 2) + fine_t = refine_timestamp(midpoint_templates[i], midpoint_t, cfg) + rough_in_s = max(0.0, fine_t - (b.duration_s / 2)) + local_align_window_s = ( + min(cfg.vision.local_scan_step_s, cfg.cv.deep_scan.content_align_window_seconds) + if is_weighted_seed_candidate + else None + ) + refined_in_s, sequence_score = refine_in_point_with_sequence( + b, + rough_in_s, + cfg, + search_window_s=local_align_window_s, + ) + scene = _find_scene_for_time(scenes, refined_in_s, cfg) + scene_fps = 
_source_fps_from_scene(scene) if scene is not None else source_fps + adjusted_in_s = _apply_start_preroll(refined_in_s, scene_fps, cfg) + adjusted_in_s = _clamp_to_scene_start(adjusted_in_s, scene) + scene = _find_scene_for_time(scenes, adjusted_in_s, cfg) + usable_duration_s, span_score = estimate_usable_source_duration(b, adjusted_in_s, cfg) + out_s = adjusted_in_s + usable_duration_s + if scene is not None: + out_s = min(out_s, scene.end_s) + duration_s = max(0.0, out_s - adjusted_in_s) + duration_coverage = min(1.0, duration_s / matchable_duration_s) if matchable_duration_s > 0 else 0.0 + with open_video(cfg.paths.source_movie) as validation_cap: + original_content_score = _fixed_content_sequence_score( + validation_cap, + adjusted_in_s, + validation_templates, + cfg, + ) + content_score = original_content_score + content_in_s, align_content_score = align_in_point_by_content( + b, + adjusted_in_s, + cfg, + search_window_s=( + local_align_window_s + if local_align_window_s is not None + else min(0.8, cfg.cv.deep_scan.content_align_window_seconds) + ), + ) + if abs(content_in_s - adjusted_in_s) <= cfg.cv.deep_scan.content_align_window_seconds: + with open_video(cfg.paths.source_movie) as validation_cap: + aligned_content_score = _fixed_content_sequence_score( + validation_cap, + content_in_s, + validation_templates, + cfg, + ) + if aligned_content_score >= original_content_score + 0.01: + adjusted_in_s = content_in_s + content_score = min(align_content_score, aligned_content_score) + scene = _find_scene_for_time(scenes, adjusted_in_s, cfg) + usable_duration_s = max(0.0, duration_s) + out_s = adjusted_in_s + usable_duration_s + if scene is not None: + out_s = min(out_s, scene.end_s) + duration_s = max(0.0, out_s - adjusted_in_s) + duration_coverage = ( + min(1.0, duration_s / matchable_duration_s) + if matchable_duration_s > 0 else 0.0 + ) + + if is_weighted_seed_candidate and scene is not None and content_score >= content_gate: + contiguous_usable_s = 
_contiguous_scene_coverage_duration( + b, + adjusted_in_s, + scenes, + matchable_duration_s, + cfg, + ) + scene_duration_s = min(b.duration_s, contiguous_usable_s) + if scene_duration_s > duration_s: + usable_duration_s = scene_duration_s + out_s = adjusted_in_s + usable_duration_s + duration_s = usable_duration_s + duration_coverage = ( + min(1.0, duration_s / matchable_duration_s) + if matchable_duration_s > 0 else 0.0 + ) + span_score = max(span_score, content_score) + + final_score = ( + sequence_score * scan_cfg.sequence_score_weight + + span_score * scan_cfg.span_score_weight + + coarse_score * scan_cfg.coarse_score_weight + + duration_coverage * scan_cfg.duration_score_weight + ) + final_score = ( + final_score * (1.0 - scan_cfg.content_validation_weight) + + content_score * scan_cfg.content_validation_weight + ) + if is_weighted_seed_candidate: + vision_provisional_score = ( + content_score * 0.55 + + duration_coverage * 0.33 + + coarse_score * 0.12 + ) + final_score = max(final_score, vision_provisional_score) + if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate: + final_score = min(final_score, content_score) + if content_score < content_gate: + logger.debug( + 'Beat %d rejected by content validation in=%.3fs scene=%s content=%.3f min=%.3f', + b.beat_id, + adjusted_in_s, + scene.scene_id if scene is not None else 'none', + content_score, + content_gate, + ) + rejected_content_candidates += 1 + continue + candidate_result = MatchResult( + beat_id=b.beat_id, + scene_id=scene.scene_id if scene is not None else 0, + source_path=cfg.paths.source_movie, + in_point_s=max(0.0, adjusted_in_s), + out_point_s=out_s, + in_point_frame=int(max(0.0, adjusted_in_s) * source_fps), + match_score=final_score, + ) + + if duration_coverage < scan_cfg.min_duration_coverage: + rejected_short_candidates += 1 + logger.debug( + 'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f', + 
b.beat_id, + adjusted_in_s, + scene.scene_id if scene is not None else 'none', + sequence_score, + span_score, + coarse_score, + content_score, + duration_coverage, + final_score, + ) + long_enough_for_review = duration_s >= max(0.5, matchable_duration_s * 0.45) + visually_plausible = ( + sequence_score >= scan_cfg.provisional_match_threshold + or final_score >= scan_cfg.provisional_match_threshold + ) + if long_enough_for_review and visually_plausible: + if ( + best_short_result is None + or candidate_result.match_score + > best_short_result.match_score + scan_cfg.duration_tie_break_score_delta + or ( + candidate_result.match_score + >= best_short_result.match_score - scan_cfg.duration_tie_break_score_delta + and duration_coverage > best_short_coverage + ) + ): + best_short_result = candidate_result + best_short_coverage = duration_coverage + continue + + logger.debug( + 'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f', + b.beat_id, + adjusted_in_s, + scene.scene_id if scene is not None else 'none', + sequence_score, + span_score, + coarse_score, + content_score, + duration_coverage, + final_score, + ) + + clearly_better_score = ( + best_result is None + or candidate_result.match_score + > best_result.match_score + scan_cfg.duration_tie_break_score_delta + ) + similar_score_better_duration = ( + best_result is not None + and candidate_result.match_score + >= best_result.match_score - scan_cfg.duration_tie_break_score_delta + and duration_coverage > best_duration_coverage + 0.03 + ) + similar_vision_score_earlier_phase = ( + is_weighted_seed_candidate + and best_result is not None + and candidate_result.scene_id == best_result.scene_id + and candidate_result.match_score + >= best_result.match_score - cfg.vision.local_scan_tie_break_score_delta + and content_score >= best_content_score - 0.005 + and duration_coverage >= best_duration_coverage - 0.03 + and candidate_result.in_point_s < 
best_result.in_point_s + ) + similar_vision_score_better_phase = ( + is_weighted_seed_candidate + and best_result is not None + and candidate_result.scene_id == best_result.scene_id + and candidate_result.match_score + >= best_result.match_score - cfg.vision.local_scan_tie_break_score_delta + and content_score > best_content_score + 0.008 + and duration_coverage >= best_duration_coverage - 0.03 + ) + + if ( + clearly_better_score + or similar_score_better_duration + or similar_vision_score_earlier_phase + or similar_vision_score_better_phase + ): + best_result = candidate_result + best_duration_coverage = duration_coverage + best_content_score = content_score + + if best_result is None: + if best_short_result is not None: + logger.warning( + 'Beat %d: using short provisional automatic match scene=%d in=%.3fs dur=%.3fs coverage=%.2f score=%.3f', + b.beat_id, + best_short_result.scene_id, + best_short_result.in_point_s, + best_short_result.duration_s, + best_short_coverage, + best_short_result.match_score, + ) + best_result = best_short_result + best_duration_coverage = best_short_coverage + else: + if rejected_content_candidates > 0 and rejected_short_candidates == 0: + logger.warning( + 'Beat %d: NO MATCH after refinement (%d candidates rejected by content validation)', + b.beat_id, + rejected_content_candidates, + ) + else: + logger.warning( + 'Beat %d: NO MATCH after refinement (%d candidates rejected below %.0f%% duration coverage, %d by content validation)', + b.beat_id, + rejected_short_candidates, + scan_cfg.min_duration_coverage * 100.0, + rejected_content_candidates, + ) + continue + is_confirmed = best_result.match_score >= cfg.cv.deep_scan.match_threshold + if best_result.match_score < cfg.cv.deep_scan.provisional_match_threshold: + logger.warning( + 'Beat %d: NO MATCH after refinement (best final score %.3f, provisional threshold %.3f)', + b.beat_id, + best_result.match_score, + cfg.cv.deep_scan.provisional_match_threshold, + ) + continue + if not 
is_confirmed: + logger.warning( + 'Beat %d: provisional automatic match scene=%d in=%.3fs score=%.3f (confirmed threshold %.3f)', + b.beat_id, + best_result.scene_id, + best_result.in_point_s, + best_result.match_score, + cfg.cv.deep_scan.match_threshold, + ) + + logger.info( + 'Beat %d: best automatic match scene=%d in=%.3fs dur=%.3fs coverage=%.2f score=%.3f', + b.beat_id, + best_result.scene_id, + best_result.in_point_s, + best_result.duration_s, + best_duration_coverage, + best_result.match_score, + ) + + results.append(MatchResult( + beat_id=b.beat_id, + scene_id=best_result.scene_id, + source_path=cfg.paths.source_movie, + in_point_s=best_result.in_point_s, + out_point_s=best_result.out_point_s, + in_point_frame=best_result.in_point_frame, + match_score=best_result.match_score, + is_confirmed=is_confirmed, + )) + else: + logger.warning( + 'Beat %d: NO MATCH (best coarse score %.3f, coarse threshold %.3f)', + b.beat_id, + score, + cfg.cv.deep_scan.coarse_candidate_threshold, + ) + + if skip_coarse_scan and not results and cfg.vision.fullscan_fallback: + logger.warning( + '[Global Scan] Weighted vision-seed pass found no valid matches; retrying with full FFmpeg coarse scan.' + ) + retry_cfg = replace( + cfg, + cv=replace( + cfg.cv, + deep_scan=replace(cfg.cv.deep_scan, skip_coarse_scan_with_weighted_seeds=False), + ), + ) + return run_global_scan(beats, retry_cfg, scenes=scenes, seed_in_points=seed_in_points) + + return results diff --git a/src/cv/scene_indexer.py b/src/cv/scene_indexer.py new file mode 100644 index 0000000..10bb47e --- /dev/null +++ b/src/cv/scene_indexer.py @@ -0,0 +1,229 @@ +""" +src/cv/scene_indexer.py — Source-movie scene segmentation + fingerprinting + +Responsibility: + 1. Run PySceneDetect on the source movie → list of raw scene boundaries + 2. For each scene, extract the midpoint frame and fingerprint it + 3. Optionally run Whisper dialogue on each scene (injected as dependency) + 4. 
Persist results to .cache/ as JSON for fast re-runs + +Returns: list[Scene] with luma_hist, sat_hist, phash populated. +""" + +from __future__ import annotations + +import json +import logging +import pickle +from pathlib import Path +from typing import Callable, Sequence + +import numpy as np + +from src.core.config import AppConfig +from src.core.models import Scene +from src.cv.fingerprinting import fingerprint_frame +from src.cv.frame_extractor import grab_midpoint_frame, open_video + +logger = logging.getLogger(__name__) + +# Type alias for an optional dialogue-injection callback +DialogueCallback = Callable[[Scene], Scene] + + +# --------------------------------------------------------------------------- +# Cache helpers +# --------------------------------------------------------------------------- + +def _cache_path(cfg: AppConfig) -> Path: + p = cfg.paths.cache_dir / "scene_index.json" + p.parent.mkdir(parents=True, exist_ok=True) + return p + + +def _scene_to_dict(s: Scene) -> dict: + return { + "scene_id": s.scene_id, + "source_path": str(s.source_path), + "start_s": s.start_s, + "end_s": s.end_s, + "start_frame": s.start_frame, + "end_frame": s.end_frame, + # histograms serialised as hex so JSON can hold them + "luma_hist": s.luma_hist.hex() if s.luma_hist else None, + "sat_hist": s.sat_hist.hex() if s.sat_hist else None, + "phash": s.phash, + } + + +def _scene_from_dict(d: dict) -> Scene: + return Scene( + scene_id=d["scene_id"], + source_path=Path(d["source_path"]), + start_s=d["start_s"], + end_s=d["end_s"], + start_frame=d["start_frame"], + end_frame=d["end_frame"], + luma_hist=bytes.fromhex(d["luma_hist"]) if d.get("luma_hist") else None, + sat_hist= bytes.fromhex(d["sat_hist"]) if d.get("sat_hist") else None, + phash=d.get("phash"), + ) + + +def _save_cache(scenes: list[Scene], cfg: AppConfig) -> None: + data = [_scene_to_dict(s) for s in scenes] + _cache_path(cfg).write_text(json.dumps(data, indent=2), encoding="utf-8") + logger.info("Scene index 
def _load_cache(cfg: AppConfig) -> list[Scene] | None:
    """
    Load the cached scene index, or return None when it cannot be used.

    Returns None (so the caller re-indexes) when:
      * the cache file does not exist,
      * the file cannot be read or parsed into Scene objects, or
      * the cached scenes were indexed from a different source movie.

    Args:
        cfg: Application configuration (cache directory + source movie path).

    Returns:
        The cached list of Scene objects, or None if no usable cache exists.
    """
    p = _cache_path(cfg)
    if not p.exists():
        return None

    try:
        data = json.loads(p.read_text(encoding="utf-8"))
        scenes = [_scene_from_dict(d) for d in data]
    except Exception as exc:
        # Deliberate best-effort: any unreadable or ill-formed cache simply
        # triggers a fresh index instead of aborting the run.
        logger.warning("Cache corrupt, re-indexing: %s", exc)
        return None

    # The cache file name does not depend on the movie, so without this check
    # an index built for another source file would be silently reused.
    current_movie = Path(cfg.paths.source_movie)
    if any(Path(scene.source_path) != current_movie for scene in scenes):
        logger.warning(
            "Cache %s was built for a different source movie, re-indexing.", p
        )
        return None

    logger.info("Loaded %d scenes from cache (%s)", len(scenes), p)
    return scenes
def _fingerprint_scenes(
    raw_scenes: list[tuple[float, float, int, int]],
    cfg: AppConfig,
) -> list[Scene]:
    """
    Turn raw scene boundaries into Scene objects, fingerprinting each one
    from its midpoint frame.

    A scene whose midpoint frame cannot be decoded is still emitted, just
    without luma/saturation histograms or pHash.
    """
    indexed: list[Scene] = []
    fingerprint_cfg = cfg.cv.vibe_check
    total = len(raw_scenes)

    logger.info("Fingerprinting %d scenes …", total)

    with open_video(cfg.paths.source_movie) as cap:
        for scene_id, bounds in enumerate(raw_scenes):
            start_s, end_s, start_frame, end_frame = bounds
            # Fields shared by fingerprinted and non-fingerprinted scenes.
            base_fields = dict(
                scene_id=scene_id,
                source_path=cfg.paths.source_movie,
                start_s=start_s,
                end_s=end_s,
                start_frame=start_frame,
                end_frame=end_frame,
            )

            midpoint = grab_midpoint_frame(cap, start_s, end_s)
            if midpoint is None:
                logger.warning("Scene %d: midpoint frame decode failed, skipping fingerprint.", scene_id)
                indexed.append(Scene(**base_fields))
                continue

            luma, sat, phash = fingerprint_frame(midpoint, fingerprint_cfg)
            indexed.append(Scene(**base_fields, luma_hist=luma, sat_hist=sat, phash=phash))

            done = scene_id + 1
            if done % 50 == 0:
                logger.info(" … %d / %d scenes fingerprinted", done, total)

    return indexed
+ """ + if not force_reindex: + cached = _load_cache(cfg) + if cached is not None: + if dialogue_callback: + cached = [dialogue_callback(s) for s in cached] + return cached + + raw = _detect_scenes_pyscenedetect(cfg) + scenes = _fingerprint_scenes(raw, cfg) + _save_cache(scenes, cfg) + + if dialogue_callback: + scenes = [dialogue_callback(s) for s in scenes] + + return scenes diff --git a/src/cv/vibe_check.py b/src/cv/vibe_check.py new file mode 100644 index 0000000..ed1d1fd --- /dev/null +++ b/src/cv/vibe_check.py @@ -0,0 +1,190 @@ +""" +src/cv/vibe_check.py — Phase 1: Scene-level histogram / pHash filter + +Responsibility: + Given ONE TrailerBeat (with pre-computed fingerprints) and a list of + source Scenes (also fingerprinted), return the Top-K candidates ranked + by a combined histogram + pHash score. + +This module contains ZERO file I/O and ZERO frame decoding — those live +in the pipeline layer. Input = model objects, output = sorted VibeHit list. +""" + +from __future__ import annotations + +import logging +from dataclasses import replace +from typing import Sequence + +import cv2 +import numpy as np + +from src.core.models import Scene, TrailerBeat, VibeHit +from src.cv.fingerprinting import bytes_to_hist, phash_distance + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Scoring +# --------------------------------------------------------------------------- + +# Weight applied to histogram score vs pHash score in the combined metric. +# pHash gets less weight because it's sensitive to text overlays on source. +_HIST_WEIGHT = 0.70 +_PHASH_WEIGHT = 0.30 +_PHASH_MAX_BITS = 64 # maximum possible Hamming distance + + +def _hist_combined_score( + beat: TrailerBeat, + scene: Scene, + hist_method: int, +) -> float: + """ + Average CORREL score of luma + saturation histograms. + + Returns a value in [-1, 1] (CORREL) or [0, 1] depending on method. 
def _phash_score(beat: TrailerBeat, scene: Scene) -> float:
    """
    Convert Hamming distance to a [0, 1] similarity score.

    0 Hamming distance  → 1.0 (identical)
    64 Hamming distance → 0.0 (completely different)

    Returns 0.0 when either side has no pHash.
    """
    if beat.phash is None or scene.phash is None:
        return 0.0
    dist = phash_distance(beat.phash, scene.phash)
    return 1.0 - (dist / _PHASH_MAX_BITS)


def _combined_score(
    beat: TrailerBeat,
    scene: Scene,
    hist_method: int,
) -> float:
    """Weighted aggregate of histogram + pHash similarity."""
    hist = _hist_combined_score(beat, scene, hist_method)
    phash = _phash_score(beat, scene)
    return _HIST_WEIGHT * hist + _PHASH_WEIGHT * phash


def run_vibe_check(
    beat: TrailerBeat,
    scenes: Sequence[Scene],
    top_k: int,
    hist_method: int,
    phash_max_distance: int,
) -> list[VibeHit]:
    """
    Phase 1: Score all source scenes against one trailer beat and return
    the top-K candidates for Deep Scan.

    Args:
        beat: The trailer beat to match (must have fingerprints).
        scenes: All detected scenes from the source movie.
        top_k: Maximum number of candidates to return; values below
            zero are treated as zero.
        hist_method: cv2.HISTCMP_* constant (e.g. 0 = CORREL).
        phash_max_distance: Scenes with pHash Hamming distance > this value
            are excluded before ranking (hard filter).

    Returns:
        List of VibeHit, sorted by combined_score descending, length ≤ top_k.
        Empty list if beat has no fingerprints or no scenes pass the filter.
    """
    if beat.luma_hist is None and beat.phash is None:
        logger.warning(
            "Beat %d has no fingerprints — skipping Vibe Check.", beat.beat_id
        )
        return []

    candidates: list[VibeHit] = []

    for scene in scenes:
        # Compute the Hamming distance once per scene; it feeds the hard
        # filter, the similarity score and the reported VibeHit field.
        dist = (
            phash_distance(beat.phash, scene.phash)
            if beat.phash and scene.phash
            else None
        )

        # Hard pHash filter: fast rejection before the histogram compare.
        if dist is not None and dist > phash_max_distance:
            continue

        hist = _hist_combined_score(beat, scene, hist_method)
        # A missing pHash on either side contributes no similarity.
        phash = 1.0 - (dist / _PHASH_MAX_BITS) if dist is not None else 0.0
        combined = _HIST_WEIGHT * hist + _PHASH_WEIGHT * phash

        candidates.append(VibeHit(
            beat_id=beat.beat_id,
            scene_id=scene.scene_id,
            hist_score=round(hist, 4),
            phash_distance=dist if dist is not None else _PHASH_MAX_BITS,
            combined_score=round(combined, 4),
        ))

    # Sort by combined score, descending; return top-K.
    candidates.sort(key=lambda h: h.combined_score, reverse=True)
    # Clamp so a negative top_k cannot slice from the wrong end of the list.
    top = candidates[:max(top_k, 0)]

    logger.info(
        "Vibe Check beat=%d: %d scenes scored, %d candidates forwarded to Deep Scan. "
        "Best score: %.3f (scene %s)",
        beat.beat_id,
        len(candidates),
        len(top),
        top[0].combined_score if top else 0.0,
        top[0].scene_id if top else "—",
    )

    return top
+ """ + return { + beat.beat_id: run_vibe_check( + beat, scenes, top_k, hist_method, phash_max_distance + ) + for beat in beats + } diff --git a/src/export/__init__.py b/src/export/__init__.py new file mode 100644 index 0000000..da61106 --- /dev/null +++ b/src/export/__init__.py @@ -0,0 +1 @@ +# src.export package — FCPXML / EDL export diff --git a/src/export/edl_writer.py b/src/export/edl_writer.py new file mode 100644 index 0000000..d593b99 --- /dev/null +++ b/src/export/edl_writer.py @@ -0,0 +1,114 @@ +""" +src/export/edl_writer.py — EditTimeline → CMX 3600 EDL + +Generates a standard CMX 3600 Edit Decision List compatible with +Avid, DaVinci Resolve, Premiere Pro, and most NLEs. + +CMX 3600 format reference: + https://en.wikipedia.org/wiki/Edit_decision_list#CMX_3600 +""" + +from __future__ import annotations + +import logging +from pathlib import Path + +from src.core.config import AppConfig +from src.core.models import EditClip, EditTimeline +from src.export.timecode import seconds_to_smpte + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# EDL line builders +# --------------------------------------------------------------------------- + +def _edl_header(title: str) -> str: + return f"TITLE: {title}\nFCM: NON-DROP FRAME\n" + + +def _edl_event( + event_num: int, + clip: EditClip, + fps: float, +) -> str: + """ + Build one CMX 3600 event block for a single clip. + + Format: + NNN AX V C + * FROM CLIP NAME: ... + * COMMENT: ... 
def write_edl(
    timeline: EditTimeline,
    cfg: AppConfig,
    output_path: Path | None = None,
) -> Path:
    """
    Write the EditTimeline as a CMX 3600 EDL file.

    Clips are emitted in clip_index order. A clip with a positive
    trailer_tail_s is followed by an extra black-tail event, so the number
    of EDL events can exceed the number of clips.

    Args:
        timeline: EditTimeline from build_timeline().
        cfg: Application configuration.
        output_path: Override destination. Defaults to
            <output_dir>/<title>.edl.

    Returns:
        Path to the written .edl file.
    """
    if output_path is None:
        output_path = cfg.paths.output_dir / f"{timeline.title}.edl"

    output_path.parent.mkdir(parents=True, exist_ok=True)

    fps = timeline.frame_rate
    lines = [_edl_header(timeline.title), "\n"]

    event_num = 1
    for clip in sorted(timeline.clips, key=lambda c: c.clip_index):
        lines.append(_edl_event(event_num, clip, fps))
        event_num += 1
        if clip.trailer_tail_s > 0:
            lines.append("\n")
            lines.append(_edl_black_tail_event(event_num, clip, fps))
            event_num += 1
        lines.append("\n")

    edl_text = "\n".join(lines)
    output_path.write_text(edl_text, encoding="utf-8")

    # event_num is the *next* free number, so events written = event_num - 1.
    # Reporting the clip count here would under-count black-tail events.
    logger.info("EDL written → %s (%d events)", output_path, event_num - 1)
    return output_path
+ +Spec reference: https://developer.apple.com/documentation/professional_video_applications/fcpxml_reference +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from urllib.parse import quote +from xml.etree import ElementTree as ET +from xml.etree.ElementTree import Element, SubElement + +from src.core.config import AppConfig +from src.core.models import EditClip, EditTimeline +from src.export.timecode import ( + fcpxml_format_name, + fcpxml_frame_duration, + seconds_to_fcpxml, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Asset registry — one per unique source file +# --------------------------------------------------------------------------- + +class _AssetRegistry: + def __init__(self) -> None: + self._assets: dict[Path, str] = {} # path → asset id + self._counter = 2 # r1 reserved for format + + def get_or_create(self, path: Path) -> str: + if path not in self._assets: + rid = f"r{self._counter}" + self._assets[path] = rid + self._counter += 1 + return self._assets[path] + + @property + def items(self) -> dict[Path, str]: + return dict(self._assets) + + +# --------------------------------------------------------------------------- +# Builder +# --------------------------------------------------------------------------- + +def _path_to_url(path: Path) -> str: + """Convert an absolute Path to a file:// URL as required by FCPXML.""" + posix = path.as_posix() + if not posix.startswith("/"): + # Windows drive letter: C:/foo → /C:/foo + posix = "/" + posix + return "file://" + quote(posix, safe="/:@") + + +def build_fcpxml( + timeline: EditTimeline, + cfg: AppConfig, + source_duration_s: float = 7200.0, # 2-hour fallback if not probed +) -> ET.ElementTree: + """ + Build a complete FCPXML ElementTree from an EditTimeline. + + Args: + timeline: Ordered sequence of EditClips. + cfg: Application configuration. 
+ source_duration_s: Duration of the source movie asset (used for + duration attribute). Will be probed + automatically when possible. + + Returns: + xml.etree.ElementTree.ElementTree — call .write() to serialise. + """ + fps = timeline.frame_rate + + # ---- root --------------------------------------------------------------- + root = Element("fcpxml", version=cfg.export.fcpxml_version) + root.set("xmlns", "http://www.apple.com/dt/FCPXML/1_10") + + # ---- resources ---------------------------------------------------------- + resources = SubElement(root, "resources") + + format_id = "r1" + format_name = fcpxml_format_name(fps) + fmt = SubElement(resources, "format", + id=format_id, + name=format_name, + frameDuration=fcpxml_frame_duration(fps), + width="1920", + height="1080", + colorSpace="1-1-1 (Rec. 709)", + ) + + registry = _AssetRegistry() + + # Pre-register all unique source paths so elements come before + # the block (required by FCPXML spec). + for clip in timeline.clips: + registry.get_or_create(clip.match.source_path) + + # Probe actual source duration when possible + _durations: dict[Path, float] = {} + for path in registry.items: + try: + from src.cv.frame_extractor import get_video_info + info = get_video_info(path) + _durations[path] = float(info["duration_s"]) + except Exception: + _durations[path] = source_duration_s + + for path, rid in registry.items.items(): + dur_s = _durations.get(path, source_duration_s) + SubElement(resources, "asset", + id=rid, + name=path.stem, + src=_path_to_url(path), + start="0s", + duration=seconds_to_fcpxml(dur_s, fps), + hasVideo="1", + hasAudio="1", + format=format_id, + ) + + # ---- library / event / project ------------------------------------------ + library = SubElement(root, "library") + event = SubElement(library, "event", name=timeline.title) + project = SubElement(event, "project", name=timeline.title) + sequence = SubElement(project, "sequence", + duration=seconds_to_fcpxml(timeline.total_duration_s, fps), + 
def write_fcpxml(
    timeline: EditTimeline,
    cfg: AppConfig,
    output_path: Path | None = None,
) -> Path:
    """
    Serialise the EditTimeline to a .fcpxml file.

    Args:
        timeline:    EditTimeline from build_timeline().
        cfg:         Application configuration.
        output_path: Override destination. Defaults to
                     <cfg.paths.output_dir>/<timeline.title>.fcpxml.

    Returns:
        Path to the written .fcpxml file.
    """
    if output_path is None:
        output_path = cfg.paths.output_dir / f"{timeline.title}.fcpxml"

    output_path.parent.mkdir(parents=True, exist_ok=True)

    tree = build_fcpxml(timeline, cfg)

    # ElementTree cannot emit a DOCTYPE, so the XML declaration and DOCTYPE
    # are written by hand in front of the serialised root element.
    # encoding="unicode" makes tostring() return str (not bytes).
    xml_text = ET.tostring(tree.getroot(), encoding="unicode", xml_declaration=False)
    header = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<!DOCTYPE fcpxml>\n'
    )

    output_path.write_text(header + xml_text, encoding="utf-8")

    logger.info("FCPXML written → %s  (%d clips)", output_path, timeline.clip_count)
    return output_path
def seconds_to_fcpxml(seconds: float, fps: float) -> str:
    """
    Convert *seconds* to an FCPXML rational time string.

    The time is snapped to the nearest whole frame of *fps* and expressed as
    a reduced fraction of seconds, so the value always lies on a frame
    boundary.
    Example: 10.0s @23.976fps → 240 frames → "1001/100s"

    Args:
        seconds: Time in seconds (float).
        fps:     Project frame rate.

    Returns:
        FCPXML time string, e.g. "1001/100s"; "0s" when the time is zero or
        rounds to frame 0.
    """
    if seconds == 0.0:
        return "0s"

    num, den = _fps_to_rational(fps)  # frames per second = num/den
    # seconds × (num/den) = frames (float); round to nearest frame
    frames = round(seconds * num / den)
    if frames == 0:
        # Sub-half-frame times would otherwise serialise as "0/1s"; use the
        # same canonical zero as the exact-zero case above.
        return "0s"
    # frames ÷ (num/den) = frames × den/num → rational seconds
    total_num = frames * den
    total_den = num
    # Reduce fraction
    g = math.gcd(total_num, total_den)
    return f"{total_num // g}/{total_den // g}s"
def fcpxml_format_name(fps: float, width: int = 1920, height: int = 1080) -> str:
    """
    Return an FCPXML format name string for a given frame rate and resolution.

    Standard rates are matched with a small tolerance so that probed values
    such as 24000/1001 (23.976023…) map to the same tag as the nominal
    23.976. Any other rate falls back to ``int(fps * 100)``.

    Example: fps=23.976, 1080p → "FFVideoFormat1080p2398"

    Args:
        fps:    Project frame rate.
        width:  Frame width; currently unused, kept for interface stability.
        height: Frame height, used for the "<height>p" part of the name.

    Returns:
        Format name such as "FFVideoFormat1080p2398".
    """
    # Same tolerance that _fps_to_rational applies when matching frame rates.
    tolerance = 0.01
    known_tags = (
        (23.976, "2398"),
        (24.0, "24"),
        (25.0, "25"),
        (29.97, "2997"),
        (30.0, "30"),
        (50.0, "50"),
        (59.94, "5994"),
        (60.0, "60"),
    )

    fps_tag = None
    for ref_fps, tag in known_tags:
        if abs(fps - ref_fps) < tolerance:
            fps_tag = tag
            break
    if fps_tag is None:
        fps_tag = str(int(fps * 100))

    return f"FFVideoFormat{height}p{fps_tag}"
24000/1001) + # frame duration = den/num seconds + g = math.gcd(den, num) + return f"{den // g}/{num // g}s" diff --git a/src/llm/__init__.py b/src/llm/__init__.py new file mode 100644 index 0000000..a20d165 --- /dev/null +++ b/src/llm/__init__.py @@ -0,0 +1 @@ +# src.llm package — Thematic segmentation / dramaturgy (NO vision matching) diff --git a/src/llm/dramaturg.py b/src/llm/dramaturg.py new file mode 100644 index 0000000..defcd18 --- /dev/null +++ b/src/llm/dramaturg.py @@ -0,0 +1,202 @@ +""" +src/llm/dramaturg.py — LLM-based thematic beat classification (OpenRouter) + +Responsibility: + - Receive a list of TrailerBeat objects (with dialogue lines attached) + - Send a single structured prompt to the LLM + - Parse the JSON response to assign BeatType to each beat + +IMPORTANT: This module does ZERO visual analysis. + It classifies narrative dramaturgy from dialogue text only. + Visual matching is handled exclusively by the CV engine. +""" + +from __future__ import annotations + +import json +import logging +from dataclasses import replace +from typing import Sequence + +from src.core.config import AppConfig +from src.core.models import BeatType, TrailerBeat + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Prompt builder +# --------------------------------------------------------------------------- + +_SYSTEM_PROMPT = """You are a film trailer editor and narrative analyst. +Your task is to classify each beat of a trailer into one of these dramatic roles: + HOOK - Opening attention grabber (first impression, shocking image, logo) + SETUP - World/character introduction + CONFLICT - Inciting incident, rising tension, threat revealed + CLIMAX - Peak action/emotion, highest stakes + RESOLUTION - Cool-down, tagline, final title card + +You will receive a JSON array of beats with their index and dialogue text. 
+Respond ONLY with a valid JSON array, one object per beat, with keys: + "beat_id" (int) and "beat_type" (one of the strings above). +Do NOT include any explanation or markdown fences.""" + +_USER_TEMPLATE = """Classify the following {n} trailer beats: + +{beats_json}""" + + +def _build_beats_payload(beats: Sequence[TrailerBeat]) -> str: + payload = [] + for b in beats: + dialogue_text = " / ".join(line.text for line in b.dialogue) or "(no dialogue)" + payload.append({ + "beat_id": b.beat_id, + "duration": round(b.duration_s, 2), + "dialogue": dialogue_text, + }) + return json.dumps(payload, ensure_ascii=False, indent=2) + + +# --------------------------------------------------------------------------- +# OpenRouter / OpenAI-compatible HTTP client +# --------------------------------------------------------------------------- + +def _call_llm(prompt_user: str, cfg: AppConfig) -> str: + """ + Send a chat completion request to the configured LLM provider. + + Supports: openrouter, openai, ollama (all use the OpenAI-compatible API). + + Returns: + The raw text content of the first assistant message. + + Raises: + RuntimeError: On HTTP errors or missing API key. + """ + import urllib.request + import urllib.error + + llm = cfg.llm + + if llm.provider in ("openrouter", "openai") and not llm.api_key: + raise RuntimeError( + f"LLM provider is '{llm.provider}' but no API key found. " + "Set OPENROUTER_API_KEY (or OPENAI_API_KEY) in your .env file." 
def _parse_response(raw: str, beats: Sequence[TrailerBeat]) -> dict[int, BeatType]:
    """
    Parse the LLM JSON array response into a beat_id → BeatType mapping.

    Every beat in *beats* starts as BeatType.UNKNOWN and is overwritten only
    by a well-formed response entry that refers to it. The function never
    raises on malformed model output:

      - an unparseable or non-array response leaves all beats UNKNOWN;
      - a malformed entry (not an object, missing/invalid "beat_id") is
        skipped without discarding the remaining entries;
      - entries whose beat_id is not one of *beats* are ignored.

    Args:
        raw:   Raw text content returned by the LLM.
        beats: Beats that were sent for classification.

    Returns:
        Mapping with exactly one key per beat in *beats*.
    """
    # Strip accidental markdown fences
    clean = raw.strip()
    if clean.startswith("```"):
        clean = "\n".join(clean.split("\n")[1:])
    if clean.endswith("```"):
        clean = clean[: clean.rfind("```")]
    clean = clean.strip()

    result: dict[int, BeatType] = {b.beat_id: BeatType.UNKNOWN for b in beats}

    try:
        parsed = json.loads(clean)
        if not isinstance(parsed, list):
            raise ValueError("Expected JSON array at top level.")
    except (json.JSONDecodeError, ValueError) as exc:
        logger.warning("LLM response parse error (%s) — all beats → UNKNOWN.\nRaw: %s", exc, raw[:300])
        return result

    for item in parsed:
        try:
            bid = int(item["beat_id"])
            name = str(item.get("beat_type", "UNKNOWN")).upper()
        except (AttributeError, KeyError, TypeError, ValueError) as exc:
            # TypeError/AttributeError: entry is not an object or beat_id is
            # null; previously these escaped and aborted classification.
            logger.warning("Skipping malformed LLM beat entry %r (%s).", item, exc)
            continue
        if bid not in result:
            logger.debug("Ignoring LLM entry for unknown beat_id %d.", bid)
            continue
        result[bid] = _BEAT_TYPE_MAP.get(name, BeatType.UNKNOWN)

    return result
+ """ + if not beats: + return list(beats) + + logger.info( + "Classifying %d beats via %s / %s …", + len(beats), cfg.llm.provider, cfg.llm.model, + ) + + payload = _build_beats_payload(beats) + prompt = _USER_TEMPLATE.format(n=len(beats), beats_json=payload) + + try: + raw_response = _call_llm(prompt, cfg) + except Exception as exc: + logger.error("LLM classification failed: %s — keeping BeatType.UNKNOWN.", exc) + return list(beats) + + type_map = _parse_response(raw_response, beats) + + enriched = [replace(b, beat_type=type_map.get(b.beat_id, BeatType.UNKNOWN)) for b in beats] + + classified = sum(1 for b in enriched if b.beat_type != BeatType.UNKNOWN) + logger.info("Beat classification done: %d / %d classified.", classified, len(beats)) + return enriched diff --git a/src/llm/vision_cache.py b/src/llm/vision_cache.py new file mode 100644 index 0000000..0e9c7e1 --- /dev/null +++ b/src/llm/vision_cache.py @@ -0,0 +1,316 @@ +""" +Cached vision descriptions for ambiguous trailer/source matching. + +This module is deliberately conservative: it never writes a final match and it +does not replace CV. It describes a small number of 3-frame beat/scene samples, +caches those descriptions, and returns extra source in-point seeds for the CV +scanner to verify. +""" + +from __future__ import annotations + +import base64 +import json +import logging +import re +import urllib.error +import urllib.request +from dataclasses import asdict +from pathlib import Path +from typing import Sequence + +import cv2 + +from src.core.config import AppConfig +from src.core.models import Scene, TrailerBeat + +logger = logging.getLogger(__name__) + +_CACHE_VERSION = 1 +_STOPWORDS = { + "the", "and", "with", "from", "that", "this", "there", "their", "into", + "scene", "frame", "image", "shot", "video", "visible", "looks", "appears", + "eine", "einer", "einem", "einen", "und", "oder", "mit", "der", "die", "das", +} + +_SYSTEM_PROMPT = """You describe film shots for automatic matching. 
def _load_cache(cfg: AppConfig) -> dict:
    """
    Load the vision description cache from disk.

    Returns a fresh empty cache ({"version": _CACHE_VERSION, "items": {}})
    when the file is missing, cannot be read or decoded, is not a JSON
    object, has a different version, or has a non-dict "items" entry.
    Never raises for a damaged cache file.
    """
    empty: dict = {"version": _CACHE_VERSION, "items": {}}

    path = _cache_path(cfg)
    if not path.exists():
        return empty
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Vision cache is unreadable; rebuilding: %s", path)
        return empty
    if not isinstance(data, dict):
        # Valid JSON that is not an object (e.g. [] or null) has no .get().
        logger.warning("Vision cache is unreadable; rebuilding: %s", path)
        return empty
    if data.get("version") != _CACHE_VERSION or not isinstance(data.get("items"), dict):
        return empty
    return data
def _call_vision_model(label: str, image_urls: list[str], cfg: AppConfig) -> str:
    """
    Ask the configured vision model to describe a multi-frame sample.

    Sends one OpenAI-compatible chat completion request containing the
    module's system prompt, a short text instruction, and the given images.

    Args:
        label:      Human-readable sample label included in the prompt.
        image_urls: Image URLs (data URLs in this module) to attach.
        cfg:        Application configuration (vision section).

    Returns:
        The stripped text content of the first choice.

    Raises:
        RuntimeError: Missing API key for openai/openrouter, HTTP error,
            connection/timeout failure, undecodable body, or a response
            without the expected choices/message/content structure.
    """
    vision = cfg.vision
    if vision.provider in ("openai", "openrouter") and not vision.api_key:
        raise RuntimeError(
            "Vision is enabled but no API key is available. Set VISION_API_KEY, "
            "OPENROUTER_API_KEY, OPENAI_API_KEY, or LLM_API_KEY."
        )

    content: list[dict] = [{
        "type": "text",
        "text": (
            f"Describe this 3-frame sample for matching. Label: {label}. "
            "The frames are start, middle, and end of the same beat/scene."
        ),
    }]
    content.extend({
        "type": "image_url",
        "image_url": {"url": url, "detail": "low"},
    } for url in image_urls)

    headers = {"Content-Type": "application/json"}
    if vision.api_key:
        # Key-less providers must not receive a bogus "Bearer None" header.
        headers["Authorization"] = f"Bearer {vision.api_key}"
    if vision.provider == "openrouter":
        headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
        headers["X-Title"] = "AI Trailer Generator v2"

    body = json.dumps({
        "model": vision.model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": content},
        ],
        "temperature": vision.temperature,
        "max_tokens": vision.max_tokens,
    }).encode("utf-8")

    url = f"{vision.base_url.rstrip('/')}/chat/completions"
    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
    try:
        with urllib.request.urlopen(req, timeout=vision.timeout_seconds) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        # HTTPError is an OSError subclass, so it must be handled first.
        body_text = exc.read().decode(errors="replace")
        raise RuntimeError(f"Vision HTTP {exc.code} from {url}:\n{body_text}") from exc
    except (OSError, ValueError) as exc:
        # OSError: URLError, refused connection, timeout.
        # ValueError: invalid JSON or undecodable bytes in the body.
        raise RuntimeError(f"Vision request to {url} failed: {exc}") from exc

    try:
        return str(data["choices"][0]["message"]["content"]).strip()
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"Vision response from {url} has an unexpected structure: {exc!r}"
        ) from exc
def build_vision_seed_in_points(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    cfg: AppConfig,
) -> dict[int, list[tuple[float, float]]]:
    """
    Return extra in-point seeds from cached vision descriptions.

    The function is intentionally small-budget: for each beat it describes the
    beat once and only a few top scene-level candidates. Existing descriptions
    are read from cache and cost nothing.

    The description cache is written back even when a vision call fails
    part-way through, so descriptions already obtained in this run are not
    lost; the failure itself still propagates to the caller.

    Args:
        beats:  Trailer beats to find seeds for.
        scenes: Indexed source scenes to rank against each beat.
        cfg:    Application configuration (vision + cv sections).

    Returns:
        Mapping beat_id → sorted list of (source in-point seconds, seed
        score). Beats without any qualifying scene are omitted. Empty when
        vision is disabled or there are no beats or scenes.
    """
    if not cfg.vision.enabled:
        return {}
    if not beats or not scenes:
        return {}

    from src.cv.vibe_check import run_vibe_check

    cache = _load_cache(cfg)
    # One-element list so _describe_sample can decrement the shared budget.
    budget = [cfg.vision.max_new_descriptions_per_run]
    scenes_by_id = {scene.scene_id: scene for scene in scenes}
    seeds: dict[int, list[tuple[float, float]]] = {}

    try:
        for beat in beats:
            beat_desc = _describe_sample(
                kind="beat",
                item_id=beat.beat_id,
                label=f"trailer beat {beat.beat_id}",
                video_path=beat.trailer_path,
                start_s=beat.start_s,
                end_s=beat.end_s,
                cfg=cfg,
                cache=cache,
                budget=budget,
            )
            if not beat_desc:
                continue

            hits = run_vibe_check(
                beat,
                scenes,
                top_k=cfg.vision.scene_candidate_top_k,
                hist_method=cfg.cv.vibe_check.hist_compare_method,
                phash_max_distance=64,
            )

            ranked: list[tuple[float, Scene]] = []
            for hit in hits:
                scene = scenes_by_id.get(hit.scene_id)
                if scene is None:
                    continue
                scene_desc = _describe_sample(
                    kind="scene",
                    item_id=scene.scene_id,
                    label=f"source scene {scene.scene_id}",
                    video_path=scene.source_path,
                    start_s=scene.start_s,
                    end_s=scene.end_s,
                    cfg=cfg,
                    cache=cache,
                    budget=budget,
                )
                if not scene_desc:
                    continue
                score = _text_similarity(beat_desc, scene_desc)
                if score >= cfg.vision.similarity_threshold:
                    ranked.append((score, scene))

            ranked.sort(key=lambda item: item[0], reverse=True)
            points: list[tuple[float, float]] = []
            for score, scene in ranked[:cfg.vision.max_seed_scenes]:
                logger.info(
                    "Beat %d: vision seed scene=%d score=%.3f",
                    beat.beat_id,
                    scene.scene_id,
                    score,
                )
                # Scale the configured seed score by text similarity, clamped
                # between the coarse candidate threshold and 0.98.
                weighted_score = max(
                    cfg.cv.deep_scan.coarse_candidate_threshold,
                    min(0.98, cfg.vision.seed_score * (0.75 + min(1.0, score) * 0.25)),
                )
                points.extend(
                    (point, weighted_score)
                    for point in _scene_seed_points(scene, cfg.vision.seed_points_per_scene)
                )

            if points:
                # De-duplicate on millisecond-rounded time, keeping the best score.
                merged: dict[float, float] = {}
                for point, weighted_score in points:
                    key = round(max(0.0, point), 3)
                    merged[key] = max(weighted_score, merged.get(key, 0.0))
                seeds[beat.beat_id] = sorted(merged.items())
    finally:
        # Persist whatever was described, even if a later call raised.
        _save_cache(cfg, cache)

    return seeds
# list[TrailerBeat] from trailer analysis + results = run_matching(cfg, beats) +""" + +from __future__ import annotations + +import logging +from typing import Sequence + +from src.core.config import AppConfig +from src.core.models import MatchResult, Scene, TrailerBeat + +logger = logging.getLogger(__name__) +SeedPoint = float | tuple[float, float] + + +def _scene_seed_points(scene: Scene, max_points: int) -> list[float]: + if max_points <= 1 or scene.duration_s <= 0: + return [scene.start_s] + usable_end = max(scene.start_s, scene.end_s - 0.2) + if usable_end <= scene.start_s: + return [scene.start_s] + step = (usable_end - scene.start_s) / max(1, max_points - 1) + return [scene.start_s + step * idx for idx in range(max_points)] + + +def _build_scene_seed_in_points( + beats: Sequence[TrailerBeat], + scenes: Sequence[Scene], + cfg: AppConfig, +) -> dict[int, list[float]]: + from src.cv.vibe_check import run_vibe_check + + scenes_by_id = {scene.scene_id: scene for scene in scenes} + seeds: dict[int, list[float]] = {} + for beat in beats: + hits = run_vibe_check( + beat, + scenes, + top_k=cfg.cv.deep_scan.scene_seed_top_k, + hist_method=cfg.cv.vibe_check.hist_compare_method, + phash_max_distance=64, + ) + points: list[float] = [] + for hit in hits: + scene = scenes_by_id.get(hit.scene_id) + if scene is None: + continue + points.extend(_scene_seed_points(scene, cfg.cv.deep_scan.scene_seed_points_per_scene)) + if points: + seeds[beat.beat_id] = sorted({round(max(0.0, p), 3) for p in points}) + logger.info( + "Beat %d: added %d scene-level seed candidates from %d source scenes.", + beat.beat_id, + len(seeds[beat.beat_id]), + len(hits), + ) + return seeds + + +def _merge_seed_in_points( + *seed_maps: dict[int, Sequence[SeedPoint]] | None, +) -> dict[int, list[SeedPoint]]: + merged: dict[int, dict[float, float | None]] = {} + for seed_map in seed_maps: + if not seed_map: + continue + for beat_id, points in seed_map.items(): + beat_points = merged.setdefault(beat_id, {}) 
+ for point in points: + if isinstance(point, tuple): + t_sec = round(max(0.0, float(point[0])), 3) + score = float(point[1]) + else: + t_sec = round(max(0.0, float(point)), 3) + score = None + old_score = beat_points.get(t_sec) + if old_score is None: + beat_points[t_sec] = score + elif score is not None: + beat_points[t_sec] = max(old_score, score) + + result: dict[int, list[SeedPoint]] = {} + for beat_id, points in merged.items(): + result[beat_id] = [ + (t_sec, score) if score is not None else t_sec + for t_sec, score in sorted(points.items()) + ] + return result + + +# --------------------------------------------------------------------------- +# Beat fingerprinting +# --------------------------------------------------------------------------- + +def fingerprint_beats( + beats: Sequence[TrailerBeat], + cfg: AppConfig, +) -> list[TrailerBeat]: + """ + Enrich every TrailerBeat with its visual fingerprint (histogram + pHash). + + Extracts the midpoint frame from the reference trailer and fingerprints it + using the same Text-Safe Crop parameters as the scene indexer. + + Args: + beats: TrailerBeat list (fingerprints will be None initially). + cfg: Application configuration. + + Returns: + New list of TrailerBeat objects with luma_hist, sat_hist, phash set. 
def run_matching(
    cfg: AppConfig,
    beats: Sequence[TrailerBeat],
    force_reindex: bool = False,
    seed_in_points: dict[int, Sequence[SeedPoint]] | None = None,
) -> list[MatchResult]:
    """
    Execute the full CV matching pipeline.

    Steps: build (or load) the scene index, fingerprint the beats, derive
    scene-level and optional vision-based seed in-points, then hand all
    seeds to the global scan, which produces the matches.

    Args:
        cfg:            Application configuration (loaded from config.toml).
        beats:          All trailer beats to source (must have trailer_path set).
        force_reindex:  If True, ignore the scene cache and re-run PySceneDetect.
        seed_in_points: Optional caller-supplied seeds per beat_id; each seed
                        is a source time in seconds or a (time, score) tuple.
                        Merged with the automatically derived seeds.

    Returns:
        The MatchResult list produced by run_global_scan. Per the original
        contract, unmatched beats are omitted and results follow the input
        beat order (not verifiable from this module).
    """
    from src.cv.scene_indexer import build_scene_index

    logger.info("=" * 60)
    logger.info("AI Trailer Generator v2 — CV Matching Pipeline")
    logger.info("Source : %s", cfg.paths.source_movie.name)
    logger.info("Trailer: %s", cfg.paths.reference_trailer.name)
    logger.info("Beats  : %d", len(beats))
    logger.info("=" * 60)

    # ------------------------------------------------------------------
    # Phase 0: Scene index
    # ------------------------------------------------------------------
    logger.info("[Phase 0] Building scene index …")
    scenes: list[Scene] = build_scene_index(cfg, force_reindex=force_reindex)
    logger.info("[Phase 0] %d scenes indexed.", len(scenes))

    # ------------------------------------------------------------------
    # Phase 0b: Fingerprint the beats
    # ------------------------------------------------------------------
    logger.info("[Phase 0b] Fingerprinting %d trailer beats …", len(beats))
    beats = fingerprint_beats(beats, cfg)

    # ------------------------------------------------------------------
    # Phase 1 & 2: Global Scan. The scene index and vibe check are only
    # used here to derive seed in-points; matching is done by the scan.
    # ------------------------------------------------------------------
    logger.info("[Phase 1 & 2] Running FFmpeg Global Scan for %d beats ...", len(beats))
    from src.cv.global_scan import run_global_scan

    scene_seed_in_points = _build_scene_seed_in_points(beats, scenes, cfg)
    vision_seed_in_points: dict[int, list[tuple[float, float]]] = {}
    if cfg.vision.enabled:
        # Vision seeding is optional: any failure degrades to CV-only seeds.
        try:
            from src.llm.vision_cache import build_vision_seed_in_points

            vision_seed_in_points = build_vision_seed_in_points(beats, scenes, cfg)
        except Exception as exc:
            logger.error("Vision seeding failed: %s — continuing with CV-only seeds.", exc)
    results = run_global_scan(
        beats,
        cfg,
        scenes=scenes,
        seed_in_points=_merge_seed_in_points(seed_in_points, scene_seed_in_points, vision_seed_in_points),
    )

    logger.info("[Phase 1 & 2] Done. %d / %d beats matched.", len(results), len(beats))
    logger.info("=" * 60)

    return results
frame_rate=cfg.export.edl_frame_rate, + clips=tuple(clips), + ) + + logger.info( + "Timeline built: %d clips, total duration %.2fs", + timeline.clip_count, timeline.total_duration_s, + ) + return timeline diff --git a/src/pipeline/reporter.py b/src/pipeline/reporter.py new file mode 100644 index 0000000..a84610d --- /dev/null +++ b/src/pipeline/reporter.py @@ -0,0 +1,427 @@ +""" +src/pipeline/reporter.py — Visual Match Report Generator + +Generates an HTML file containing side-by-side video clips of: + Left: The original beat from the reference trailer + Right: The matched scene from the source movie + +This allows instant visual verification of the CV pipeline's results. +""" + +from __future__ import annotations + +import logging +import subprocess +from pathlib import Path + +from src.core.config import AppConfig + +logger = logging.getLogger(__name__) + + +def _extract_clip(video_path: Path, start_s: float, duration_s: float, out_path: Path) -> None: + """Use ffmpeg to extract a silent, low-res preview clip.""" + out_path.parent.mkdir(parents=True, exist_ok=True) + + # Fast input seek close to the target, then accurate output seek for + # frame-faithful preview clips. A plain "-ss before -i" can land on a + # nearby keyframe and make the report look several frames out of sync. 
def _extract_clip(video_path: Path, start_s: float, duration_s: float, out_path: Path) -> None:
    """Use ffmpeg to extract a silent, low-res preview clip.

    Args:
        video_path: Video to cut from.
        start_s:    Start of the clip in seconds.
        duration_s: Length of the clip in seconds.
        out_path:   Destination file; its parent directory is created.

    A non-zero ffmpeg exit status is logged as an error, not raised.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Fast input seek close to the target, then accurate output seek for
    # frame-faithful preview clips. A plain "-ss before -i" can land on a
    # nearby keyframe and make the report look several frames out of sync.
    preroll_s = 2.0 if start_s >= 2.0 else 0.0
    input_seek_s = max(0.0, start_s - preroll_s)
    accurate_seek_s = start_s - input_seek_s

    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-ss", str(input_seek_s),
        "-i", str(video_path),
        "-ss", str(accurate_seek_s),
        "-t", str(duration_s),
        "-map", "0:v:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-vf", "scale=640:-2",  # scale down for lightweight report
        "-an",  # no audio
        "-movflags", "+faststart",
        str(out_path),
    ]

    result = subprocess.run(cmd, capture_output=True)
    if result.returncode != 0:
        logger.error(
            "ffmpeg clip extraction failed for %s:\n%s",
            out_path.name, result.stderr.decode(errors="replace")
        )


def _extract_clip_with_black_tail(
    video_path: Path,
    start_s: float,
    source_duration_s: float,
    total_duration_s: float,
    out_path: Path,
) -> None:
    """Extract a source preview and append black frames for trailer-only tails.

    Args:
        video_path:        Video to cut the source portion from.
        start_s:           Start of the source portion in seconds.
        source_duration_s: Length of the source portion in seconds.
        total_duration_s:  Desired length of the finished preview.
        out_path:          Destination file; its parent directory is created.

    When the tail is 0.02s or shorter the call is delegated to
    ``_extract_clip``. Otherwise the source portion and a black tail are
    rendered to temporary files next to ``out_path`` and concatenated.
    ffmpeg failures are logged, not raised. The temporary files are removed
    on every exit path, including the early returns after a failed render.
    """
    tail_s = max(0.0, total_duration_s - source_duration_s)
    if tail_s <= 0.02:
        _extract_clip(video_path, start_s, source_duration_s, out_path)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)
    source_tmp = out_path.with_name(f"{out_path.stem}_source_tmp.mp4")
    tail_tmp = out_path.with_name(f"{out_path.stem}_tail_tmp.mp4")
    preroll_s = 2.0 if start_s >= 2.0 else 0.0
    input_seek_s = max(0.0, start_s - preroll_s)
    accurate_seek_s = start_s - input_seek_s

    # First render the matched source portion with the same accurate seek path
    # as _extract_clip(). Using trim=start=... after an input seek is brittle
    # because FFmpeg may preserve non-zero packet timestamps around keyframes.
    source_cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-ss", str(input_seek_s),
        "-i", str(video_path),
        "-ss", str(accurate_seek_s),
        "-t", str(source_duration_s),
        "-map", "0:v:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
        "-an",
        "-movflags", "+faststart",
        str(source_tmp),
    ]
    tail_cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-f", "lavfi",
        "-i", f"color=c=black:s=640x360:r=25:d={tail_s}",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-an",
        "-movflags", "+faststart",
        str(tail_tmp),
    ]
    concat_cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", str(source_tmp),
        "-i", str(tail_tmp),
        "-filter_complex", "[0:v][1:v]concat=n=2:v=1:a=0[v]",
        "-map", "[v]",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-an",
        "-movflags", "+faststart",
        str(out_path),
    ]

    try:
        result = subprocess.run(source_cmd, capture_output=True)
        if result.returncode != 0:
            logger.error(
                "ffmpeg source preview extraction failed for %s:\n%s",
                out_path.name,
                result.stderr.decode(errors="replace"),
            )
            return

        result = subprocess.run(tail_cmd, capture_output=True)
        if result.returncode != 0:
            logger.error(
                "ffmpeg black tail render failed for %s:\n%s",
                out_path.name,
                result.stderr.decode(errors="replace"),
            )
            return

        result = subprocess.run(concat_cmd, capture_output=True)
        if result.returncode != 0:
            logger.error(
                "ffmpeg tailed preview concat failed for %s:\n%s",
                out_path.name,
                result.stderr.decode(errors="replace"),
            )
    finally:
        # Runs on every exit path so a failed render cannot leave partial
        # temp files behind in the report directory.
        for tmp in (source_tmp, tail_tmp):
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass
def _extract_segmented_clip(
    video_path: Path,
    segments: list,
    total_duration_s: float,
    out_path: Path,
) -> None:
    """Render a beat-length source preview from multiple matched source islands.

    Args:
        video_path:       Video the segments are cut from.
        segments:         Objects exposing ``trailer_offset_s``,
                          ``duration_s`` and ``in_point_s``; they are
                          processed in ``trailer_offset_s`` order.
        total_duration_s: Desired length of the finished preview.
        out_path:         Destination file; its parent directory is created.

    Stretches of the beat not covered by a segment are filled with black.
    Parts of 0.02s or less are skipped. Each part is rendered to a temporary
    file next to ``out_path`` and the parts are concatenated. ffmpeg failures
    are logged, not raised, and rendered part files are removed on every exit
    path.
    """
    if not segments:
        _extract_clip_with_black_tail(video_path, 0.0, 0.0, total_duration_s, out_path)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_paths: list[Path] = []
    cursor = 0.0

    def add_black(duration_s: float) -> None:
        """Render a black filler part; gaps that are tiny or negative are skipped."""
        if duration_s <= 0.02:
            return
        tmp = out_path.with_name(f"{out_path.stem}_part_{len(tmp_paths):03d}_black.mp4")
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-f", "lavfi",
            "-i", f"color=c=black:s=640x360:r=25:d={duration_s}",
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-an", "-movflags", "+faststart",
            str(tmp),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode == 0:
            tmp_paths.append(tmp)
        else:
            logger.error("ffmpeg black segment render failed:\n%s", result.stderr.decode(errors="replace"))

    def add_source(start_s: float, duration_s: float) -> None:
        """Render one source part using a fast input seek plus an accurate output seek."""
        if duration_s <= 0.02:
            return
        tmp = out_path.with_name(f"{out_path.stem}_part_{len(tmp_paths):03d}_src.mp4")
        preroll_s = 2.0 if start_s >= 2.0 else 0.0
        input_seek_s = max(0.0, start_s - preroll_s)
        accurate_seek_s = start_s - input_seek_s
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", str(input_seek_s),
            "-i", str(video_path),
            "-ss", str(accurate_seek_s),
            "-t", str(duration_s),
            "-map", "0:v:0",
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
            "-an", "-movflags", "+faststart",
            str(tmp),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode == 0 and tmp.exists():
            tmp_paths.append(tmp)
        else:
            logger.error("ffmpeg source segment render failed:\n%s", result.stderr.decode(errors="replace"))

    try:
        for segment in sorted(segments, key=lambda s: s.trailer_offset_s):
            offset_s = max(0.0, float(segment.trailer_offset_s))
            duration_s = max(0.0, float(segment.duration_s))
            add_black(offset_s - cursor)
            add_source(float(segment.in_point_s), duration_s)
            # max() keeps the cursor monotonic when segments overlap.
            cursor = max(cursor, offset_s + duration_s)

        add_black(total_duration_s - cursor)

        if not tmp_paths:
            # Every part was skipped or failed. Running concat with n=0 and
            # no inputs would only produce a second, misleading ffmpeg error.
            logger.error(
                "No preview parts could be rendered for %s; skipping concat.",
                out_path.name,
            )
            return

        if len(tmp_paths) == 1:
            # A single part is already the finished preview; move it into place.
            tmp_paths[0].replace(out_path)
            return

        inputs: list[str] = []
        labels: list[str] = []
        for idx, tmp in enumerate(tmp_paths):
            inputs.extend(["-i", str(tmp)])
            labels.append(f"[{idx}:v]")
        filter_complex = "".join(labels) + f"concat=n={len(tmp_paths)}:v=1:a=0[v]"
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            *inputs,
            "-filter_complex", filter_complex,
            "-map", "[v]",
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-an", "-movflags", "+faststart",
            str(out_path),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            logger.error("ffmpeg segmented preview concat failed:\n%s", result.stderr.decode(errors="replace"))
    finally:
        # missing_ok covers the single-part case, where the file was moved.
        for tmp in tmp_paths:
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass


def _build_frame_locked_compare(ref_path: Path, src_path: Path, out_path: Path) -> None:
    """Render reference and source into one side-by-side video stream.

    Both inputs pass through the same filter chain (25 fps, fitted and padded
    to 640x360, timestamps reset to zero) before being stacked horizontally,
    reference on the left. A non-zero ffmpeg exit status is logged, not raised.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    normalize = (
        "fps=25,scale=640:360:force_original_aspect_ratio=decrease,"
        "pad=640:360:(ow-iw)/2:(oh-ih)/2,setsar=1,setpts=PTS-STARTPTS"
    )
    filter_complex = (
        f"[0:v]{normalize}[ref];"
        f"[1:v]{normalize}[src];"
        "[ref][src]hstack=inputs=2[v]"
    )
    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", str(ref_path),
        "-i", str(src_path),
        "-filter_complex", filter_complex,
        "-map", "[v]",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-an",
        "-movflags", "+faststart",
        str(out_path),
    ]
    result = subprocess.run(cmd, capture_output=True)
    if result.returncode != 0:
        logger.error(
            "ffmpeg compare render failed for %s:\n%s",
            out_path.name,
            result.stderr.decode(errors="replace"),
        )
+ """ + report_dir = cfg.paths.output_dir / "report" + report_dir.mkdir(parents=True, exist_ok=True) + + html_path = report_dir / "match_report.html" + results_by_beat = {r.beat_id: r for r in results} + + logger.info("Generating report clips in %s (this might take a moment) ...", report_dir) + + html = [ + "", + "AI Trailer Match Report", + "", + f"

AI Trailer Generator — Match Report

", + f"
Total Beats: {len(beats)} | Matched: {len(results)}
", + "" + ] + + for beat in beats: + res = results_by_beat.get(beat.beat_id) + + # Extract Reference Clip + ref_mp4 = report_dir / f"beat_{beat.beat_id:03d}_ref.mp4" + _extract_clip(beat.trailer_path, beat.start_s, beat.duration_s, ref_mp4) + + html.append("
") + + # Info Panel + html.append("
") + html.append(f"

Beat {beat.beat_id:03d}

") + html.append(f"

Type: {beat.beat_type.name}

") + html.append(f"

Trailer: {beat.start_s:.2f}s → {beat.end_s:.2f}s

") + + if res: + segments = list(getattr(res, "segments", ()) or []) + source_duration = sum(max(0.0, float(s.duration_s)) for s in segments) + if not segments: + source_duration = max(0.0, res.out_point_s - res.in_point_s) + preview_duration = min(beat.duration_s, source_duration) if source_duration > 0 else beat.duration_s + last_segment_end = max( + (float(s.trailer_offset_s) + float(s.duration_s) for s in segments), + default=preview_duration, + ) + trailer_tail_s = max(0.0, beat.duration_s - last_segment_end) + if getattr(res, "is_confirmed", True): + html.append("

MATCHED

") + else: + html.append("

PROVISIONAL MATCH

") + html.append(f"

Scene ID: {res.scene_id}

") + html.append(f"

Movie In: {res.in_point_s:.2f}s

") + html.append(f"

Source Dur: {source_duration:.2f}s

") + if len(segments) > 1: + html.append(f"

Segments: {len(segments)} matched visual islands

") + if trailer_tail_s > 0: + html.append(f"

Unmatched Tail: {trailer_tail_s:.2f}s placeholder

") + html.append(f"

Score: {res.match_score:.3f}

") + if trailer_tail_s > 0: + html.append("

Some trailer frames are still unmatched; report fills only those gaps with placeholder black.

") + + # Warn if score is low + if res.match_score < 0.80: + html.append("

⚠️ Score below 0.80. Verify visually.

") + + # Extract Source Clip + src_mp4 = report_dir / f"beat_{beat.beat_id:03d}_src.mp4" + compare_mp4 = report_dir / f"beat_{beat.beat_id:03d}_compare.mp4" + if segments: + _extract_segmented_clip(res.source_path, segments, beat.duration_s, src_mp4) + else: + _extract_clip_with_black_tail( + res.source_path, + res.in_point_s, + preview_duration, + beat.duration_s, + src_mp4, + ) + _build_frame_locked_compare(ref_mp4, src_mp4, compare_mp4) + else: + html.append("

NO MATCH

") + src_mp4 = None + compare_mp4 = None + + html.append(f"
python cli.py rematch --beat {beat.beat_id}
") + html.append("
") # /info + + # Video Panel + html.append("
") + if compare_mp4: + html.append(f"

Frame-Locked Compare

") + else: + html.append("
") + html.append(f"

Reference Trailer

") + html.append("

Matched Source

No Match
") + html.append("
") # /video-container + html.append("
") # /videos + html.append("
") # /beat-row + + html.append("") + + html_path.write_text("\n".join(html), encoding="utf-8") + return html_path diff --git a/src/pipeline/trailer_analyzer.py b/src/pipeline/trailer_analyzer.py new file mode 100644 index 0000000..a2e16b9 --- /dev/null +++ b/src/pipeline/trailer_analyzer.py @@ -0,0 +1,175 @@ +""" +src/pipeline/trailer_analyzer.py — Reference trailer → list[TrailerBeat] + +Responsibility: + 1. Run PySceneDetect on the REFERENCE TRAILER (not the source movie) + to detect cut boundaries → raw beat intervals + 2. Fingerprint the midpoint frame of each beat (for Vibe Check) + 3. Transcribe dialogue per beat via Whisper (optional, injected) + 4. Optionally classify BeatType via the LLM dramaturg (injected) + +Returns: list[TrailerBeat] ready to feed into run_matching(). +""" + +from __future__ import annotations + +import logging +from dataclasses import replace +from pathlib import Path +from typing import Callable, Sequence + +from src.core.config import AppConfig +from src.core.models import BeatType, DialogueLine, TrailerBeat +from src.cv.fingerprinting import fingerprint_frame +from src.cv.frame_extractor import grab_midpoint_frame, open_video + +logger = logging.getLogger(__name__) + +# Injection type aliases — keeps this module free of hard audio/LLM imports +TranscribeCallback = Callable[[Path, float, float, float], list[DialogueLine]] +ClassifyCallback = Callable[[list[TrailerBeat]], list[TrailerBeat]] + + +# --------------------------------------------------------------------------- +# Step 1: Scene detection on the reference trailer +# --------------------------------------------------------------------------- + +def _detect_trailer_beats(cfg: AppConfig) -> list[tuple[float, float, int, int]]: + """ + Run PySceneDetect on the reference trailer. + + Returns list of (start_s, end_s, start_frame, end_frame). + Uses the same ContentDetector thresholds as the source movie. 
def _detect_trailer_beats(cfg: AppConfig) -> list[tuple[float, float, int, int]]:
    """
    Run PySceneDetect on the reference trailer.

    The detector is configured from ``cfg.scene_detection``: the content
    threshold is used as-is and the minimum scene duration is converted to
    frames with the trailer's frame rate.

    Returns list of (start_s, end_s, start_frame, end_frame).

    Raises:
        ImportError: scenedetect is not importable. The message is the
            install hint and the original error is attached as its cause.
    """
    try:
        from scenedetect import open_video as sd_open_video, SceneManager
        from scenedetect.detectors import ContentDetector
    except ImportError as exc:
        # Chain explicitly so the underlying import failure stays visible
        # as the cause of the install hint.
        raise ImportError("pip install scenedetect[opencv]") from exc

    trailer_path = cfg.paths.reference_trailer
    video = sd_open_video(str(trailer_path))
    manager = SceneManager()
    manager.add_detector(
        ContentDetector(
            threshold=cfg.scene_detection.content_threshold,
            min_scene_len=int(
                cfg.scene_detection.min_scene_duration_s * video.frame_rate
            ),
        )
    )

    logger.info("Detecting beats in reference trailer: %s …", trailer_path.name)
    manager.detect_scenes(video=video, show_progress=False)

    raw = manager.get_scene_list()
    result = [
        (s.get_seconds(), e.get_seconds(), s.get_frames(), e.get_frames())
        for s, e in raw
    ]
    logger.info("Detected %d beats in reference trailer.", len(result))
    return result


# ---------------------------------------------------------------------------
# Step 2: Fingerprint beats
# ---------------------------------------------------------------------------

def _fingerprint_beats(
    raw_beats: list[tuple[float, float, int, int]],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """Extract midpoint frame for each beat and compute fingerprints.

    Args:
        raw_beats: (start_s, end_s, start_frame, end_frame) tuples; the list
            index becomes the ``beat_id``.
        cfg:       Application configuration (trailer path and vibe-check
                   fingerprint settings).

    Returns:
        One TrailerBeat per input tuple, in order. A beat whose midpoint
        frame cannot be decoded is still returned, without fingerprint
        fields, and a warning is logged.
    """
    vc_cfg = cfg.cv.vibe_check
    trailer_path = cfg.paths.reference_trailer
    beats: list[TrailerBeat] = []

    with open_video(trailer_path) as cap:
        for idx, (start_s, end_s, start_frame, end_frame) in enumerate(raw_beats):
            frame = grab_midpoint_frame(cap, start_s, end_s)

            # Fingerprint fields are passed only when a frame was decoded, so
            # a failed decode leaves TrailerBeat's own defaults in place.
            fingerprint_fields: dict = {}
            if frame is None:
                logger.warning("Beat %d: midpoint frame decode failed.", idx)
            else:
                luma_b, sat_b, phash = fingerprint_frame(frame, vc_cfg)
                fingerprint_fields = {
                    "luma_hist": luma_b,
                    "sat_hist": sat_b,
                    "phash": phash,
                }

            beats.append(TrailerBeat(
                beat_id=idx,
                trailer_path=trailer_path,
                start_s=start_s, end_s=end_s,
                start_frame=start_frame, end_frame=end_frame,
                **fingerprint_fields,
            ))

    return beats
def analyze_reference_trailer(
    cfg: AppConfig,
    transcribe_callback: TranscribeCallback | None = None,
    classify_callback: ClassifyCallback | None = None,
) -> list[TrailerBeat]:
    """
    Analyze the reference trailer end to end.

    Steps: cut detection, midpoint fingerprinting, then the two optional
    enrichment passes supplied by the caller.

    Args:
        cfg:                 Application configuration.
        transcribe_callback: Optional fn(path, start_s, end_s, offset_s)
                             → list[DialogueLine]. Injected to keep this
                             module free of faster-whisper imports.
        classify_callback:   Optional fn(beats) → beats with BeatType set.
                             Injected to keep this module LLM-free.

    Returns:
        List of TrailerBeat objects with fingerprints (and optionally
        dialogue + BeatType) populated. A failing callback is logged as a
        warning and the affected beats are returned unchanged.
    """
    # Steps 1 + 2 — cut detection, then fingerprinting
    beats = _fingerprint_beats(_detect_trailer_beats(cfg), cfg)

    # Step 3 — dialogue (optional)
    if transcribe_callback is not None:

        def with_dialogue(beat: TrailerBeat) -> TrailerBeat:
            """Return the beat with dialogue attached, or unchanged on failure."""
            try:
                lines = transcribe_callback(
                    beat.trailer_path,
                    beat.start_s,
                    beat.end_s,
                    beat.start_s,  # time_offset so timestamps are absolute
                )
                return replace(beat, dialogue=tuple(lines))
            except Exception as exc:
                logger.warning("Beat %d transcription failed: %s", beat.beat_id, exc)
                return beat

        beats = [with_dialogue(beat) for beat in beats]

    # Step 4 — LLM dramaturgy (optional)
    if classify_callback is not None:
        try:
            beats = classify_callback(beats)
        except Exception as exc:
            logger.warning("Beat classification failed: %s — keeping UNKNOWN.", exc)

    dialogue_count = 0
    classified_count = 0
    for beat in beats:
        if beat.dialogue:
            dialogue_count += 1
        if beat.beat_type != BeatType.UNKNOWN:
            classified_count += 1

    logger.info(
        "Trailer analysis complete: %d beats, %d with dialogue, %d classified.",
        len(beats),
        dialogue_count,
        classified_count,
    )
    return beats
CONFIG_PATH = Path(__file__).parents[1] / "config.toml"


# ---------------------------------------------------------------------------
# Config loader
# ---------------------------------------------------------------------------

class TestConfigLoader:
    """Smoke tests for loading the repository's config.toml."""

    def test_loads_without_error(self) -> None:
        assert isinstance(load_config(CONFIG_PATH), AppConfig)

    def test_project_meta(self) -> None:
        loaded = load_config(CONFIG_PATH)
        assert loaded.version == "2.0.0"
        assert loaded.log_level in ("DEBUG", "INFO", "WARNING", "ERROR")

    def test_cv_thresholds_in_range(self) -> None:
        deep_scan = load_config(CONFIG_PATH).cv.deep_scan
        assert 0.0 < deep_scan.match_threshold < 1.0
        assert deep_scan.coarse_step_seconds > 0

    def test_vibe_check_crop_fractions(self) -> None:
        vibe = load_config(CONFIG_PATH).cv.vibe_check
        top = vibe.crop_top_fraction
        bottom = vibe.crop_bottom_fraction
        assert 0.0 < top < 1.0
        assert 0.0 < bottom < 1.0
        # The two crops together must leave part of the frame.
        assert top + bottom < 1.0

    def test_missing_config_raises(self, tmp_path: Path) -> None:
        missing = tmp_path / "nonexistent.toml"
        with pytest.raises(FileNotFoundError):
            load_config(missing)

    def test_paths_are_path_objects(self) -> None:
        paths = load_config(CONFIG_PATH).paths
        assert isinstance(paths.source_movie, Path)
        assert isinstance(paths.reference_trailer, Path)


# ---------------------------------------------------------------------------
# Data models — construction & properties
# ---------------------------------------------------------------------------

class TestSceneModel:
    """Derived properties and immutability of Scene."""

    def test_duration(self) -> None:
        scene = Scene(
            scene_id=0,
            source_path=Path("dummy.mp4"),
            start_s=10.0,
            end_s=25.5,
            start_frame=240,
            end_frame=612,
        )
        assert scene.duration_s == pytest.approx(15.5)
        assert scene.midpoint_s == pytest.approx(17.75)

    def test_immutable(self) -> None:
        scene = Scene(
            scene_id=0, source_path=Path("x.mp4"),
            start_s=0.0, end_s=1.0,
            start_frame=0, end_frame=24,
        )
        with pytest.raises(Exception):  # FrozenInstanceError
            scene.scene_id = 99  # type: ignore[misc]


class TestTrailerBeatModel:
    """Defaults of TrailerBeat."""

    def test_beat_type_default(self) -> None:
        beat = TrailerBeat(
            beat_id=0, trailer_path=Path("trailer.mp4"),
            start_s=0.0, end_s=3.0,
            start_frame=0, end_frame=72,
        )
        assert beat.beat_type == BeatType.UNKNOWN


class TestMatchResultModel:
    """Computed duration and repr of MatchResult."""

    def test_duration_computed(self) -> None:
        result = MatchResult(
            beat_id=0, scene_id=3,
            source_path=Path("movie.mp4"),
            in_point_s=120.0,
            out_point_s=123.5,
            in_point_frame=2880,
            match_score=0.87,
        )
        assert result.duration_s == pytest.approx(3.5)

    def test_repr_contains_key_info(self) -> None:
        result = MatchResult(
            beat_id=1, scene_id=7,
            source_path=Path("movie.mp4"),
            in_point_s=60.0, out_point_s=63.0,
            in_point_frame=1440, match_score=0.91,
        )
        text = repr(result)
        assert "beat=1" in text
        assert "scene=7" in text


class TestEditTimeline:
    """Aggregate properties of EditTimeline."""

    def _make_clip(self, idx: int, t_start: float, t_end: float) -> EditClip:
        """Build a clip covering [t_start, t_end] whose match has the same length."""
        beat = TrailerBeat(
            beat_id=idx, trailer_path=Path("t.mp4"),
            start_s=t_start, end_s=t_end,
            start_frame=0, end_frame=1,
        )
        match = MatchResult(
            beat_id=idx, scene_id=0,
            source_path=Path("m.mp4"),
            in_point_s=0.0, out_point_s=t_end - t_start,
            in_point_frame=0, match_score=0.9,
        )
        return EditClip(
            clip_index=idx, beat=beat, match=match,
            timeline_start_s=t_start, timeline_end_s=t_end,
        )

    def test_total_duration(self) -> None:
        first = self._make_clip(0, 0.0, 5.0)
        second = self._make_clip(1, 5.0, 9.0)
        timeline = EditTimeline(
            title="Test Trailer", frame_rate=23.976, clips=(first, second)
        )
        assert timeline.total_duration_s == pytest.approx(9.0)
        assert timeline.clip_count == 2

    def test_empty_timeline(self) -> None:
        timeline = EditTimeline(title="Empty", frame_rate=24.0, clips=())
        assert timeline.total_duration_s == 0.0
# ---------------------------------------------------------------------------
# Helpers: build a tiny synthetic video on disk
# ---------------------------------------------------------------------------

FPS = 24
WIDTH = 320
HEIGHT = 240
SECS = 3


def _make_synthetic_video(path: Path, color_bgr: tuple[int, int, int] = (0, 128, 255)) -> Path:
    """Write a 3-second single-colour video to *path*."""
    codec = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(str(path), codec, float(FPS), (WIDTH, HEIGHT))
    solid_frame = np.full((HEIGHT, WIDTH, 3), color_bgr, dtype=np.uint8)
    frame_total = FPS * SECS
    for _ in range(frame_total):
        writer.write(solid_frame)
    writer.release()
    return path


@pytest.fixture
def synthetic_video(tmp_path: Path) -> Path:
    """Path to a freshly written synthetic test video."""
    target = tmp_path / "test.mp4"
    return _make_synthetic_video(target)


# ---------------------------------------------------------------------------
# open_video
# ---------------------------------------------------------------------------

class TestOpenVideo:
    """open_video as a context manager."""

    def test_opens_valid_file(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as cap:
            assert cap.isOpened()

    def test_raises_on_missing_file(self, tmp_path: Path) -> None:
        missing = tmp_path / "ghost.mp4"
        with pytest.raises(FileNotFoundError):
            with open_video(missing):
                pass


# ---------------------------------------------------------------------------
# get_video_info
# ---------------------------------------------------------------------------

class TestGetVideoInfo:
    """Metadata reported for the synthetic video."""

    def test_returns_correct_fps(self, synthetic_video: Path) -> None:
        assert get_video_info(synthetic_video)["fps"] == pytest.approx(FPS, rel=0.05)

    def test_duration_approx(self, synthetic_video: Path) -> None:
        assert get_video_info(synthetic_video)["duration_s"] == pytest.approx(SECS, rel=0.1)

    def test_resolution(self, synthetic_video: Path) -> None:
        info = get_video_info(synthetic_video)
        assert info["width"] == WIDTH
        assert info["height"] == HEIGHT


# ---------------------------------------------------------------------------
# grab_frame_at
# ---------------------------------------------------------------------------

class TestGrabFrameAt:
    """Single-frame grabs by timestamp."""

    def test_returns_ndarray(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as cap:
            grabbed = grab_frame_at(cap, 1.0)
        assert grabbed is not None
        assert isinstance(grabbed, np.ndarray)
        assert grabbed.shape == (HEIGHT, WIDTH, 3)

    def test_returns_none_past_end(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as cap:
            grabbed = grab_frame_at(cap, 9999.0)
        # May return None or a repeated last frame depending on codec;
        # we only assert no exception is raised.
        assert grabbed is None or isinstance(grabbed, np.ndarray)


# ---------------------------------------------------------------------------
# iter_frames_stepped
# ---------------------------------------------------------------------------

class TestIterFramesStepped:
    """Stepped iteration over a time range."""

    def test_yields_correct_count(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as cap:
            sampled = list(iter_frames_stepped(cap, 0.0, 1.0, 0.5))
        # Expect timestamps: 0.0, 0.5, 1.0 → 3 frames
        assert len(sampled) == 3

    def test_timestamps_increasing(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as cap:
            sampled = list(iter_frames_stepped(cap, 0.0, 2.0, 0.5))
        stamps = [stamp for stamp, _ in sampled]
        assert stamps == sorted(stamps)

    def test_invalid_step_raises(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as cap:
            with pytest.raises(ValueError, match="step_s"):
                list(iter_frames_stepped(cap, 0.0, 1.0, 0.0))


# ---------------------------------------------------------------------------
# text_safe_crop integration (sanity: cropped height consistent)
# ---------------------------------------------------------------------------

class TestCropSanity:
    """text_safe_crop applied to a decoded frame."""

    def test_crop_reduces_height(self, synthetic_video: Path) -> None:
        with open_video(synthetic_video) as cap:
            grabbed = grab_frame_at(cap, 0.5)
        assert grabbed is not None
        cropped = text_safe_crop(grabbed, 0.15, 0.30)
        full_height, full_width = grabbed.shape[0], grabbed.shape[1]
        assert cropped.shape[0] < full_height
        assert cropped.shape[1] == full_width  # width unchanged
# ---------------------------------------------------------------------------
# Timecode helpers
# ---------------------------------------------------------------------------

class TestSecondsToFcpxml:
    """Rational-time strings produced by seconds_to_fcpxml."""

    def test_zero(self) -> None:
        assert seconds_to_fcpxml(0.0, 24.0) == "0s"

    def test_one_second_at_24fps(self) -> None:
        # 1.0s @ 24fps → 24 frames → 24/24s = 1/1s
        assert seconds_to_fcpxml(1.0, 24.0) == "1/1s"

    def test_one_second_at_23976(self) -> None:
        # The exact fraction at 23.976 fps is not pinned here; only the
        # rational "<num>/<den>s" shape is checked.
        rendered = seconds_to_fcpxml(1.0, 23.976)
        assert rendered.endswith("s")
        assert "/" in rendered

    def test_ten_seconds_at_25fps(self) -> None:
        # 10s @ 25fps → 250 frames → 250/25s = 10/1s
        assert seconds_to_fcpxml(10.0, 25.0) == "10/1s"

    def test_rational_is_reduced(self) -> None:
        # Should never produce 24/24s
        from math import gcd

        rendered = seconds_to_fcpxml(1.0, 24.0)
        num, den = rendered.rstrip("s").split("/")
        assert gcd(int(num), int(den)) == 1


class TestSecondsToSmpte:
    """HH:MM:SS:FF strings produced by seconds_to_smpte."""

    def test_zero(self) -> None:
        assert seconds_to_smpte(0.0, 24.0) == "00:00:00:00"

    def test_one_minute(self) -> None:
        assert seconds_to_smpte(60.0, 25.0) == "00:01:00:00"

    def test_one_hour(self) -> None:
        assert seconds_to_smpte(3600.0, 24.0) == "01:00:00:00"

    def test_frames_overflow(self) -> None:
        # 25fps: 26 frames → 1s + 1 frame = 00:00:01:01
        assert seconds_to_smpte(26 / 25, 25.0) == "00:00:01:01"

    def test_format_length(self) -> None:
        fields = seconds_to_smpte(123.456, 23.976).split(":")
        assert len(fields) == 4
        assert all(len(field) == 2 for field in fields)


class TestFcpxmlHelpers:
    """Frame-duration, format-name and frame-count helpers."""

    def test_frame_duration_24fps(self) -> None:
        assert fcpxml_frame_duration(24.0) == "1/24s"

    def test_frame_duration_23976(self) -> None:
        # Should be "1001/24000s"
        assert fcpxml_frame_duration(23.976) == "1001/24000s"

    def test_format_name_1080p_2398(self) -> None:
        label = fcpxml_format_name(23.976, 1920, 1080)
        assert "1080" in label
        assert "2398" in label

    def test_frame_count_roundtrip(self) -> None:
        fps = 25.0
        seconds = 10.0
        assert seconds_to_frame_count(seconds, fps) == 250


# ---------------------------------------------------------------------------
# EDL writer (string output)
# ---------------------------------------------------------------------------

class TestEdlWriter:
    """Text content of the EDL written for a one-clip timeline."""

    def _make_timeline(self) -> "src.core.models.EditTimeline":  # type: ignore
        """Build a single-clip, 25 fps timeline titled "TestTrailer"."""
        from src.core.models import (
            BeatType, EditClip, EditTimeline, MatchResult, TrailerBeat,
        )

        beat = TrailerBeat(
            beat_id=0, trailer_path=Path("trailer.mp4"),
            start_s=0.0, end_s=5.0, start_frame=0, end_frame=120,
            beat_type=BeatType.HOOK,
        )
        match = MatchResult(
            beat_id=0, scene_id=3,
            source_path=Path("movie.mp4"),
            in_point_s=30.0, out_point_s=35.0,
            in_point_frame=720, match_score=0.88,
        )
        only_clip = EditClip(
            clip_index=0, beat=beat, match=match,
            timeline_start_s=0.0, timeline_end_s=5.0,
        )
        return EditTimeline(
            title="TestTrailer", frame_rate=25.0, clips=(only_clip,)
        )

    def test_edl_contains_title(self, tmp_path: Path) -> None:
        from src.core.config import load_config
        from src.export.edl_writer import write_edl

        cfg = load_config()
        timeline = self._make_timeline()
        written = write_edl(timeline, cfg, output_path=tmp_path / "test.edl")

        assert "TITLE: TestTrailer" in written.read_text(encoding="utf-8")

    def test_edl_has_event_line(self, tmp_path: Path) -> None:
        from src.core.config import load_config
        from src.export.edl_writer import write_edl

        cfg = load_config()
        timeline = self._make_timeline()
        written = write_edl(timeline, cfg, output_path=tmp_path / "test.edl")

        content = written.read_text(encoding="utf-8")
        assert "001" in content  # event number
        assert "AX" in content   # reel name
write_edl(tl, cfg, output_path=tmp_path / "test.edl") + + text = out.read_text(encoding="utf-8") + assert "001" in text # event number + assert "AX" in text # reel name + + +# --------------------------------------------------------------------------- +# FCPXML writer (XML structure) +# --------------------------------------------------------------------------- + +class TestFcpxmlWriter: + def _make_timeline(self) -> "src.core.models.EditTimeline": # type: ignore + from src.core.models import ( + BeatType, EditClip, EditTimeline, MatchResult, TrailerBeat, + ) + + beat = TrailerBeat( + beat_id=0, trailer_path=Path("trailer.mp4"), + start_s=0.0, end_s=5.0, start_frame=0, end_frame=120, + beat_type=BeatType.HOOK, + ) + match = MatchResult( + beat_id=0, scene_id=3, + source_path=Path("B:/Proxy/movie.mp4"), + in_point_s=30.0, out_point_s=35.0, + in_point_frame=720, match_score=0.88, + ) + clip = EditClip( + clip_index=0, beat=beat, match=match, + timeline_start_s=0.0, timeline_end_s=5.0, + ) + return EditTimeline( + title="TestTrailer", frame_rate=25.0, clips=(clip,) + ) + + def test_fcpxml_is_valid_xml(self, tmp_path: Path) -> None: + from xml.etree import ElementTree as ET + from src.core.config import load_config + from src.export.fcpxml_writer import write_fcpxml + + cfg = load_config() + tl = self._make_timeline() + out = write_fcpxml(tl, cfg, output_path=tmp_path / "test.fcpxml") + + text = out.read_text(encoding="utf-8") + text_no_doctype = "\n".join( + line for line in text.splitlines() + if not line.strip().startswith(" None: + from xml.etree import ElementTree as ET + from src.core.config import load_config + from src.export.fcpxml_writer import write_fcpxml + + cfg = load_config() + tl = self._make_timeline() + out = write_fcpxml(tl, cfg, output_path=tmp_path / "test.fcpxml") + + text = out.read_text(encoding="utf-8") + text_no_doctype = "\n".join( + line for line in text.splitlines() + if not line.strip().startswith(" np.ndarray: + """256×256 solid blue BGR 
frame.""" + frame = np.zeros((256, 256, 3), dtype=np.uint8) + frame[:, :] = (255, 0, 0) # BGR blue + return frame + + +@pytest.fixture +def solid_red_frame() -> np.ndarray: + """256×256 solid red BGR frame.""" + frame = np.zeros((256, 256, 3), dtype=np.uint8) + frame[:, :] = (0, 0, 255) # BGR red + return frame + + +# --------------------------------------------------------------------------- +# text_safe_crop +# --------------------------------------------------------------------------- + +class TestTextSafeCrop: + def test_removes_correct_rows(self, solid_blue_frame: np.ndarray) -> None: + cropped = text_safe_crop(solid_blue_frame, crop_top=0.15, crop_bottom=0.30) + h = solid_blue_frame.shape[0] # 256 + expected_h = int(h * (1.0 - 0.30)) - int(h * 0.15) + assert cropped.shape[0] == expected_h + + def test_zero_crop_returns_same_size(self, solid_blue_frame: np.ndarray) -> None: + cropped = text_safe_crop(solid_blue_frame, crop_top=0.0, crop_bottom=0.0) + assert cropped.shape == solid_blue_frame.shape + + def test_invalid_top_raises(self, solid_blue_frame: np.ndarray) -> None: + with pytest.raises(ValueError, match="crop_top"): + text_safe_crop(solid_blue_frame, crop_top=1.0, crop_bottom=0.0) + + def test_invalid_bottom_raises(self, solid_blue_frame: np.ndarray) -> None: + with pytest.raises(ValueError, match="crop_bottom"): + text_safe_crop(solid_blue_frame, crop_top=0.0, crop_bottom=-0.1) + + def test_overlapping_crops_raise(self, solid_blue_frame: np.ndarray) -> None: + with pytest.raises(ValueError, match="must be < 1.0"): + text_safe_crop(solid_blue_frame, crop_top=0.6, crop_bottom=0.5) + + +# --------------------------------------------------------------------------- +# Histograms +# --------------------------------------------------------------------------- + +class TestHistograms: + def test_output_shape(self, solid_blue_frame: np.ndarray) -> None: + luma, sat = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60) + assert luma.shape == (50,) + 
assert sat.shape == (60,) + + def test_normalised(self, solid_blue_frame: np.ndarray) -> None: + import numpy as np + luma, sat = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60) + # L2-normalised → norm ≈ 1.0 + assert np.linalg.norm(luma) == pytest.approx(1.0, abs=1e-5) + assert np.linalg.norm(sat) == pytest.approx(1.0, abs=1e-5) + + def test_same_frame_correl_is_one(self, solid_blue_frame: np.ndarray) -> None: + import cv2 + luma, _ = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60) + score = compare_histograms(luma, luma, method=cv2.HISTCMP_CORREL) + assert score == pytest.approx(1.0, abs=1e-5) + + def test_different_frames_correl_lower( + self, + solid_blue_frame: np.ndarray, + solid_red_frame: np.ndarray, + ) -> None: + import cv2 + luma_b, _ = extract_hs_histograms(solid_blue_frame, 50, 60) + luma_r, _ = extract_hs_histograms(solid_red_frame, 50, 60) + score = compare_histograms(luma_b, luma_r, method=cv2.HISTCMP_CORREL) + assert score < 1.0 + + +# --------------------------------------------------------------------------- +# Serialisation round-trip +# --------------------------------------------------------------------------- + +class TestSerialisation: + def test_round_trip(self, solid_blue_frame: np.ndarray) -> None: + luma, _ = extract_hs_histograms(solid_blue_frame, 50, 60) + restored = bytes_to_hist(hist_to_bytes(luma)) + np.testing.assert_array_almost_equal(luma, restored)