Initial project import
This commit is contained in:
@@ -0,0 +1,15 @@
|
||||
# =============================================================================
|
||||
# AI Trailer Generator v2 — Environment Variables
|
||||
# =============================================================================
|
||||
# Copy this file to .env and fill in your actual keys.
|
||||
# .env is listed in .gitignore and will NEVER be committed.
|
||||
# =============================================================================
|
||||
|
||||
# OpenRouter API key (required when [llm] provider = "openrouter")
|
||||
OPENROUTER_API_KEY=sk-or-v1-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
|
||||
# OpenAI API key (required when [llm] provider = "openai")
|
||||
# OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
|
||||
# Universal fallback (used if provider-specific key is not set)
|
||||
# LLM_API_KEY=
|
||||
+44
@@ -0,0 +1,44 @@
|
||||
# ---------------------------------------------------------------------------
|
||||
# AI Trailer Generator v2 — .gitignore
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*.pyo
|
||||
*.pyd
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
*.whl
|
||||
.venv/
|
||||
venv/
|
||||
.mypy_cache/
|
||||
.ruff_cache/
|
||||
.pytest_cache/
|
||||
|
||||
# Project-generated artefacts (potentially huge)
|
||||
.cache/
|
||||
output/
|
||||
proxy/
|
||||
*.mp4
|
||||
*.mov
|
||||
*.mxf
|
||||
*.wav
|
||||
*.mp3
|
||||
*.jpg
|
||||
*.jpeg
|
||||
*.png
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Secrets / local overrides
|
||||
.env
|
||||
config.local.toml
|
||||
@@ -0,0 +1,384 @@
|
||||
# AI Trailer Generator v2
|
||||
|
||||
**Frame-accurate trailer reconstruction via pure Computer Vision**
|
||||
|
||||
> Gib einen Reference Trailer und den dazugehörigen Quellfilm hinein — und bekomme eine fertige FCPXML/EDL heraus, die den Trailer Frame-genau aus dem Quellfilm nachbaut.
|
||||
|
||||
---
|
||||
|
||||
## Das Kernprinzip
|
||||
|
||||
Standardmäßig kein LLM für visuelles Matching. Optional kann ein Vision-Layer
|
||||
gecachte 3-Frame-Beschreibungen als zusätzliche Suchanker liefern; der finale
|
||||
Match bleibt aber CV-verifiziert.
|
||||
|
||||
| Phase | Was passiert | Technologie |
|
||||
|-------|-------------|-------------|
|
||||
| **0 — Prep** | Reference Trailer analysieren & Beats extrahieren | PySceneDetect + OpenCV |
|
||||
| **1 — Global Scan**| Gesamten Quellfilm via FFmpeg-Stream (2 FPS) gegen alle Beats scannen | FFmpeg Pipe + Luma-Histogramm |
|
||||
| **1b — Optional Vision Seeds** | Unsichere Top-K Szenen mit 3-Frame-Beschreibungen cachen | OpenAI-kompatibles Vision-LLM |
|
||||
| **2 — Refine** | Beste Treffer auf Frame-Ebene präzisieren | OpenCV `matchTemplate` |
|
||||
| **3 — Dramaturgie** | Narrative BeatType-Klassifikation aus Dialog-Text | OpenRouter LLM |
|
||||
| **4 — Export** | Timeline → FCPXML 1.10 oder CMX 3600 EDL | xml.etree + eigener Timecode-Layer |
|
||||
|
||||
**Text-Safe Crop:** Obere 15% und untere 30% des Frames werden vor jedem Vergleich ausgeblendet, um Title Cards, Logos und Letterbox zu ignorieren.
|
||||
|
||||
---
|
||||
|
||||
## Voraussetzungen
|
||||
|
||||
- Python **3.11+**
|
||||
- [ffmpeg](https://ffmpeg.org/download.html) im PATH (für Global Scan, Report-Preview-Clips und Whisper Audio-Extraktion)
|
||||
- CUDA-fähige GPU empfohlen (für faster-whisper; CPU funktioniert auch)
|
||||
|
||||
---
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Virtual Environment erstellen & aktivieren
|
||||
|
||||
```powershell
|
||||
# Im Projektordner
|
||||
python -m venv .venv
|
||||
.\.venv\Scripts\Activate.ps1
|
||||
|
||||
# Falls ExecutionPolicy blockiert:
|
||||
# Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
||||
```
|
||||
|
||||
### 2. Abhängigkeiten installieren
|
||||
|
||||
```powershell
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### 3. API-Key konfigurieren
|
||||
|
||||
```powershell
|
||||
# .env aus dem Template kopieren
|
||||
Copy-Item .env.example .env
|
||||
|
||||
# Dann .env öffnen und den echten Key eintragen:
|
||||
# OPENROUTER_API_KEY=sk-or-v1-...
|
||||
```
|
||||
|
||||
### 4. Videodateien eintragen
|
||||
|
||||
`config.toml` öffnen und die Pfade anpassen:
|
||||
|
||||
```toml
|
||||
[paths]
|
||||
source_movie = "B:/Proxy/DeinFilm_FTR.mp4"
|
||||
reference_trailer = "F:/Encodings/DeinFilm_Trailer.mp4"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Verwendung
|
||||
|
||||
```powershell
|
||||
# Vollständige Pipeline (analyze → match → report → export)
|
||||
python cli.py run
|
||||
|
||||
# Ohne Whisper-Transkription (schneller)
|
||||
python cli.py run --no-audio
|
||||
|
||||
# Ohne LLM-Klassifikation
|
||||
python cli.py run --no-audio --no-llm
|
||||
|
||||
# Schrittweise
|
||||
python cli.py analyze # Reference Trailer → Beats erkennen
|
||||
python cli.py match # Globaler FFmpeg Scan (Szenen-unabhängig)
|
||||
python cli.py report # HTML Report mit Video-Vergleich bauen
|
||||
python cli.py export --format both # FCPXML + EDL ausgeben
|
||||
|
||||
# Gezielt nur einen Beat bearbeiten (empfohlen für erste Iterationen)
|
||||
python cli.py match --beat 5
|
||||
python cli.py match --beat 5 --vision # optionale gecachte Vision-Seeds
|
||||
python cli.py report --beat 5
|
||||
python cli.py export --beat 5 --format both
|
||||
|
||||
# Fehlerhafte Matches korrigieren
|
||||
python cli.py rematch --beat 5 --threshold 0.50 # Schwelle anpassen (Globaler Scan wird für diesen Beat wiederholt)
|
||||
python cli.py rematch --beat 5 --refine # Cached Match per lokalem Bildinhalt-Offset nachschärfen
|
||||
```
|
||||
|
||||
Der HTML-Report regeneriert seine Preview-Clips bei jedem Lauf mit genauer
|
||||
FFmpeg-Nachsuche und synchronisiert die beiden Video-Player pro Beat. Dadurch
|
||||
ist der Report zur Frame-Prüfung geeignet und zeigt keine alten gecachten
|
||||
Preview-Clips.
|
||||
Source-Previews bekommen bei Trailer-only-Tails denselben schwarzen Tail wie der
|
||||
Export, damit der Browser nicht einen zu kurzen Source-Clip gegen den längeren
|
||||
Referenzbeat weiterspult oder loopt.
|
||||
Zur Synchronprüfung rendert der Report ein einzelnes Frame-Locked-Compare-Video
|
||||
mit Referenz und Source in demselben MP4-Stream. Dieses Compare-Video ist
|
||||
maßgeblich, weil zwei getrennte Browser-Videoelemente nie zuverlässig
|
||||
framegenau synchron bleiben.
|
||||
|
||||
Wenn ein Trailer-Beat am Ende eine Blende, Schwarzfläche oder Textkarte enthält,
|
||||
die im Source-Film nicht als normaler Shot vorhanden ist, endet der Source-Match
|
||||
am letzten stabil passenden Frame. Exportierte Timelines behalten trotzdem die
|
||||
volle Beat-Länge und fügen danach automatisch einen schwarzen Trailer-Tail mit
|
||||
Marker für Fade/Dissolve ein.
|
||||
|
||||
Gezielte Ein-Beat-Matches nutzen zusätzlich vorhandene automatische Nachbarbeats
|
||||
aus dem Cache als zeitliche Suchanker. Das hilft bei aufeinanderfolgenden Shots,
|
||||
ohne manuelle Szenen oder Timecodes zu kuratieren.
|
||||
Bei `match --beat N` wird ein alter Cache-Treffer für genau diesen Beat entfernt
|
||||
und nur ein neu gefundener automatischer Treffer wieder eingetragen. Ein
|
||||
fehlgeschlagener neuer Lauf kann dadurch keinen alten falschen Report-Treffer
|
||||
stehen lassen.
|
||||
|
||||
Der globale Bildvergleich arbeitet auf kontrast-normalisierten Luma- und
|
||||
Kantenfeatures statt auf rohen Farb-Pixeln. Dadurch bleiben Schwarzweiß- oder
|
||||
anders gegradete Trailerbilder mit dem Source-Material vergleichbar, während
|
||||
unähnliche Farbshots schlechter ranken.
|
||||
Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen
|
||||
groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets
|
||||
verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller
|
||||
als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls.
|
||||
Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese
|
||||
Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst
|
||||
den Inpoint bestimmt.
|
||||
`rematch --refine` nutzt denselben lokalen FFmpeg/Pillow-Aligner und schreibt
|
||||
den korrigierten Inpoint direkt zurück in `.cache/match_results.json`.
|
||||
|
||||
Zusätzlich werden aus den besten szenenweiten Luma/Histogramm-Kandidaten
|
||||
mehrere Inpoint-Suchanker erzeugt. Diese Scene-Seeds verwenden keine harte
|
||||
pHash-Sperre, weil pHash bei stark anders gegradeten Trailerbildern echte
|
||||
Matches zu früh ausschließen kann.
|
||||
Optional kann `python cli.py match --beat N --vision` einen Vision-Layer
|
||||
zuschalten. Dann werden pro Trailer-Beat und pro wenigen Scene-Level-Kandidaten
|
||||
je drei Frames (Anfang, Mitte, Ende) von einem visionfähigen OpenAI-kompatiblen
|
||||
Modell beschrieben. Die Beschreibungen liegen in
|
||||
`.cache/vision_descriptions.json` und werden wiederverwendet. Vision erzeugt
|
||||
nur zusätzliche Suchanker; der eigentliche Match muss weiterhin durch CV,
|
||||
Content-Reranking, Timing und Duration-Coverage bestätigt werden.
|
||||
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
|
||||
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
|
||||
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
|
||||
Vision-Szenen echte Treffer nicht verdrängen. Für schnelle Experimente kann
|
||||
`skip_coarse_scan_with_weighted_seeds = true` gesetzt werden.
|
||||
Gewichtete Vision-Seeds werden nicht zuerst durch den alten Midpoint-Template
|
||||
Refine verschoben; sie gehen direkt in die lokale Content-Alignment-Prüfung.
|
||||
Das schützt wiederholte Gesprächseinstellungen, bei denen ähnliche Momente
|
||||
mehrfach in derselben Szene vorkommen.
|
||||
Innerhalb der automatisch von Vision vorgeschlagenen Szenen läuft zusätzlich
|
||||
eine dichte lokale Bildsequenzsuche. Sie misst den Phasenversatz in kleinen
|
||||
Zeitschritten direkt am Bildinhalt und bevorzugt Kandidaten mit genügend
|
||||
Restdauer in derselben Source-Szene. Das ist kein manueller Override: Vision
|
||||
grenzt nur Suchbereiche ein, die Auswahl bleibt Content-, Timing- und
|
||||
Coverage-getrieben.
|
||||
Nach einem dichten Vision-Treffer darf der spätere lokale Aligner nur noch im
|
||||
Bereich dieses Scan-Schritts nachjustieren. So kann ein korrekt gefundener
|
||||
Bewegungsmoment nicht wieder um viele Frames in eine ähnlich aussehende Phase
|
||||
derselben Szene verschoben werden.
|
||||
Wenn mehrere Vision-Kandidaten in derselben Source-Szene ähnlich gut scoren
|
||||
und die Beat-Dauer abdecken, bevorzugt der Matcher die frühere Phase. Das
|
||||
verhindert, dass ein späterer, minimal stärkerer Standbildtreffer die
|
||||
Bewegungsphase des Trailers sichtbar überholt.
|
||||
Enthält ein Trailerbeat selbst einen harten Umschnitt, werden Kandidaten an
|
||||
angrenzenden Source-Szenengrenzen zusätzlich als zusammenhängender Multi-Shot-
|
||||
Span geprüft. Ein Match darf dann über eine Source-Szenengrenze laufen, aber
|
||||
nur wenn die relative Source-Grenze zeitlich zu einem erkannten Trailer-Umschnitt
|
||||
passt. So kann ein Beat aus Frage/Antwort-Shots vollständig erfasst werden,
|
||||
ohne Szenen willkürlich zusammenzukleben.
|
||||
Auch der lokale Content-Aligner darf einen Inpoint nur noch übernehmen, wenn
|
||||
die feste Whole-Frame-/Spatial-Validation dadurch besser wird.
|
||||
Vor dem teuren Frame-Refine wird der gesamte Kandidatenpool mit einer schnellen
|
||||
festen Inhaltsprüfung neu sortiert. Dadurch können korrekte Treffer aus
|
||||
wiederholten Einstellungen einer Szene nach oben kommen, auch wenn ein freier
|
||||
Template-Peak an anderer Stelle numerisch stärker war. Suchanker bleiben im
|
||||
Pool erhalten, dürfen aber erst nach der Inhaltsprüfung nach oben rücken. Wenn
|
||||
ein Kandidat visuell plausibel ist, aber wegen Trailerblende oder kurzem
|
||||
Source-Span die normale Coverage knapp verfehlt, wird er als provisional Match
|
||||
behalten statt als `NO MATCH` verworfen.
|
||||
Dieses Reranking berücksichtigt zusätzlich die verbleibende Szenenlänge ab dem
|
||||
Kandidaten-Inpoint. Dadurch werden zu späte ähnliche Gesprächsphasen innerhalb
|
||||
derselben Szene nicht mehr vor frühere, tragfähigere Phasen sortiert.
|
||||
Das Inhalts-Reranking nutzt bewusst nur wenige repräsentative Referenzframes und
|
||||
eine begrenzte Kandidatenzahl. So bleiben wiederholte Szenen auffindbar, ohne
|
||||
dass der Lauf durch tausende Random-Seeks minutenlang festhängt.
|
||||
Confirmed Matches werden zusätzlich durch eine feste nahezu-Whole-Frame-Prüfung
|
||||
aus Luma, Kanten, Farbhistogramm und räumlichen 4x4-Farbhistogrammen gedeckelt.
|
||||
Dadurch kann ein freier Template-Hit mit ähnlicher Fenster-/Gesichtsstruktur
|
||||
nicht mehr als sicherer Match gelten, wenn die Gesamtkomposition oder die
|
||||
Bewegungsphase sichtbar eine andere Szene ist.
|
||||
Für gewichtete Vision-Kandidaten gibt es zusätzlich eine eigene Provisional-
|
||||
Bewertung aus Content-Score, Restdauer und Seed-Stärke. Dadurch können echte,
|
||||
aber durch Trailer-Grading/Crop numerisch schwache Treffer im Report landen,
|
||||
ohne als confirmed Match durchzugehen.
|
||||
Die Cache-Normalisierung für Report/Export verwendet dieselbe niedrigere
|
||||
Content-Untergrenze für nicht bestätigte Vision-Provisional-Treffer, damit ein
|
||||
gerade gefundener automatischer Match nicht beim Report-Aufbau wieder
|
||||
weggefiltert wird.
|
||||
Sie übernimmt auch die Multi-Shot-Coverage-Regel: gecachte Treffer, die passend
|
||||
zu internen Trailer-Umschnitten über angrenzende Source-Szenen laufen, werden
|
||||
nicht mehr auf die erste Source-Szene zurückgekürzt.
|
||||
Gezielte Einzel-Beat-Matches gewichten außerdem die automatisch aus Nachbarbeats
|
||||
abgeleiteten Continuity-Seeds. Wenn ein Beat direkt an einen bereits passenden
|
||||
Vorgänger anschließt, kann ein späterer ähnlich aussehender Moment derselben
|
||||
Dialogszene den erwarteten Anschluss nicht mehr nur wegen eines höheren
|
||||
Standbildscores verdrängen.
|
||||
Diese Continuity-Seeds sind aber nur Suchanker: in derselben Szene darf ein
|
||||
späterer Inpoint gewinnen, wenn die mehrframeige Content-Prüfung die
|
||||
Bewegungsphase klar besser trifft. Dadurch bleiben Anschlussmatches stabil,
|
||||
ohne Hand-/Kopfbewegungen auf einen falschen Zeitpunkt festzunageln.
|
||||
Continuity- und Vision-Seeds allein schalten den globalen FFmpeg-Scan
|
||||
standardmäßig nicht ab. Sie sind Suchanker, keine Beweise; der volle CV-Scan
|
||||
bleibt aktiv, damit semantisch plausible, aber falsche Vision-Treffer echte
|
||||
Bildmatches nicht verdrängen.
|
||||
Lange Trailerbeats werden nicht mehr automatisch über ihre gesamte Beat-Länge
|
||||
gegen einen einzigen Source-Clip validiert. Sobald nach einem sichtbaren
|
||||
Source-Abschnitt eine anhaltende Schwarzblende oder Titel-/Credit-Insel beginnt,
|
||||
endet der matchbare Referenzbereich dort; zwei aufeinanderfolgende dunkle
|
||||
Samples reichen dafür. Spätere Text-/Creditbilder im selben Beat gehen damit
|
||||
nicht mehr in Reranking, Validation oder Span-Schätzung ein.
|
||||
Zusätzlich werden sehr dunkle, kontrastarme oder noch nicht sauber
|
||||
auf-/abgeblendete Referenzframes aus Score, Inhalts-Reranking,
|
||||
Phasen-Alignment und Motion-Templates herausgenommen. Blenden sollen bestimmen,
|
||||
wie der Clip später exportiert wird, aber nicht, ob der Bildinhalt als Match
|
||||
gilt.
|
||||
Treffer unter `provisional_content_threshold` werden gar nicht mehr gespeichert
|
||||
oder aus alten Cache-Ergebnissen übernommen. Das verhindert, dass offensichtlich
|
||||
falsche Szenen im Report als Match-Kandidat weiterleben.
|
||||
|
||||
### Log-Level
|
||||
|
||||
```powershell
|
||||
python cli.py run --log-level DEBUG
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Projektstruktur
|
||||
|
||||
```
|
||||
ai_trailer_2026/
|
||||
│
|
||||
├── config.toml ← Alle Parameter (kein Hardcoding im Code)
|
||||
├── .env ← API-Keys (NICHT committen)
|
||||
├── cli.py ← Einstiegspunkt
|
||||
│
|
||||
├── src/
|
||||
│ ├── core/
|
||||
│ │ ├── config.py load_config() → AppConfig (frozen dataclasses)
|
||||
│ │ └── models.py Scene, TrailerBeat, VibeHit, MatchResult, EditTimeline
|
||||
│ ├── cv/
|
||||
│ │ ├── fingerprinting.py Text-Safe Crop · HS-Histogramme · pHash
|
||||
│ │ ├── vibe_check.py Phase 1: Histogram+pHash Filter
|
||||
│ │ ├── scene_indexer.py PySceneDetect → Fingerprint → JSON-Cache
|
||||
│ │ ├── frame_extractor.py VideoCapture-Wrapper
|
||||
│ │ └── deep_scan.py Phase 2: Coarse+Refine Template-Matching
|
||||
│ ├── audio/
|
||||
│ │ └── transcriber.py faster-whisper Transkription
|
||||
│ ├── llm/
|
||||
│ │ ├── dramaturg.py OpenRouter → BeatType (Dialog/Dramaturgie)
|
||||
│ │ └── vision_cache.py optionale gecachte 3-Frame Vision-Seeds
|
||||
│ ├── pipeline/
|
||||
│ │ ├── trailer_analyzer.py Reference-Trailer → TrailerBeat[]
|
||||
│ │ └── matcher.py Orchestrierung + EditTimeline-Builder
|
||||
│ └── export/
|
||||
│ ├── timecode.py Sekunden ↔ FCPXML-Rational ↔ SMPTE
|
||||
│ ├── fcpxml_writer.py FCPXML 1.10
|
||||
│ └── edl_writer.py CMX 3600 EDL
|
||||
│
|
||||
├── output/ ← FCPXML/EDL Output (gitignored)
|
||||
├── .cache/ ← Szenen-Index + Match-Ergebnisse (gitignored)
|
||||
└── tests/ 52 Unit-Tests (pytest)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Cache-Verhalten
|
||||
|
||||
Damit nicht bei jedem Lauf der gesamte Quellfilm neu analysiert werden muss:
|
||||
|
||||
| Datei | Inhalt | Neu bauen mit |
|
||||
|-------|--------|---------------|
|
||||
| `.cache/scene_index.json` | Alle Quellfilm-Szenen + Fingerprints | `--force-reindex` |
|
||||
| `.cache/trailer_beats.json` | Erkannte Trailer-Beats | `python cli.py analyze` erneut |
|
||||
| `.cache/match_results.json` | CV-Matching-Ergebnisse | `python cli.py match` erneut |
|
||||
| `.cache/vision_descriptions.json` | Optionale 3-Frame Vision-Beschreibungen für Beats/Szenen | löschen oder anderes Vision-Modell konfigurieren |
|
||||
|
||||
---
|
||||
|
||||
## Tests
|
||||
|
||||
```powershell
|
||||
pytest tests/ -v
|
||||
```
|
||||
|
||||
Alle Tests laufen ohne echte Videodateien (synthetische Frames via numpy/OpenCV).
|
||||
|
||||
---
|
||||
|
||||
## Konfiguration (Auszug)
|
||||
|
||||
Alle Werte in `config.toml` — keine hardgecodeten Konstanten im Code.
|
||||
|
||||
```toml
|
||||
[cv.vibe_check]
|
||||
top_k_candidates = 10 # Top-K Kandidaten für Deep Scan
|
||||
phash_max_distance = 12 # Hamming-Distanz Schwelle (0–64)
|
||||
crop_top_fraction = 0.15 # Obere 15% ausblenden (Logos)
|
||||
crop_bottom_fraction = 0.30 # Untere 30% ausblenden (Letterbox/Subs)
|
||||
|
||||
[cv.deep_scan]
|
||||
coarse_step_seconds = 0.5 # Scan-Schrittgröße (Coarse Pass)
|
||||
match_threshold = 0.65 # Mindestscore für bestätigte automatische Matches
|
||||
provisional_match_threshold = 0.45 # Niedrigere automatische Kandidaten im Report zeigen
|
||||
coarse_candidate_threshold = 0.50 # Niedrigeres Gate vor Multi-Frame-Refine
|
||||
refine_window_seconds = 0.6 # Suchfenster für framegenaue Inpoint-Feinjustage
|
||||
refine_step_seconds = 0.04 # ~1 Frame bei 25fps (Refine Pass)
|
||||
content_align_window_seconds = 0.48 # Lokales Suchfenster um einen groben Treffer
|
||||
content_align_sample_step_s = 0.28 # Referenzframes für direkten Bildinhalt-Offset
|
||||
content_validation_weight = 0.35 # Gewicht der festen Whole-Frame-/Spatial-Endprüfung
|
||||
provisional_content_threshold = 0.42 # Untergrenze für Report-/Cache-Kandidaten
|
||||
start_tie_break_score_delta = 0.015 # Bei fast gleichen Scores früheren Inpoint wählen
|
||||
start_preroll_frames = 0 # Kein pauschaler Start-Ausgleich; Offset kommt aus Bildinhalt
|
||||
sequence_candidate_count = 240 # Breiter Kandidatenpool vor Inhalts-Reranking
|
||||
max_refine_candidates = 6 # Teurer Frame-Refine läuft nur auf den besten Inhaltskandidaten
|
||||
scene_seed_top_k = 30 # Scene-Level-Kandidaten als zusätzliche Suchanker
|
||||
scene_seed_points_per_scene = 6 # Inpoint-Samples pro Scene-Level-Kandidat
|
||||
content_rerank_candidate_count = 100 # Grobe Kandidaten vor Inhalts-Reranking
|
||||
skip_coarse_scan_with_weighted_seeds = false # Vision-Seeds nur als Hinweise; Vollscan bleibt robust
|
||||
sequence_score_weight = 0.55 # Gewicht für mehrere zeitliche Vergleichsframes
|
||||
span_score_weight = 0.15 # Gewicht für Stabilität bis zum Beat-Ende
|
||||
coarse_score_weight = 0.10 # Gewicht des groben Midpoint-Treffers
|
||||
duration_score_weight = 0.20 # Gewicht für nutzbare Länge des Source-Treffers
|
||||
duration_tie_break_score_delta = 0.03 # Bei ähnlichem Score längeren Treffer bevorzugen
|
||||
min_duration_coverage = 0.65 # Treffer muss mindestens 65% des matchbaren Referenzanteils tragen
|
||||
continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0] # Suchanker um gematchte Nachbarbeats
|
||||
span_sample_step_s = 0.08 # Schrittweite für End-/Drift-Erkennung
|
||||
trim_tail_frames = 4 # Sicherheitsabstand gegen kurze Blitzer am Ende
|
||||
scene_boundary_epsilon_s = 0.12 # Szenengrenzen-Toleranz gegen 1-2 Frame Cut-Drift
|
||||
scoreable_luma_mean_min = 24.0 # Zu dunkle/Fade-Frames nicht scoren
|
||||
scoreable_luma_p90_min = 58.0 # Helle Bildanteile müssen sichtbar genug sein
|
||||
scoreable_contrast_min = 24.0 # Kontrastarme Blenden/Titelinseln ignorieren
|
||||
|
||||
[vision]
|
||||
enabled = false # Kostenkontrolle: per CLI mit --vision aktivierbar
|
||||
model = "google/gemma-4-31b-it" # Muss ein visionfähiges OpenAI-kompatibles Modell sein
|
||||
scene_candidate_top_k = 8 # Nur wenige Top-Szenen pro Beat beschreiben
|
||||
max_new_descriptions_per_run = 12 # API-Kosten pro Lauf begrenzen
|
||||
max_seed_scenes = 3 # Nur beste Vision-Szenen als Suchanker weitergeben
|
||||
seed_points_per_scene = 12 # Inpoint-Samples pro Vision-Szene
|
||||
seed_score = 0.88 # Vision-Seeds bekommen mehr Priorität als normale Scene-Seeds
|
||||
max_refine_candidates = 6 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene
|
||||
local_scan_step_s = 0.12 # Dichte lokale Bildsuche in Vision-Szenen
|
||||
local_scan_max_points_per_scene = 180 # Laufzeitgrenze pro Source-Szene
|
||||
local_scan_top_candidates = 18 # Beste lokale Kandidaten gehen ins Refinement
|
||||
local_scan_tie_break_score_delta = 0.08 # Ähnliche Vision-Treffer: frühere Phase bevorzugen
|
||||
multi_shot_cut_corr_threshold = 0.20 # Interne Trailer-Umschnitte erkennen
|
||||
multi_shot_boundary_tolerance_s = 0.20 # Source-Grenze muss zum Trailer-Cut passen
|
||||
fullscan_fallback = false # Nur relevant, wenn skip_coarse_scan_with_weighted_seeds=true ist
|
||||
content_threshold = 0.22 # Lockeres Content-Gate nur für gewichtete Vision-Seeds
|
||||
similarity_threshold = 0.18 # Mindest-Textähnlichkeit für Vision-Seeds
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Lizenz
|
||||
|
||||
Internes Tool — nicht für den öffentlichen Vertrieb.
|
||||
@@ -0,0 +1,899 @@
|
||||
"""
|
||||
cli.py — AI Trailer Generator v2 — Command-Line Interface
|
||||
|
||||
Usage:
|
||||
python cli.py analyze [--config CONFIG] [--no-audio] [--no-llm]
|
||||
python cli.py match [--config CONFIG] [--force-reindex]
|
||||
python cli.py rematch --beat N [--threshold F] [--refine]
|
||||
python cli.py report [--config CONFIG]
|
||||
python cli.py run [--config CONFIG] [--force-reindex] [--no-audio] [--no-llm]
|
||||
python cli.py export [--config CONFIG] [--format fcpxml|edl|both]
|
||||
|
||||
On --no-audio / --no-llm:
|
||||
These flags do NOT affect matching quality.
|
||||
Whisper and the LLM only assign narrative labels (HOOK/SETUP/CLIMAX)
|
||||
to beats in the export metadata. The CV pipeline is identical either way.
|
||||
Use them for fast iterations: they skip large model downloads.
|
||||
|
||||
All heavy imports are deferred so --help is instant.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging setup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _setup_logging(level: str = "INFO") -> None:
|
||||
# Force UTF-8 for Windows console emoji printing
|
||||
if sys.stdout.encoding != 'utf-8':
|
||||
sys.stdout.reconfigure(encoding='utf-8')
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s %(levelname)-8s %(name)s — %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
level=getattr(logging, level.upper(), logging.INFO),
|
||||
stream=sys.stdout,
|
||||
)
|
||||
logging.getLogger("PIL").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def _ensure_utf8_console() -> None:
|
||||
"""Make argparse help safe on Windows before logging is configured."""
|
||||
if sys.stdout.encoding != "utf-8":
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cache helpers (match results ↔ JSON)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _results_cache_path(cfg: "AppConfig") -> Path: # type: ignore[name-defined]
|
||||
return cfg.paths.cache_dir / "match_results.json"
|
||||
|
||||
|
||||
def _save_results(results: list, cfg: "AppConfig") -> None:  # type: ignore[name-defined]
    """Write the given match results to the JSON results cache.

    Each result becomes one JSON object; its ``segments`` (if the attribute
    exists) become a nested list. The cache directory is created on demand.
    """
    from src.core.models import MatchResult  # imported but not referenced below

    def _segment_payload(seg) -> dict:
        # Plain-dict view of one segment, keys in cache-file order.
        return {
            "trailer_offset_s": seg.trailer_offset_s,
            "duration_s": seg.duration_s,
            "scene_id": seg.scene_id,
            "in_point_s": seg.in_point_s,
            "out_point_s": seg.out_point_s,
            "match_score": seg.match_score,
            "is_confirmed": seg.is_confirmed,
        }

    def _result_payload(res) -> dict:
        # Plain-dict view of one result; paths and tuples become JSON-friendly.
        return {
            "beat_id": res.beat_id,
            "scene_id": res.scene_id,
            "source_path": str(res.source_path),
            "in_point_s": res.in_point_s,
            "out_point_s": res.out_point_s,
            "in_point_frame": res.in_point_frame,
            "match_score": res.match_score,
            "match_location": list(res.match_location),
            "is_confirmed": res.is_confirmed,
            "segments": [
                _segment_payload(seg) for seg in getattr(res, "segments", ())
            ],
        }

    payload = [_result_payload(res) for res in results]

    target = _results_cache_path(cfg)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    logging.getLogger(__name__).info("Match results cached → %s", target)
|
||||
|
||||
|
||||
def _load_results(cfg: "AppConfig") -> list:  # type: ignore[name-defined]
    """Read the JSON results cache back into ``MatchResult`` objects.

    Raises:
        FileNotFoundError: If the cache file does not exist yet.
    """
    from src.core.models import MatchResult, MatchSegment

    cache_file = _results_cache_path(cfg)
    if not cache_file.exists():
        raise FileNotFoundError(f"No cached results at {cache_file}. Run 'match' first.")

    def _to_segment(item: dict):
        # Rebuild one segment, coercing the numeric fields explicitly.
        return MatchSegment(
            trailer_offset_s=float(item["trailer_offset_s"]),
            duration_s=float(item["duration_s"]),
            scene_id=int(item["scene_id"]),
            in_point_s=float(item["in_point_s"]),
            out_point_s=float(item["out_point_s"]),
            match_score=float(item["match_score"]),
            is_confirmed=bool(item.get("is_confirmed", True)),
        )

    loaded = []
    for entry in json.loads(cache_file.read_text(encoding="utf-8")):
        loaded.append(
            MatchResult(
                beat_id=entry["beat_id"],
                scene_id=entry["scene_id"],
                source_path=Path(entry["source_path"]),
                in_point_s=entry["in_point_s"],
                out_point_s=entry["out_point_s"],
                in_point_frame=entry["in_point_frame"],
                match_score=entry["match_score"],
                match_location=tuple(entry["match_location"]),
                # Entries without the flag are treated as confirmed.
                is_confirmed=entry.get("is_confirmed", True),
                # Entries without segments yield an empty tuple.
                segments=tuple(
                    _to_segment(item) for item in entry.get("segments", ())
                ),
            )
        )
    return loaded
|
||||
|
||||
|
||||
def _load_scene_cache_light(cfg) -> list[dict]:
|
||||
p = cfg.paths.cache_dir / "scene_index.json"
|
||||
if not p.exists():
|
||||
return []
|
||||
return json.loads(p.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def _scene_fps_light(scene: dict, cfg) -> float:
|
||||
duration_s = max(0.0, float(scene["end_s"]) - float(scene["start_s"]))
|
||||
frame_count = max(0, int(scene["end_frame"]) - int(scene["start_frame"]))
|
||||
return frame_count / duration_s if duration_s > 0 and frame_count > 0 else cfg.export.edl_frame_rate
|
||||
|
||||
|
||||
def _scene_for_time_light(scenes: list[dict], t_sec: float, cfg) -> dict | None:
|
||||
for idx, scene in enumerate(scenes):
|
||||
if float(scene["start_s"]) <= t_sec < float(scene["end_s"]):
|
||||
if (
|
||||
float(scene["end_s"]) - t_sec <= cfg.cv.deep_scan.scene_boundary_epsilon_s
|
||||
and idx + 1 < len(scenes)
|
||||
):
|
||||
return scenes[idx + 1]
|
||||
return scene
|
||||
return None
|
||||
|
||||
|
||||
def _scene_by_id_light(scenes: list[dict], scene_id: int) -> dict | None:
|
||||
return next((s for s in scenes if int(s["scene_id"]) == scene_id), None)
|
||||
|
||||
|
||||
def _contiguous_duration_light(beat, in_point_s: float, scenes: list[dict], cfg, matchable_duration_s: float) -> float:
|
||||
if matchable_duration_s <= 0:
|
||||
return 0.0
|
||||
try:
|
||||
from src.cv.global_scan import _reference_internal_cut_offsets
|
||||
cut_offsets = _reference_internal_cut_offsets(beat, cfg)
|
||||
except Exception:
|
||||
cut_offsets = []
|
||||
|
||||
start_idx = None
|
||||
for idx, scene in enumerate(scenes):
|
||||
if float(scene["start_s"]) <= in_point_s < float(scene["end_s"]):
|
||||
start_idx = idx
|
||||
break
|
||||
if start_idx is None:
|
||||
return 0.0
|
||||
|
||||
target_end = in_point_s + matchable_duration_s
|
||||
current_end = in_point_s
|
||||
for scene in scenes[start_idx:]:
|
||||
scene_end = float(scene["end_s"])
|
||||
if target_end <= scene_end:
|
||||
return matchable_duration_s
|
||||
|
||||
boundary_offset = scene_end - in_point_s
|
||||
if not any(
|
||||
abs(boundary_offset - cut_offset) <= cfg.vision.multi_shot_boundary_tolerance_s
|
||||
for cut_offset in cut_offsets
|
||||
):
|
||||
tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / _scene_fps_light(scene, cfg))
|
||||
return max(0.0, scene_end - in_point_s - tail_s)
|
||||
current_end = scene_end
|
||||
|
||||
return max(0.0, current_end - in_point_s)
|
||||
|
||||
|
||||
def _normalize_cached_results(beats: list, results: list, cfg) -> list:
|
||||
"""
|
||||
Re-apply current generic timing rules to cached results.
|
||||
|
||||
This keeps old automatic cache entries from preserving obsolete scene-boundary
|
||||
or tail-trim behavior without introducing manual per-beat truth.
|
||||
"""
|
||||
from dataclasses import replace
|
||||
|
||||
scenes = _load_scene_cache_light(cfg)
|
||||
if not scenes:
|
||||
return results
|
||||
|
||||
beats_by_id = {b.beat_id: b for b in beats}
|
||||
normalized = []
|
||||
for result in results:
|
||||
beat = beats_by_id.get(result.beat_id)
|
||||
if result.match_score < cfg.cv.deep_scan.provisional_match_threshold:
|
||||
continue
|
||||
|
||||
scene = _scene_for_time_light(scenes, result.in_point_s, cfg)
|
||||
declared_scene = _scene_by_id_light(scenes, result.scene_id)
|
||||
|
||||
# If the automatic matcher selected a scene but its in-point sits just
|
||||
# before that scene's detected start, treat this as scene-boundary drift
|
||||
# and clamp to the declared scene. This is generic: no beat IDs, no
|
||||
# manual timestamps, just consistent scene/time reconciliation.
|
||||
if declared_scene is not None:
|
||||
declared_start = float(declared_scene["start_s"])
|
||||
declared_end = float(declared_scene["end_s"])
|
||||
declared_fps = _scene_fps_light(declared_scene, cfg)
|
||||
boundary_tolerance_s = (
|
||||
cfg.cv.deep_scan.scene_boundary_epsilon_s
|
||||
+ cfg.cv.deep_scan.start_preroll_frames / declared_fps
|
||||
)
|
||||
if declared_start - boundary_tolerance_s <= result.in_point_s < declared_end:
|
||||
scene = declared_scene
|
||||
|
||||
if beat is None or scene is None:
|
||||
normalized.append(result)
|
||||
continue
|
||||
|
||||
fps = _scene_fps_light(scene, cfg)
|
||||
adjusted_in_s = result.in_point_s
|
||||
scene_changed = int(scene["scene_id"]) != result.scene_id
|
||||
starts_before_scene = result.in_point_s < float(scene["start_s"])
|
||||
if scene_changed or starts_before_scene or result.duration_s <= 0.12:
|
||||
adjusted_in_s = max(0.0, result.in_point_s - (cfg.cv.deep_scan.start_preroll_frames / fps))
|
||||
adjusted_in_s = max(float(scene["start_s"]), adjusted_in_s)
|
||||
scene = _scene_for_time_light(scenes, adjusted_in_s, cfg) or scene
|
||||
fps = _scene_fps_light(scene, cfg)
|
||||
|
||||
matchable_duration_s = beat.duration_s
|
||||
try:
|
||||
from src.cv.global_scan import estimate_matchable_reference_duration
|
||||
matchable_duration_s = estimate_matchable_reference_duration(beat, cfg)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / fps)
|
||||
single_scene_duration_s = max(0.0, min(beat.duration_s, float(scene["end_s"]) - adjusted_in_s) - tail_s)
|
||||
contiguous_duration_s = _contiguous_duration_light(
|
||||
beat,
|
||||
adjusted_in_s,
|
||||
scenes,
|
||||
cfg,
|
||||
matchable_duration_s,
|
||||
)
|
||||
max_duration_s = max(single_scene_duration_s, min(beat.duration_s, contiguous_duration_s))
|
||||
|
||||
normalized_result = result
|
||||
if (
|
||||
scene_changed
|
||||
or starts_before_scene
|
||||
or result.duration_s <= 0.12
|
||||
or result.out_point_s > adjusted_in_s + max_duration_s + (1.0 / fps)
|
||||
):
|
||||
normalized_result = replace(
|
||||
result,
|
||||
scene_id=int(scene["scene_id"]),
|
||||
in_point_s=adjusted_in_s,
|
||||
out_point_s=adjusted_in_s + max_duration_s,
|
||||
in_point_frame=int(adjusted_in_s * fps),
|
||||
)
|
||||
|
||||
coverage = (
|
||||
max(0.0, normalized_result.duration_s) / matchable_duration_s
|
||||
if matchable_duration_s > 0 else 0.0
|
||||
)
|
||||
if coverage < cfg.cv.deep_scan.min_duration_coverage:
|
||||
continue
|
||||
|
||||
try:
|
||||
from src.cv.content_align import align_cached_match_by_content
|
||||
_, content_score = align_cached_match_by_content(
|
||||
beat,
|
||||
normalized_result.in_point_s,
|
||||
cfg,
|
||||
search_window_s=min(0.8, cfg.cv.deep_scan.content_align_window_seconds),
|
||||
fps=12.5,
|
||||
)
|
||||
content_gate = (
|
||||
cfg.cv.deep_scan.provisional_content_threshold
|
||||
if normalized_result.is_confirmed
|
||||
else min(cfg.cv.deep_scan.provisional_content_threshold, cfg.vision.content_threshold)
|
||||
)
|
||||
if content_score < content_gate:
|
||||
continue
|
||||
if content_score < cfg.cv.deep_scan.match_threshold and normalized_result.is_confirmed:
|
||||
normalized_result = replace(
|
||||
normalized_result,
|
||||
match_score=min(normalized_result.match_score, content_score),
|
||||
is_confirmed=False,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
normalized.append(normalized_result)
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Command handlers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_transcribe_callback(cfg):
    """Build the transcription callback handed to the trailer analyzer.

    The returned closure binds ``cfg`` and forwards its four arguments to
    ``transcribe_video``. This function always returns a callable; deciding
    whether audio is disabled (and passing ``None`` instead) is left to the
    caller.

    Args:
        cfg: Project configuration forwarded to ``transcribe_video``.

    Returns:
        A callable ``(path, start_s, end_s, offset_s)`` that returns whatever
        ``transcribe_video`` returns.
    """
    # Imported lazily so the audio stack is only loaded when it is needed.
    from src.audio.transcriber import transcribe_video

    def _transcribe(path, start_s, end_s, offset_s):
        return transcribe_video(
            path,
            cfg,
            start_s=start_s,
            end_s=end_s,
            time_offset_s=offset_s,
        )

    return _transcribe
|
||||
|
||||
|
||||
def _build_classify_callback(cfg):
    """Build the beat-classification callback handed to the trailer analyzer.

    Args:
        cfg: Project configuration forwarded to ``classify_beats``.

    Returns:
        A callable ``(beats)`` that returns ``classify_beats(beats, cfg)``.
    """
    # Imported lazily so the LLM client is only loaded when it is needed.
    from src.llm.dramaturg import classify_beats

    def _classify(beats):
        return classify_beats(beats, cfg)

    return _classify
|
||||
|
||||
|
||||
def cmd_analyze(args: argparse.Namespace, cfg) -> list:
    """Analyze the reference trailer and cache the detected beats as JSON.

    Args:
        args: Parsed CLI arguments; ``no_audio`` and ``no_llm`` switch off the
            transcription and classification callbacks respectively.
        cfg: Project configuration; ``cfg.paths.cache_dir`` receives the
            ``trailer_beats.json`` cache file.

    Returns:
        The beats returned by ``analyze_reference_trailer``.
    """
    from src.pipeline.trailer_analyzer import analyze_reference_trailer

    transcribe_cb = None if args.no_audio else _build_transcribe_callback(cfg)
    classify_cb = None if args.no_llm else _build_classify_callback(cfg)

    beats = analyze_reference_trailer(
        cfg,
        transcribe_callback=transcribe_cb,
        classify_callback=classify_cb,
    )

    def _as_record(beat) -> dict:
        # Histograms are bytes; they are stored hex-encoded so they survive JSON.
        return {
            "beat_id": beat.beat_id,
            "start_s": beat.start_s,
            "end_s": beat.end_s,
            "start_frame": beat.start_frame,
            "end_frame": beat.end_frame,
            "beat_type": beat.beat_type.name,
            "dialogue": [
                {"start_s": line.start_s, "end_s": line.end_s, "text": line.text}
                for line in beat.dialogue
            ],
            "phash": beat.phash,
            "luma_hist": beat.luma_hist.hex() if beat.luma_hist else None,
            "sat_hist": beat.sat_hist.hex() if beat.sat_hist else None,
        }

    # Persist beats for downstream commands.
    beats_cache = cfg.paths.cache_dir / "trailer_beats.json"
    beats_cache.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps([_as_record(b) for b in beats], indent=2, ensure_ascii=False)
    beats_cache.write_text(payload, encoding="utf-8")

    print(f"\n\u2705 {len(beats)} beats analyzed \u2192 {beats_cache}")
    return beats
|
||||
|
||||
|
||||
def _load_beats(cfg) -> list:
    """Rebuild ``TrailerBeat`` objects from the cached ``trailer_beats.json``.

    Args:
        cfg: Project configuration; reads ``cfg.paths.cache_dir`` for the cache
            location and ``cfg.paths.reference_trailer`` for each beat's
            ``trailer_path``.

    Returns:
        One ``TrailerBeat`` per cached record, in file order.

    Raises:
        FileNotFoundError: If the cache file does not exist.
    """
    from src.core.models import BeatType, DialogueLine, TrailerBeat

    cache_file = cfg.paths.cache_dir / "trailer_beats.json"
    if not cache_file.exists():
        raise FileNotFoundError(f"No cached beats at {cache_file}. Run 'analyze' first.")

    def _decode_hist(record: dict, key: str):
        # Histograms are stored hex-encoded; a missing or empty value maps to None.
        encoded = record.get(key)
        return bytes.fromhex(encoded) if encoded else None

    def _to_beat(record: dict):
        lines = tuple(
            DialogueLine(start_s=item["start_s"], end_s=item["end_s"], text=item["text"])
            for item in record.get("dialogue", [])
        )
        return TrailerBeat(
            beat_id=record["beat_id"],
            trailer_path=cfg.paths.reference_trailer,
            start_s=record["start_s"],
            end_s=record["end_s"],
            start_frame=record["start_frame"],
            end_frame=record["end_frame"],
            beat_type=BeatType[record.get("beat_type", "UNKNOWN")],
            dialogue=lines,
            phash=record.get("phash"),
            luma_hist=_decode_hist(record, "luma_hist"),
            sat_hist=_decode_hist(record, "sat_hist"),
        )

    records = json.loads(cache_file.read_text(encoding="utf-8"))
    return [_to_beat(record) for record in records]
|
||||
|
||||
|
||||
def _select_beats(beats: list, beat_id: int | None) -> list:
|
||||
"""Return all beats or exactly one requested beat."""
|
||||
if beat_id is None:
|
||||
return beats
|
||||
selected = [b for b in beats if b.beat_id == beat_id]
|
||||
if not selected:
|
||||
raise ValueError(f"Beat {beat_id} not found. Run 'analyze' first.")
|
||||
return selected
|
||||
|
||||
|
||||
def _select_results(results: list, beat_ids: set[int] | None) -> list:
|
||||
"""Return all results or only results for the requested beats."""
|
||||
if beat_ids is None:
|
||||
return results
|
||||
return [r for r in results if r.beat_id in beat_ids]
|
||||
|
||||
|
||||
def _find_scene_for_in_point(cfg, in_point_s: float):
    """Return the indexed scene that a source in-point belongs to.

    An in-point within ``scene_boundary_epsilon_s`` of a scene's end is
    attributed to the following scene when one exists.

    Args:
        cfg: Project configuration passed to ``build_scene_index``.
        in_point_s: Source timestamp in seconds.

    Returns:
        The matching scene object, or ``None`` when no scene's
        ``[start_s, end_s)`` range contains ``in_point_s``.
    """
    from src.cv.scene_indexer import build_scene_index

    scenes = build_scene_index(cfg)
    for position, scene in enumerate(scenes):
        if not (scene.start_s <= in_point_s < scene.end_s):
            continue
        near_scene_end = (
            scene.end_s - in_point_s <= cfg.cv.deep_scan.scene_boundary_epsilon_s
        )
        has_successor = position + 1 < len(scenes)
        if near_scene_end and has_successor:
            return scenes[position + 1]
        return scene
    return None
|
||||
|
||||
|
||||
def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]:
    """Locate runs ("islands") of scoreable frames inside a trailer beat.

    The beat is sampled from offset 0 up to ``beat.duration_s`` in steps of
    ``max(0.08, span_sample_step_s)``. Consecutive scoreable samples form an
    island; a run of non-scoreable samples longer than the bridge gap closes
    it. Islands shorter than the minimum segment length are discarded.

    Args:
        beat: Trailer beat providing ``trailer_path``, ``start_s`` and
            ``duration_s``.
        cfg: Project configuration (sampling step and scoreability settings).

    Returns:
        ``(start, end)`` offsets in seconds relative to the beat start, in
        chronological order.
    """
    from src.cv.frame_extractor import grab_frame_at_path
    from src.cv.global_scan import _is_scoreable_reference_frame

    step_s = max(0.08, cfg.cv.deep_scan.span_sample_step_s)
    min_segment_s = max(0.32, step_s * 3.0)
    bridge_gap_s = max(0.18, step_s * 2.0)

    islands: list[tuple[float, float]] = []

    def _close_island(first_t: float, last_t: float) -> None:
        # The island ends one sample step after its last scoreable sample,
        # capped at the beat duration; islands that are too short are dropped.
        end_t = min(beat.duration_s, last_t + step_s)
        if end_t - first_t >= min_segment_s:
            islands.append((first_t, end_t))

    island_start: float | None = None
    island_last: float | None = None
    offset_s = 0.0
    while offset_s <= beat.duration_s:
        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s)
        if frame is not None and _is_scoreable_reference_frame(frame, cfg):
            if island_start is None:
                island_start = offset_s
            island_last = offset_s
        elif (
            island_start is not None
            and island_last is not None
            and offset_s - island_last > bridge_gap_s
        ):
            _close_island(island_start, island_last)
            island_start = None
            island_last = None
        # Rounding keeps the accumulated offset from drifting.
        offset_s = round(offset_s + step_s, 6)

    # An island still open at the end of the beat is closed here.
    if island_start is not None and island_last is not None:
        _close_island(island_start, island_last)

    return islands
|
||||
|
||||
|
||||
def _attach_visual_segments(results: list, beats: list, cfg) -> list:
    """Attach per-island ``MatchSegment`` tuples to each match result.

    For a beat with at most one scoreable island the result gets a single
    segment mirroring the result itself. For a beat with several islands the
    first island reuses the result's match and every further island is
    matched on its own through ``run_global_scan``; islands without a match
    are left out.

    Args:
        results: Match results to expand.
        beats: Trailer beats, looked up by ``beat_id``.
        cfg: Project configuration.

    Returns:
        A new list in the same order as ``results``. Results whose beat is
        unknown are passed through unchanged; all others are copies with
        ``segments`` set.
    """
    from dataclasses import replace

    from src.core.models import MatchResult, MatchSegment
    from src.cv.global_scan import run_global_scan

    def _segment(match, offset_s: float, duration_s: float, out_point_s: float):
        # Scene, in-point, score and confirmation flag come from ``match``.
        return MatchSegment(
            trailer_offset_s=offset_s,
            duration_s=duration_s,
            scene_id=match.scene_id,
            in_point_s=match.in_point_s,
            out_point_s=out_point_s,
            match_score=match.match_score,
            is_confirmed=match.is_confirmed,
        )

    beats_by_id = {b.beat_id: b for b in beats}
    expanded: list[MatchResult] = []

    for result in results:
        beat = beats_by_id.get(result.beat_id)
        if beat is None:
            expanded.append(result)
            continue

        islands = _reference_scoreable_segments(beat, cfg)
        if len(islands) <= 1:
            whole = _segment(
                result,
                0.0,
                max(0.0, result.duration_s),
                result.out_point_s,
            )
            expanded.append(replace(result, segments=(whole,)))
            continue

        # First island: reuse the beat-level match, limited to the island length.
        first_start, first_end = islands[0]
        first_duration = min(
            max(0.0, result.duration_s),
            max(0.0, first_end - first_start),
        )
        segments: list[MatchSegment] = [
            _segment(
                result,
                first_start,
                first_duration,
                result.in_point_s + first_duration,
            )
        ]

        # Remaining islands: scan each as a beat narrowed to the island's time range.
        for island_start, island_end in islands[1:]:
            island_beat = replace(
                beat,
                start_s=beat.start_s + island_start,
                end_s=beat.start_s + island_end,
            )
            island_matches = run_global_scan([island_beat], cfg, seed_in_points=None)
            if not island_matches:
                continue
            best = island_matches[0]
            island_duration = min(
                max(0.0, island_end - island_start),
                max(0.0, best.duration_s),
            )
            segments.append(
                _segment(
                    best,
                    island_start,
                    island_duration,
                    best.in_point_s + island_duration,
                )
            )

        expanded.append(replace(result, segments=tuple(segments)))

    return expanded
|
||||
|
||||
|
||||
def cmd_match(args: argparse.Namespace, cfg) -> list:
    """Match trailer beats against the source movie and save the results.

    With ``--beat`` only that beat is matched, seeded from the cached matches
    of its neighbours, and merged into the existing result cache. Without it
    every beat is matched and the cache is replaced.

    Args:
        args: Parsed CLI arguments (``force_reindex`` plus the optional
            ``beat``, ``vision`` and ``no_vision``).
        cfg: Project configuration.

    Returns:
        The results produced by this run (not the merged cache).
    """
    from dataclasses import replace

    from src.pipeline.matcher import run_matching

    # Applied in this order, so --no-vision wins if both flags are set.
    for flag_name, enabled in (("vision", True), ("no_vision", False)):
        if getattr(args, flag_name, False):
            cfg = replace(cfg, vision=replace(cfg.vision, enabled=enabled))

    requested_beat = getattr(args, "beat", None)
    targeted = requested_beat is not None

    all_beats = _load_beats(cfg)
    beats = _select_beats(all_beats, requested_beat)

    cached = []
    if _results_cache_path(cfg).exists():
        cached = _normalize_cached_results(all_beats, _load_results(cfg), cfg)

    seed_in_points = None
    if targeted:
        seed_in_points = _continuity_seed_in_points(args.beat, all_beats, cached, cfg)

    results = run_matching(
        cfg,
        beats,
        force_reindex=args.force_reindex,
        seed_in_points=seed_in_points,
    )
    results = _attach_visual_segments(results, beats, cfg)

    # A targeted one-beat match should improve the cache without deleting
    # automatic matches for other beats.
    if targeted and _results_cache_path(cfg).exists():
        merged = [entry for entry in cached if entry.beat_id != args.beat]
        for fresh in results:
            merged = _update_result(fresh, merged)
        results_to_save = merged
    else:
        results_to_save = results

    _save_results(results_to_save, cfg)

    print(f"\n✅ {len(results)} / {len(beats)} beats matched.")
    for r in results:
        print(f" Beat {r.beat_id:03d} → scene {r.scene_id:04d} "
              f"in={r.in_point_s:>8.3f}s score={r.match_score:.3f}")
    return results
|
||||
|
||||
|
||||
def _update_result(new_result, results: list) -> list:
|
||||
"""Replace or insert a MatchResult in the list (by beat_id)."""
|
||||
updated = [r for r in results if r.beat_id != new_result.beat_id]
|
||||
updated.append(new_result)
|
||||
return sorted(updated, key=lambda r: r.beat_id)
|
||||
|
||||
|
||||
def _continuity_seed_in_points(beat_id: int, beats: list, results: list, cfg) -> dict[int, list[float | tuple[float, float]]]:
|
||||
beats_by_id = {b.beat_id: b for b in beats}
|
||||
results_by_id = {r.beat_id: r for r in results}
|
||||
target = beats_by_id.get(beat_id)
|
||||
if target is None:
|
||||
return {}
|
||||
|
||||
seeds: list[tuple[float, float]] = []
|
||||
base_score = max(cfg.cv.deep_scan.coarse_candidate_threshold + 0.08, 0.92)
|
||||
prev_matches = [
|
||||
(b, results_by_id[b.beat_id])
|
||||
for b in beats
|
||||
if b.beat_id < beat_id and b.beat_id in results_by_id
|
||||
]
|
||||
if prev_matches:
|
||||
prev_beat, prev_result = max(prev_matches, key=lambda item: item[0].beat_id)
|
||||
trailer_gap_s = max(0.0, target.start_s - prev_beat.end_s)
|
||||
expected = prev_result.out_point_s + trailer_gap_s
|
||||
for offset in cfg.cv.deep_scan.continuity_seed_offsets_s:
|
||||
offset_score = max(
|
||||
cfg.cv.deep_scan.coarse_candidate_threshold,
|
||||
base_score - abs(offset) * 0.06,
|
||||
)
|
||||
seeds.append((expected + offset, offset_score))
|
||||
|
||||
next_matches = [
|
||||
(b, results_by_id[b.beat_id])
|
||||
for b in beats
|
||||
if b.beat_id > beat_id and b.beat_id in results_by_id
|
||||
]
|
||||
if next_matches:
|
||||
next_beat, next_result = min(next_matches, key=lambda item: item[0].beat_id)
|
||||
trailer_gap_s = max(0.0, next_beat.start_s - target.end_s)
|
||||
expected = next_result.in_point_s - trailer_gap_s - target.duration_s
|
||||
for offset in cfg.cv.deep_scan.continuity_seed_offsets_s:
|
||||
offset_score = max(
|
||||
cfg.cv.deep_scan.coarse_candidate_threshold,
|
||||
base_score - abs(offset) * 0.06,
|
||||
)
|
||||
seeds.append((expected - offset, offset_score))
|
||||
|
||||
unique: dict[float, float] = {}
|
||||
for seed_t, seed_score in seeds:
|
||||
rounded = round(max(0.0, seed_t), 3)
|
||||
unique[rounded] = max(unique.get(rounded, 0.0), seed_score)
|
||||
points = [(seed_t, score) for seed_t, score in sorted(unique.items())]
|
||||
return {beat_id: points} if points else {}
|
||||
|
||||
|
||||
def _refine_cached_match(args: argparse.Namespace, cfg, beat, results: list) -> None:
    """Content-align the cached match of ``beat`` and persist the refined result.

    The aligned in-point is clamped to >= 0 *before* the scene lookup, the
    out-point and the duration coverage are derived from it, so all stored
    values describe the same clip.

    Args:
        args: Parsed CLI arguments; reads ``beat`` and ``refine_window``.
        cfg: Project configuration.
        beat: The trailer beat being refined.
        results: Currently cached results; the refined one replaces the entry
            for this beat before the cache is saved.
    """
    beat_id = args.beat
    current = next((r for r in results if r.beat_id == beat_id), None)
    if current is None:
        print(f"❌ Beat {beat_id} has no cached match to refine. Run 'match --beat {beat_id}' first.")
        return

    from src.cv.content_align import align_cached_match_by_content

    aligned_in_s, sequence_score = align_cached_match_by_content(
        beat,
        current.in_point_s,
        cfg,
        search_window_s=args.refine_window,
    )
    # Clamp once so the in-point, out-point and coverage stay consistent.
    refined_in_s = max(0.0, aligned_in_s)
    usable_duration_s = max(0.0, current.out_point_s - current.in_point_s)

    scene_data = _scene_for_time_light(_load_scene_cache_light(cfg), refined_in_s, cfg)
    out_point_s = refined_in_s + usable_duration_s
    if scene_data is not None:
        # Never let the refined clip run past the end of its scene.
        out_point_s = min(out_point_s, float(scene_data["end_s"]))

    matchable_duration_s = beat.duration_s
    duration_coverage = (
        max(0.0, out_point_s - refined_in_s) / matchable_duration_s
        if matchable_duration_s > 0 else 0.0
    )
    if duration_coverage < cfg.cv.deep_scan.min_duration_coverage:
        print(
            f"❌ Beat {beat_id} refined candidate rejected: "
            f"duration coverage {duration_coverage:.0%} < "
            f"{cfg.cv.deep_scan.min_duration_coverage:.0%}"
        )
        return

    # Best effort: fall back to the export frame rate if the source cannot be probed.
    try:
        from src.cv.frame_extractor import get_video_info
        fps = float(get_video_info(cfg.paths.source_movie)["fps"]) or cfg.export.edl_frame_rate
    except Exception:
        fps = cfg.export.edl_frame_rate

    from src.core.models import MatchResult
    refined = MatchResult(
        beat_id=beat_id,
        scene_id=int(scene_data["scene_id"]) if scene_data is not None else current.scene_id,
        source_path=current.source_path,
        in_point_s=refined_in_s,
        out_point_s=out_point_s,
        in_point_frame=int(refined_in_s * fps),
        match_score=sequence_score,
        match_location=current.match_location,
        is_confirmed=sequence_score >= cfg.cv.deep_scan.match_threshold,
    )
    _save_results(_update_result(refined, results), cfg)
    print(
        f"✅ Beat {beat_id} refined → "
        f"in={refined.in_point_s:.3f}s, out={refined.out_point_s:.3f}s, "
        f"sequence_score={refined.match_score:.3f}"
    )


def cmd_rematch(args: argparse.Namespace, cfg) -> None:
    """
    Re-run automatic matching for ONE beat.

        python cli.py rematch --beat 5                   # re-scan CV for beat 5
        python cli.py rematch --beat 5 --threshold 0.40  # relax threshold

    With ``--refine`` the cached match is re-aligned by image content instead
    of being re-scanned. Problems are reported on stdout; nothing is raised
    for an unknown beat or a missing match.

    Args:
        args: Parsed CLI arguments (``beat``, ``threshold``, ``refine``,
            ``refine_window``).
        cfg: Project configuration.
    """
    beat_id = args.beat
    beats = _load_beats(cfg)
    results = _load_results(cfg) if _results_cache_path(cfg).exists() else []

    beat = next((b for b in beats if b.beat_id == beat_id), None)
    if beat is None:
        print(f"\u274c Beat {beat_id} not found. Run 'analyze' first.")
        return

    # ---- Refine an already acceptable cached match -------------------------
    if args.refine:
        _refine_cached_match(args, cfg, beat, results)
        return

    # ---- Re-run CV with optional threshold override ------------------------
    from dataclasses import replace as dc_replace

    run_cfg = cfg
    if args.threshold is not None:
        run_cfg = dc_replace(
            cfg,
            cv=dc_replace(
                cfg.cv,
                deep_scan=dc_replace(cfg.cv.deep_scan, match_threshold=args.threshold),
            ),
        )
        print(f"ℹ️ threshold overridden to {args.threshold} for beat {beat_id}")

    from src.cv.global_scan import run_global_scan

    seed_in_points = _continuity_seed_in_points(beat_id, beats, results, run_cfg)
    matches = run_global_scan([beat], run_cfg, seed_in_points=seed_in_points)

    if not matches:
        print(f"❌ Beat {beat_id}: no match. Try --threshold 0.40.")
        return

    match = matches[0]
    results = _update_result(match, results)
    # The cache is saved with the original cfg; the threshold override only
    # applies to the scan itself.
    _save_results(results, cfg)
    print(f"✅ Beat {beat_id} rematched → (in={match.in_point_s:.3f}s, score={match.match_score:.3f})")
|
||||
|
||||
|
||||
def cmd_report(args: argparse.Namespace, cfg) -> None:
    """Generate the visual comparison report from cached beats and results.

    The beat cache is loaded once and reused both for beat selection and for
    normalizing the cached results, matching how ``cmd_match`` works.

    Args:
        args: Parsed CLI arguments; an optional ``beat`` restricts the report
            to a single beat.
        cfg: Project configuration.
    """
    from src.pipeline.reporter import generate_report

    requested_beat = getattr(args, "beat", None)
    all_beats = _load_beats(cfg)
    beats = _select_beats(all_beats, requested_beat)
    beat_ids = {b.beat_id for b in beats} if requested_beat is not None else None
    results = _select_results(
        _normalize_cached_results(all_beats, _load_results(cfg), cfg),
        beat_ids,
    )

    out = generate_report(beats, results, cfg)

    # The report is still written for an unmatched beat; the user is told how
    # to produce a match for it.
    if requested_beat is not None and not results:
        print(
            f"\n⚠️ Beat {requested_beat} has no cached match yet. "
            f"Run: python cli.py match --beat {requested_beat}"
        )
    print(f"\n\u2705 Report \u2192 {out}")
|
||||
|
||||
|
||||
def cmd_export(args: argparse.Namespace, cfg) -> None:
    """Export the cached matches as FCPXML and/or EDL.

    The beat cache is loaded once and reused for selection and normalization.
    The format comes from ``--format`` or, if absent, from
    ``cfg.export.output_format``; an unrecognised value is reported instead
    of silently exporting nothing.

    Args:
        args: Parsed CLI arguments (``format`` plus the optional ``beat``).
        cfg: Project configuration.
    """
    from src.export.edl_writer import write_edl
    from src.export.fcpxml_writer import write_fcpxml
    from src.pipeline.matcher import build_timeline

    beat_id = getattr(args, "beat", None)
    all_beats = _load_beats(cfg)
    beats = _select_beats(all_beats, beat_id)
    beat_ids = {b.beat_id for b in beats} if beat_id is not None else None
    results = _select_results(
        _normalize_cached_results(all_beats, _load_results(cfg), cfg),
        beat_ids,
    )
    if beat_id is not None and not results:
        print(f"❌ Beat {beat_id} has no cached match. Run 'match --beat {beat_id}' first.")
        return

    timeline = build_timeline(beats, results, cfg)

    fmt = args.format or cfg.export.output_format
    if fmt not in ("fcpxml", "edl", "both"):
        # Only reachable via the config file; --format is validated by argparse.
        print(f"❌ Unknown export format {fmt!r}. Expected 'fcpxml', 'edl' or 'both'.")
        return

    # Single-beat exports get their own file name so they do not overwrite
    # the full timeline export.
    out_stem = (
        f"{cfg.paths.reference_trailer.stem}_beat_{beat_id:03d}"
        if beat_id is not None
        else timeline.title
    )

    if fmt in ("fcpxml", "both"):
        out = write_fcpxml(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.fcpxml")
        print(f"✅ FCPXML → {out}")

    if fmt in ("edl", "both"):
        out = write_edl(timeline, cfg, output_path=cfg.paths.output_dir / f"{out_stem}.edl")
        print(f"✅ EDL → {out}")
|
||||
|
||||
|
||||
def cmd_run(args: argparse.Namespace, cfg) -> None:
    """Full pipeline: analyze → match → report → export.

    Each stage receives the same ``args`` and ``cfg``; the stages communicate
    through the cache files they write and read.
    """
    pipeline = (cmd_analyze, cmd_match, cmd_report, cmd_export)
    for stage in pipeline:
        stage(args, cfg)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Argument parser
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="ai-trailer",
|
||||
description="AI Trailer Generator v2 — Pure CV scene matching",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config", type=Path, default=Path("config.toml"),
|
||||
metavar="CONFIG", help="Path to config.toml (default: ./config.toml)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log-level", default="INFO",
|
||||
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
help="Logging verbosity (default: INFO)",
|
||||
)
|
||||
|
||||
sub = parser.add_subparsers(dest="command", required=True)
|
||||
|
||||
# analyze
|
||||
p_analyze = sub.add_parser("analyze", help="Detect trailer beats + fingerprint")
|
||||
p_analyze.add_argument("--no-audio", action="store_true",
|
||||
help="Skip Whisper (only affects beat labels, not matching)")
|
||||
p_analyze.add_argument("--no-llm", action="store_true",
|
||||
help="Skip LLM classification (only affects beat labels)")
|
||||
|
||||
# match
|
||||
p_match = sub.add_parser("match", help="Run 2-phase CV matching")
|
||||
p_match.add_argument("--force-reindex", action="store_true",
|
||||
help="Ignore scene cache and re-run PySceneDetect")
|
||||
p_match.add_argument("--beat", type=int,
|
||||
help="Match only one beat and merge it into the cached results")
|
||||
p_match.add_argument("--vision", action="store_true",
|
||||
help="Enable cached vision descriptions for extra automatic search seeds")
|
||||
p_match.add_argument("--no-vision", action="store_true",
|
||||
help="Disable vision seeding even if [vision].enabled is true")
|
||||
|
||||
# rematch
|
||||
p_rematch = sub.add_parser("rematch", help="Re-run or override matching for one beat")
|
||||
p_rematch.add_argument("--beat", type=int, required=True, help="Beat ID to rematch")
|
||||
p_rematch.add_argument("--threshold", type=float, default=None, help="Override match_threshold")
|
||||
p_rematch.add_argument("--refine", action="store_true",
|
||||
help="Refine the cached match by measuring a local image-content offset")
|
||||
p_rematch.add_argument("--refine-window", type=float, default=None,
|
||||
help="Seconds to search around the cached in-point when using --refine")
|
||||
|
||||
# report
|
||||
p_report = sub.add_parser("report", help="Generate HTML visual comparison report")
|
||||
p_report.add_argument("--beat", type=int, help="Report only one beat")
|
||||
|
||||
# export
|
||||
p_export = sub.add_parser("export", help="Export timeline from cached results")
|
||||
p_export.add_argument("--format", choices=["fcpxml", "edl", "both"],
|
||||
help="Override [export] output_format from config")
|
||||
p_export.add_argument("--beat", type=int, help="Export only one beat")
|
||||
|
||||
# run
|
||||
p_run = sub.add_parser("run", help="Full pipeline: analyze → match → export")
|
||||
p_run.add_argument("--no-audio", action="store_true")
|
||||
p_run.add_argument("--no-llm", action="store_true")
|
||||
p_run.add_argument("--force-reindex", action="store_true")
|
||||
p_run.add_argument("--vision", action="store_true")
|
||||
p_run.add_argument("--no-vision", action="store_true")
|
||||
p_run.add_argument("--format", choices=["fcpxml", "edl", "both"])
|
||||
p_run.add_argument("--beat", type=int,
|
||||
help="Run match/report/export for only one cached beat")
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse arguments, load the config and run the command."""
    _ensure_utf8_console()
    args = _build_parser().parse_args()

    _setup_logging(args.log_level)

    # Imported after logging is configured.
    from src.core.config import load_config

    cfg = load_config(args.config)

    # Maps each sub-command name to its handler; argparse guarantees that
    # args.command is one of the registered sub-commands.
    handlers = {
        "analyze": cmd_analyze,
        "match": cmd_match,
        "rematch": cmd_rematch,
        "report": cmd_report,
        "export": cmd_export,
        "run": cmd_run,
    }
    handlers[args.command](args, cfg)


if __name__ == "__main__":
    main()
|
||||
+198
@@ -0,0 +1,198 @@
|
||||
# =============================================================================
|
||||
# AI Trailer Generator v2 — Central Configuration
|
||||
# =============================================================================
|
||||
# All tunable parameters, thresholds, and file paths are defined here.
|
||||
# NO hardcoded values are allowed in the Python source code.
|
||||
# =============================================================================
|
||||
|
||||
[project]
|
||||
name = "AI Trailer Generator v2"
|
||||
version = "2.0.0"
|
||||
log_level = "INFO" # DEBUG | INFO | WARNING | ERROR
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# [paths] — External video sources (read-only access)
|
||||
# -----------------------------------------------------------------------------
|
||||
[paths]
|
||||
source_movie = "B:/Proxy/BehindTheRedDoor_FTR_1080P_2398_Fixed.mp4"
|
||||
reference_trailer = "F:/Encodings/BehindTheRedDoor_Trailer_REFERENCE.mp4"
|
||||
|
||||
# Output destinations (inside project sandbox)
|
||||
output_dir = "output"
|
||||
cache_dir = ".cache"
|
||||
proxy_dir = "proxy"
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# [video] — Decode / proxy settings
|
||||
# -----------------------------------------------------------------------------
|
||||
[video]
|
||||
# Target FPS for internal frame extraction (0 = use source FPS)
|
||||
extract_fps = 1.0
|
||||
# Proxy resolution for template matching (width x height)
|
||||
proxy_width = 640
|
||||
proxy_height = 360
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# [cv] — Computer Vision engine parameters
|
||||
# Phase 1 — "Vibe Check" (histogram / perceptual hash scene-level filter)
|
||||
# Phase 2 — "Deep Scan" (template matching frame-level precision)
|
||||
# -----------------------------------------------------------------------------
|
||||
[cv]
|
||||
|
||||
[cv.vibe_check]
|
||||
# Number of top candidate scenes to forward to Deep Scan
|
||||
top_k_candidates = 100
|
||||
|
||||
# Histogram comparison method:
|
||||
# CORREL=0 | CHISQR=1 | INTERSECT=2 | BHATTACHARYYA=3
|
||||
hist_compare_method = 0
|
||||
|
||||
# Histogram bins per channel (hue, saturation)
|
||||
hist_bins_hue = 50
|
||||
hist_bins_saturation = 60
|
||||
|
||||
# pHash similarity threshold (lower = stricter; 0–64 range)
|
||||
# NOTE: 12 is for near-duplicate detection. Cross-video matching
|
||||
# (trailer vs source movie with different grading/compression)
|
||||
# needs 25–35. Start at 32 and tighten if you get false positives.
|
||||
phash_max_distance = 32
|
||||
|
||||
# ---- Text-Safe Crop -------------------------------------------------------
|
||||
# Fraction of frame height to EXCLUDE from the top (e.g. logos, title cards)
|
||||
crop_top_fraction = 0.15
|
||||
# Fraction of frame height to EXCLUDE from the bottom (e.g. letterbox, subs)
|
||||
crop_bottom_fraction = 0.30
|
||||
|
||||
[cv.deep_scan]
|
||||
# Step size in SECONDS between sampled frames during the coarse scan pass
|
||||
coarse_step_seconds = 0.5
|
||||
|
||||
# Minimum template match score (0.0–1.0) to accept a candidate as a hit
|
||||
match_threshold = 0.65
|
||||
|
||||
# Store/report lower-confidence automatic candidates for visual review instead
|
||||
# of dropping them as "NO MATCH". Confirmed exports can still use match_threshold.
|
||||
provisional_match_threshold = 0.45
|
||||
|
||||
# Lower gate for entering temporal multi-frame refinement. The final decision
|
||||
# still uses sequence/span scoring; this only avoids rejecting real matches
|
||||
# because one midpoint frame is weak.
|
||||
coarse_candidate_threshold = 0.50
|
||||
|
||||
# Candidate ranking weights. Duration coverage matters when the same visual
|
||||
# shot appears multiple times: prefer the occurrence that can cover the beat.
|
||||
sequence_score_weight = 0.55
|
||||
span_score_weight = 0.15
|
||||
coarse_score_weight = 0.10
|
||||
duration_score_weight = 0.20
|
||||
duration_tie_break_score_delta = 0.03
|
||||
min_duration_coverage = 0.65
|
||||
continuity_seed_offsets_s = [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0]
|
||||
scene_seed_top_k = 30
|
||||
scene_seed_points_per_scene = 6
|
||||
content_rerank_candidate_count = 100
|
||||
skip_coarse_scan_with_weighted_seeds = false
|
||||
|
||||
# cv2.matchTemplate method:
|
||||
# TM_CCOEFF_NORMED=5 (recommended), TM_CCORR_NORMED=3
|
||||
match_method = 5
|
||||
|
||||
# If a coarse hit is found, refine by scanning ± this many seconds
|
||||
refine_window_seconds = 0.6
|
||||
refine_step_seconds = 0.04 # ≈ 1 frame at 25 fps
|
||||
content_align_window_seconds = 0.48
|
||||
content_align_sample_step_s = 0.28
|
||||
content_validation_weight = 0.35
|
||||
provisional_content_threshold = 0.42
|
||||
|
||||
# When several adjacent frame offsets score almost the same, prefer the earlier
|
||||
# one. This avoids matches that are visually correct but start a few frames late.
|
||||
start_tie_break_score_delta = 0.015
|
||||
start_preroll_frames = 0
|
||||
|
||||
# Automatic temporal verification after a coarse image hit.
|
||||
# More candidates reduce false positives from visually similar shots.
|
||||
sequence_candidate_count = 240
|
||||
sequence_min_distance_s = 1.0
|
||||
max_refine_candidates = 6
|
||||
|
||||
# Match-span detection: trim when the source starts drifting into a different shot.
|
||||
span_sample_step_s = 0.08
|
||||
trim_tail_frames = 4
|
||||
|
||||
# If a refined in-point lands this close to a detected scene end, treat it as
|
||||
# the next scene. Scene detectors often place cuts a frame or two around the
|
||||
# visible boundary.
|
||||
scene_boundary_epsilon_s = 0.12
|
||||
scoreable_luma_mean_min = 24.0
|
||||
scoreable_luma_p90_min = 58.0
|
||||
scoreable_contrast_min = 24.0
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# [scene_detection] — PySceneDetect parameters (used to segment source movie)
|
||||
# -----------------------------------------------------------------------------
|
||||
[scene_detection]
|
||||
# Threshold for ContentDetector (lower = more sensitive)
|
||||
content_threshold = 27.0
|
||||
# Minimum scene duration in seconds
|
||||
min_scene_duration_s = 1.5
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# [whisper] — Dialogue / audio analysis
|
||||
# -----------------------------------------------------------------------------
|
||||
[whisper]
|
||||
model = "large-v3"
|
||||
language = "ar"
|
||||
device = "cuda" # cuda | cpu
|
||||
compute_type = "float16" # float16 | int8 | float32
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# [llm] — Used ONLY for thematic segmentation / dramaturgy
|
||||
# -----------------------------------------------------------------------------
|
||||
[llm]
|
||||
provider = "openrouter"
|
||||
base_url = "https://openrouter.ai/api/v1"
|
||||
model = "google/gemma-4-31b-it"
|
||||
timeout_seconds = 120
|
||||
temperature = 0.3
|
||||
max_tokens = 4096
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# [vision] — Optional cached visual descriptions for ambiguous matching
|
||||
# -----------------------------------------------------------------------------
|
||||
[vision]
|
||||
# Disabled by default to avoid surprise API cost. Enable when you want the
|
||||
# matcher to ask a vision-capable model for cached 3-frame scene descriptions.
|
||||
enabled = false
|
||||
provider = "openrouter"
|
||||
base_url = "https://openrouter.ai/api/v1"
|
||||
model = "google/gemma-4-31b-it"
|
||||
timeout_seconds = 90
|
||||
temperature = 0.0
|
||||
max_tokens = 350
|
||||
|
||||
# Cost controls: per beat, only the top scene-level candidates are described,
|
||||
# and cached descriptions in .cache/vision_descriptions.json are reused.
|
||||
scene_candidate_top_k = 8
|
||||
max_new_descriptions_per_run = 12
|
||||
max_seed_scenes = 3
|
||||
seed_points_per_scene = 12
|
||||
seed_score = 0.88
|
||||
max_refine_candidates = 6
|
||||
local_scan_step_s = 0.12
|
||||
local_scan_max_points_per_scene = 180
|
||||
local_scan_top_candidates = 18
|
||||
local_scan_tie_break_score_delta = 0.08
|
||||
multi_shot_cut_corr_threshold = 0.20
|
||||
multi_shot_boundary_tolerance_s = 0.20
|
||||
fullscan_fallback = false
|
||||
content_threshold = 0.22
|
||||
similarity_threshold = 0.18
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# [export] — FCPXML / EDL export settings
|
||||
# -----------------------------------------------------------------------------
|
||||
[export]
|
||||
fcpxml_version = "1.10"
|
||||
edl_frame_rate = 23.976 # fps used in EDL timecode generation
|
||||
output_format = "fcpxml" # fcpxml | edl | both
|
||||
@@ -0,0 +1,68 @@
|
||||
# PEP 517 build configuration.
# The backend must be an importable object path. setuptools ships its backend
# as "setuptools.build_meta"; there is no "setuptools.backends.legacy" module,
# so the previous value made `pip install .` / `python -m build` fail.
[build-system]
requires = ["setuptools>=69", "wheel"]
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "ai-trailer-2026"
|
||||
version = "2.0.0"
|
||||
description = "Frame-accurate trailer reconstruction via pure Computer Vision"
|
||||
requires-python = ">=3.11"
|
||||
|
||||
dependencies = [
|
||||
# Computer Vision
|
||||
"opencv-python>=4.9",
|
||||
"imagehash>=4.3",
|
||||
"numpy>=1.26",
|
||||
"Pillow>=10.0",
|
||||
|
||||
# Scene detection
|
||||
"scenedetect[opencv]>=0.6",
|
||||
|
||||
# Audio / transcription
|
||||
"faster-whisper>=1.0",
|
||||
|
||||
# Config / secrets
|
||||
# tomllib — built-in stdlib (Python 3.11+), no install needed
|
||||
"python-dotenv>=1.0", # loads .env into os.environ
|
||||
|
||||
# Export
|
||||
"lxml>=5.0", # FCPXML generation
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"pytest>=8.0",
|
||||
"pytest-cov",
|
||||
"mypy>=1.9",
|
||||
"ruff>=0.4",
|
||||
]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["."]
|
||||
include = ["src*"]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Ruff (linter + formatter)
|
||||
# ---------------------------------------------------------------------------
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
target-version = "py311"
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = ["E", "F", "I", "UP", "B", "C4", "ANN"]
|
||||
ignore = ["ANN101", "ANN102"]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mypy
|
||||
# ---------------------------------------------------------------------------
|
||||
[tool.mypy]
|
||||
python_version = "3.11"
|
||||
strict = true
|
||||
ignore_missing_imports = true
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pytest
|
||||
# ---------------------------------------------------------------------------
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
addopts = "-v --tb=short"
|
||||
@@ -0,0 +1,37 @@
|
||||
# AI Trailer Generator v2 — Python Dependencies
|
||||
# Generated from: pip freeze (Python 3.11, Windows)
|
||||
# Install with: pip install -r requirements.txt
|
||||
#
|
||||
# NOTE: faster-whisper and scenedetect may pull in torch/cuda extras
|
||||
# depending on your platform. See README for CUDA setup.
|
||||
|
||||
# Computer Vision
|
||||
opencv-python>=4.9
|
||||
numpy>=1.26
|
||||
Pillow>=10.0
|
||||
ImageHash>=4.3
|
||||
PyWavelets>=1.6 # required by ImageHash
|
||||
|
||||
# Video scene detection
|
||||
scenedetect[opencv]>=0.6
|
||||
|
||||
# Audio transcription
|
||||
# faster-whisper>=1.0 ← uncomment when ready to use Whisper
|
||||
# (pulls in torch; large download)
|
||||
|
||||
# Config & secrets
|
||||
python-dotenv>=1.0 # loads .env into os.environ
|
||||
# tomllib — stdlib in Python 3.11+, no install needed
|
||||
|
||||
# XML export
|
||||
# lxml>=5.0 ← optional: only needed for advanced FCPXML features
|
||||
# stdlib xml.etree.ElementTree is used by default
|
||||
|
||||
# HTTP (LLM calls via urllib.request — no extra dep needed)
|
||||
# requests ← not used; stdlib urllib is sufficient
|
||||
|
||||
# Dev / testing
|
||||
pytest>=8.0
|
||||
pytest-cov
|
||||
# mypy>=1.9
|
||||
# ruff>=0.4
|
||||
@@ -0,0 +1,89 @@
|
||||
# setup_venv.ps1 — AI Trailer Generator v2 — Virtual Environment Setup
|
||||
# Run once: .\setup_venv.ps1
|
||||
# -----------------------------------------------------------------------
|
||||
# If blocked by ExecutionPolicy:
|
||||
# Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
$VENV_DIR = ".venv"
|
||||
|
||||
# Returns the path of the Python interpreter used to create the venv.
# Lookup order: "python" as resolved by Get-Command, then two well-known
# per-user install locations. Throws when none of them exists.
function Resolve-ProjectPython {
    $onPath = Get-Command python -ErrorAction SilentlyContinue
    if ($onPath) { return $onPath.Source }

    $fallbacks = @(
        "$env:LOCALAPPDATA\Programs\Python\Python311\python.exe",
        "$env:LOCALAPPDATA\Microsoft\WindowsApps\python.exe"
    )

    # First fallback path that actually exists on disk, if any.
    $found = $fallbacks |
        Where-Object { $_ -and (Test-Path $_) } |
        Select-Object -First 1
    if ($found) { return $found }

    throw "Python 3.11+ not found. Install Python 3.11+ or add it to PATH."
}
|
||||
|
||||
# Aborts the script when the most recent native command exited non-zero.
# $ErrorActionPreference = "Stop" only affects PowerShell errors; it does not
# react to the exit status of external programs such as python.exe or pip, so
# without this check a failed install would still end with "Setup complete!".
function Assert-LastExitCode {
    param([string]$Step)
    if ($LASTEXITCODE -ne 0) {
        throw "$Step failed (exit code $LASTEXITCODE)."
    }
}

Write-Host ""
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host "  AI Trailer Generator v2 — venv Setup" -ForegroundColor Cyan
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host ""

# ---- 1. Check Python version ------------------------------------------------
$PROJECT_PYTHON = Resolve-ProjectPython
$pythonVersion = & $PROJECT_PYTHON --version 2>&1
Write-Host "Python: $pythonVersion"
# Accepts 3.11 .. 3.99 in the "Python X.Y.Z" banner.
if ($pythonVersion -notmatch "3\.(1[1-9]|[2-9]\d)") {
    Write-Error "Python 3.11+ required. Found: $pythonVersion"
    exit 1
}

# ---- 2. Create venv ---------------------------------------------------------
if (Test-Path $VENV_DIR) {
    Write-Host "Virtual environment already exists at '$VENV_DIR'. Skipping creation." -ForegroundColor Yellow
} else {
    Write-Host "Creating virtual environment in '$VENV_DIR' ..." -ForegroundColor Green
    & $PROJECT_PYTHON -m venv $VENV_DIR
    Assert-LastExitCode "Creating the virtual environment"
    Write-Host "Done." -ForegroundColor Green
}

# ---- 3. Activate venv -------------------------------------------------------
$activate = Join-Path $VENV_DIR "Scripts\Activate.ps1"
Write-Host "Activating virtual environment ..."
. $activate
# All pip calls below go through the venv interpreter explicitly, so they do
# not depend on the activation having changed PATH.
$VENV_PYTHON = Join-Path $VENV_DIR "Scripts\python.exe"

# ---- 4. Upgrade pip ---------------------------------------------------------
Write-Host "Upgrading pip ..." -ForegroundColor Green
& $VENV_PYTHON -m pip install --upgrade pip --quiet
Assert-LastExitCode "Upgrading pip"

# ---- 5. Install dependencies ------------------------------------------------
Write-Host "Installing dependencies from requirements.txt ..." -ForegroundColor Green
& $VENV_PYTHON -m pip install -r requirements.txt
Assert-LastExitCode "Installing dependencies"

# ---- 6. Copy .env if missing ------------------------------------------------
# Never overwrites an existing .env; only seeds it from the template.
if (-not (Test-Path ".env")) {
    if (Test-Path ".env.example") {
        Copy-Item ".env.example" ".env"
        Write-Host ""
        Write-Host "  .env created from .env.example." -ForegroundColor Yellow
        Write-Host "  >>> Open .env and fill in your OPENROUTER_API_KEY! <<<" -ForegroundColor Red
    }
}

# ---- 7. Done ----------------------------------------------------------------
Write-Host ""
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host "  Setup complete!" -ForegroundColor Green
Write-Host ""
Write-Host "  Activate the venv with:"
Write-Host "    .\.venv\Scripts\Activate.ps1" -ForegroundColor White
Write-Host ""
Write-Host "  Then run the pipeline:"
Write-Host "    python cli.py run --no-audio --no-llm" -ForegroundColor White
Write-Host "==================================================" -ForegroundColor Cyan
Write-Host ""
|
||||
@@ -0,0 +1 @@
|
||||
# src package
|
||||
@@ -0,0 +1 @@
|
||||
# src.audio package — Whisper / dialogue analysis
|
||||
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
src/audio/transcriber.py — Whisper transcription via faster-whisper
|
||||
|
||||
Responsibility:
|
||||
- Transcribe audio from a video file into a list of DialogueLine objects
|
||||
- Optionally restrict to a time window [start_s, end_s] (for single beats)
|
||||
- All model config (model name, device, compute_type) comes from AppConfig
|
||||
|
||||
The LLM is NOT used here. This is pure audio-to-text.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Sequence
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import DialogueLine
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Audio extraction helper (video → wav via ffmpeg)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _extract_audio_segment(
    video_path: Path,
    start_s: float | None,
    end_s: float | None,
    out_wav: Path,
) -> None:
    """
    Run ffmpeg as a subprocess to write a mono 16 kHz WAV taken from *video_path*.

    Args:
        video_path: Video file to read.
        start_s:    Window start in seconds; None starts at the beginning.
        end_s:      Window end in seconds; None runs to the end of the file.
        out_wav:    Path of the WAV file to create.

    Raises:
        RuntimeError: When ffmpeg returns a non-zero exit code; the message
                      carries ffmpeg's stderr output.
    """
    import subprocess

    # Trim flags are placed before "-i". With a start time the window is
    # expressed as a duration ("-t"); without one, as an end position ("-to").
    if start_s is None:
        window = [] if end_s is None else ["-to", str(end_s)]
    else:
        window = ["-ss", str(start_s)]
        if end_s is not None:
            window += ["-t", str(end_s - start_s)]

    argv = [
        "ffmpeg", "-y", "-loglevel", "error",
        *window,
        "-i", str(video_path),
        "-vn",              # drop the video stream
        "-ac", "1",         # single channel
        "-ar", "16000",     # 16 kHz sample rate (Whisper's native rate)
        "-f", "wav",
        str(out_wav),
    ]

    proc = subprocess.run(argv, capture_output=True)
    if proc.returncode == 0:
        return

    raise RuntimeError(
        f"ffmpeg failed (code {proc.returncode}):\n"
        f"{proc.stderr.decode(errors='replace')}"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core transcription
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def transcribe_video(
    video_path: Path,
    cfg: AppConfig,
    start_s: float | None = None,
    end_s: float | None = None,
    time_offset_s: float = 0.0,
) -> list[DialogueLine]:
    """
    Transcribe dialogue from *video_path* using faster-whisper.

    The audio window is first extracted to a temporary WAV via
    _extract_audio_segment(), then fed to a WhisperModel built from
    ``cfg.whisper`` (model, device, compute_type, language).

    Args:
        video_path:     Path to source or trailer video.
        cfg:            Application configuration (whisper section is used).
        start_s:        Clip start in video-file seconds (None = beginning).
        end_s:          Clip end in video-file seconds (None = end of file).
        time_offset_s:  Added to every transcript timestamp so that beat-level
                        transcripts align with absolute movie time.

    Returns:
        List of DialogueLine in the order faster-whisper produced them.

    Raises:
        ImportError:  If faster-whisper is not installed.
        RuntimeError: If the ffmpeg audio extraction fails.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as exc:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "faster-whisper not installed. Run: pip install faster-whisper"
        ) from exc

    w = cfg.whisper

    logger.info(
        "Transcribing %s [%.1f–%s] with %s on %s …",
        video_path.name,
        start_s or 0.0,
        # Explicit None test: an end of 0.0 is a real value, not "end of file".
        f"{end_s:.1f}s" if end_s is not None else "end",
        w.model,
        w.device,
    )

    with tempfile.TemporaryDirectory() as tmp:
        wav = Path(tmp) / "audio.wav"
        _extract_audio_segment(video_path, start_s, end_s, wav)

        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
        segments, _ = model.transcribe(
            str(wav),
            # An empty language string means "let the model detect it".
            language=w.language or None,
            beam_size=5,
        )

        # faster-whisper returns the segments as a lazy iterable, so consume
        # it here while the temporary WAV is still on disk.
        lines: list[DialogueLine] = [
            DialogueLine(
                start_s=seg.start + time_offset_s,
                end_s=seg.end + time_offset_s,
                text=seg.text.strip(),
            )
            for seg in segments
        ]

    logger.info("Transcription done: %d segments.", len(lines))
    return lines
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Convenience: transcribe a whole file and return grouped by scene
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def transcribe_full_movie(
    cfg: AppConfig,
) -> list[DialogueLine]:
    """
    Transcribe the whole source movie configured in ``cfg.paths.source_movie``.

    Thin convenience wrapper around transcribe_video() with no time window and
    no timestamp offset. The result is intended for enriching Scenes (per the
    original note: via a dialogue_callback passed to build_scene_index()).
    """
    movie_path = cfg.paths.source_movie
    return transcribe_video(movie_path, cfg)
|
||||
|
||||
|
||||
def assign_dialogue_to_scenes(
    all_dialogue: Sequence[DialogueLine],
    scenes: list["src.core.models.Scene"],  # type: ignore[name-defined]
) -> list["src.core.models.Scene"]:  # type: ignore[name-defined]
    """
    Attach each pre-transcribed DialogueLine to the scene that contains it.

    A line belongs to a scene when the midpoint of the line falls inside the
    half-open window ``[scene.start_s, scene.end_s)``.

    Args:
        all_dialogue: Flat transcript of the whole movie.
        scenes:       Scenes to enrich; the inputs themselves are not modified.

    Returns:
        A new list with one copy per input scene, in the same order, whose
        ``dialogue`` tuple holds the matching lines.
    """
    from dataclasses import replace
    from src.core.models import Scene

    def _midpoint(entry: DialogueLine) -> float:
        return (entry.start_s + entry.end_s) / 2.0

    enriched: list[Scene] = [
        replace(
            current,
            dialogue=tuple(
                entry
                for entry in all_dialogue
                if current.start_s <= _midpoint(entry) < current.end_s
            ),
        )
        for current in scenes
    ]

    total_assigned = sum(len(item.dialogue) for item in enriched)
    logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched))
    return enriched
|
||||
@@ -0,0 +1 @@
|
||||
# src.core package
|
||||
@@ -0,0 +1,387 @@
|
||||
"""
|
||||
src/core/config.py — Configuration loader for AI Trailer Generator v2
|
||||
|
||||
Loads config.toml and exposes typed, nested dataclasses.
|
||||
All CV thresholds, paths, and model settings are sourced exclusively here.
|
||||
API keys are NEVER stored in config.toml; they are loaded from .env.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import tomllib
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv as _load_dotenv
|
||||
_HAS_DOTENV = True
|
||||
except ImportError: # dotenv optional — falls back to existing env vars
|
||||
_HAS_DOTENV = False
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Leaf sections
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass(frozen=True)
class PathsConfig:
    """Filesystem locations taken from the ``[paths]`` table of config.toml.

    load_config() turns every entry into a ``Path``: relative entries are
    resolved against the directory containing the config file, absolute
    entries are kept unchanged.
    """
    source_movie: Path  # the source movie video file
    reference_trailer: Path  # the reference trailer video file
    output_dir: Path  # presumably where exports are written — confirm against the exporters
    cache_dir: Path  # presumably holds cached intermediate results — confirm
    proxy_dir: Path  # presumably holds proxy renders — confirm
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VideoConfig:
    """Values of the ``[video]`` table; all three keys are required.

    load_config() coerces them with float()/int().
    """
    extract_fps: float  # NOTE(review): looks like the frame-sampling rate — confirm with the extractor
    proxy_width: int  # proxy frame width, presumably in pixels — confirm
    proxy_height: int  # proxy frame height, presumably in pixels — confirm
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VibeCheckConfig:
    """Values of the ``[cv.vibe_check]`` table; every key is required.

    load_config() coerces each value with int()/float().
    """
    top_k_candidates: int
    hist_compare_method: int  # presumably an OpenCV cv2.HISTCMP_* code — confirm
    hist_bins_hue: int
    hist_bins_saturation: int
    phash_max_distance: int  # presumably a Hamming-distance cutoff between perceptual hashes — confirm
    crop_top_fraction: float  # NOTE(review): looks like a share of frame height cropped at the top — confirm
    crop_bottom_fraction: float  # NOTE(review): same for the bottom edge — confirm
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DeepScanConfig:
    """Values of the ``[cv.deep_scan]`` table.

    Only four keys are mandatory in config.toml (marked "required" below);
    every other key is optional and load_config() substitutes the default
    noted next to the field. All values are coerced with int()/float()/bool().
    """
    coarse_step_seconds: float  # required
    match_threshold: float  # required
    provisional_match_threshold: float  # optional, default 0.45
    coarse_candidate_threshold: float  # optional, defaults to match_threshold
    sequence_score_weight: float  # optional, default 0.55
    span_score_weight: float  # optional, default 0.15
    coarse_score_weight: float  # optional, default 0.10
    duration_score_weight: float  # optional, default 0.20
    duration_tie_break_score_delta: float  # optional, default 0.03
    min_duration_coverage: float  # optional, default 0.65
    continuity_seed_offsets_s: tuple[float, ...]  # optional, default (-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0)
    scene_seed_top_k: int  # optional, default 30
    scene_seed_points_per_scene: int  # optional, default 6
    content_rerank_candidate_count: int  # optional, default 100
    skip_coarse_scan_with_weighted_seeds: bool  # optional, default False
    max_refine_candidates: int  # optional, default 6
    match_method: int  # required; presumably a cv2.TM_* code (cf. MatchMethod in models.py) — confirm
    refine_window_seconds: float  # optional, default 0.6
    refine_step_seconds: float  # required
    content_align_window_seconds: float  # optional, default 0.48
    content_align_sample_step_s: float  # optional, default 0.28
    content_validation_weight: float  # optional, default 0.35
    provisional_content_threshold: float  # optional, default 0.42
    start_tie_break_score_delta: float  # optional, default 0.015
    start_preroll_frames: int  # optional, default 0
    sequence_candidate_count: int  # optional, default 240
    sequence_min_distance_s: float  # optional, default 1.0
    span_sample_step_s: float  # optional, default 0.08
    trim_tail_frames: int  # optional, default 2
    scene_boundary_epsilon_s: float  # optional, default 0.12
    scoreable_luma_mean_min: float  # optional, default 24.0
    scoreable_luma_p90_min: float  # optional, default 58.0
    scoreable_contrast_min: float  # optional, default 24.0
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CVConfig:
    """Container for the two sub-tables of ``[cv]`` in config.toml."""
    vibe_check: VibeCheckConfig  # built from [cv.vibe_check]
    deep_scan: DeepScanConfig  # built from [cv.deep_scan]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SceneDetectionConfig:
    """Values of the ``[scene_detection]`` table; both keys are required.

    load_config() coerces both values with float().
    """
    content_threshold: float  # presumably the scene-cut sensitivity handed to the detector — confirm
    min_scene_duration_s: float  # minimum scene length in seconds (per the field name)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class WhisperConfig:
    """Values of the ``[whisper]`` table; all keys are required.

    load_config() stores the raw TOML values without coercion or validation,
    so the Literal annotations are not enforced at load time.
    """
    model: str  # model name handed to faster-whisper's WhisperModel
    language: str  # an empty string is passed on as None by transcribe_video()
    device: Literal["cuda", "cpu"]
    compute_type: Literal["float16", "int8", "float32"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class LLMConfig:
    """Values of the ``[llm]`` table plus the API key resolved from the environment.

    All TOML keys are required. The key lookup in load_config() is:
    OPENROUTER_API_KEY for provider "openrouter", OPENAI_API_KEY for provider
    "openai", and LLM_API_KEY as the fallback for every provider.
    """
    provider: Literal["ollama", "openai", "openrouter"]  # not validated at load time
    base_url: str
    model: str
    timeout_seconds: int
    temperature: float
    max_tokens: int
    # Loaded from .env — NEVER committed to version control
    api_key: str = ""  # empty string when no matching environment variable is set
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VisionConfig:
    """Values of the optional ``[vision]`` table (cached visual scene descriptions).

    The whole table may be absent from config.toml; load_config() then uses the
    defaults noted below. The feature is disabled by default to avoid
    surprise API cost (see the comment in config.toml).
    """
    enabled: bool  # default False
    # Default: the [llm] provider when it is "openai"/"openrouter", otherwise "openrouter".
    provider: Literal["openai", "openrouter"]
    base_url: str  # defaults to the [llm] base_url
    model: str  # defaults to the [llm] model
    timeout_seconds: int  # defaults to the [llm] timeout_seconds
    temperature: float  # default 0.0
    max_tokens: int  # default 350
    # Cost controls (per config.toml): only the top scene-level candidates per
    # beat are described, and cached descriptions are reused.
    scene_candidate_top_k: int  # default 8
    max_new_descriptions_per_run: int  # default 12
    max_seed_scenes: int  # default 3
    seed_points_per_scene: int  # default 12
    seed_score: float  # default 0.88
    max_refine_candidates: int  # default 6
    local_scan_step_s: float  # default 0.12
    local_scan_max_points_per_scene: int  # default 180
    local_scan_top_candidates: int  # default 18
    local_scan_tie_break_score_delta: float  # default 0.08
    multi_shot_cut_corr_threshold: float  # default 0.20
    multi_shot_boundary_tolerance_s: float  # default 0.20
    fullscan_fallback: bool  # default False
    content_threshold: float  # default 0.22
    similarity_threshold: float  # default 0.18
    # Resolved by load_config(): OPENROUTER_API_KEY (provider "openrouter") or
    # OPENAI_API_KEY (any other provider), then VISION_API_KEY, then LLM_API_KEY.
    api_key: str = ""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExportConfig:
    """Values of the ``[export]`` table (FCPXML / EDL export settings); all keys are required."""
    fcpxml_version: str  # coerced with str(); e.g. "1.10" in the shipped config
    edl_frame_rate: float  # fps used in EDL timecode generation
    output_format: Literal["fcpxml", "edl", "both"]  # stored as-is, not validated at load time
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Root config — single object passed through the entire application
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass(frozen=True)
class AppConfig:
    """Root configuration object returned by load_config().

    Bundles the ``[project]`` scalars with one frozen sub-config per
    config.toml section.
    """
    # From the [project] table; stored without coercion or validation.
    project_name: str  # [project].name
    version: str  # [project].version
    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"]  # [project].log_level

    paths: PathsConfig
    video: VideoConfig
    cv: CVConfig
    scene_detection: SceneDetectionConfig
    whisper: WhisperConfig
    llm: LLMConfig
    vision: VisionConfig  # built from defaults when [vision] is absent
    export: ExportConfig
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Loader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_DEFAULT_CONFIG_PATH = Path(__file__).parents[2] / "config.toml"
|
||||
_DEFAULT_ENV_PATH = Path(__file__).parents[2] / ".env"
|
||||
|
||||
|
||||
def load_config(
    config_path: Path = _DEFAULT_CONFIG_PATH,
    env_path: Path = _DEFAULT_ENV_PATH,
) -> AppConfig:
    """
    Read config.toml and assemble the frozen, fully-typed AppConfig.

    Secrets never come from the TOML file: API keys are taken from the process
    environment, which is first topped up from the .env file when python-dotenv
    is installed (already-set variables are not overridden).

    Args:
        config_path: Location of the TOML file.
                     Defaults to <project_root>/config.toml.
        env_path:    Location of the .env file.
                     Defaults to <project_root>/.env.

    Returns:
        The populated AppConfig.

    Raises:
        FileNotFoundError: If *config_path* does not exist.
        KeyError: If a required table or key is missing.
        TypeError / ValueError: If a value cannot be coerced to its target type.
    """
    # .env must be loaded before any os.environ lookup further down.
    if _HAS_DOTENV:
        _load_dotenv(dotenv_path=env_path, override=False)

    if not config_path.exists():
        raise FileNotFoundError(
            f"Config file not found: {config_path}\n"
            "Copy config.toml.example to config.toml and adjust your paths."
        )

    with config_path.open("rb") as handle:
        raw: dict = tomllib.load(handle)

    # Every table except [vision] is mandatory.
    project_tbl = raw["project"]
    paths_tbl = raw["paths"]
    video_tbl = raw["video"]
    cv_tbl = raw["cv"]
    scene_tbl = raw["scene_detection"]
    whisper_tbl = raw["whisper"]
    llm_tbl = raw["llm"]
    vision_tbl = raw.get("vision", {})
    export_tbl = raw["export"]

    base_dir = config_path.parent

    def _anchor(entry: str) -> Path:
        # Relative entries are anchored at the config file's directory so the
        # project stays relocatable; absolute entries pass through untouched.
        candidate = Path(entry)
        if candidate.is_absolute():
            return candidate
        return (base_dir / candidate).resolve()

    # Small coercion helpers for optional keys with a fallback value.
    def _opt_float(table: dict, key: str, fallback: object) -> float:
        return float(table.get(key, fallback))

    def _opt_int(table: dict, key: str, fallback: object) -> int:
        return int(table.get(key, fallback))

    def _opt_bool(table: dict, key: str, fallback: object) -> bool:
        return bool(table.get(key, fallback))

    paths = PathsConfig(
        source_movie=_anchor(paths_tbl["source_movie"]),
        reference_trailer=_anchor(paths_tbl["reference_trailer"]),
        output_dir=_anchor(paths_tbl["output_dir"]),
        cache_dir=_anchor(paths_tbl["cache_dir"]),
        proxy_dir=_anchor(paths_tbl["proxy_dir"]),
    )

    video = VideoConfig(
        extract_fps=float(video_tbl["extract_fps"]),
        proxy_width=int(video_tbl["proxy_width"]),
        proxy_height=int(video_tbl["proxy_height"]),
    )

    vc = cv_tbl["vibe_check"]
    vibe_check = VibeCheckConfig(
        top_k_candidates=int(vc["top_k_candidates"]),
        hist_compare_method=int(vc["hist_compare_method"]),
        hist_bins_hue=int(vc["hist_bins_hue"]),
        hist_bins_saturation=int(vc["hist_bins_saturation"]),
        phash_max_distance=int(vc["phash_max_distance"]),
        crop_top_fraction=float(vc["crop_top_fraction"]),
        crop_bottom_fraction=float(vc["crop_bottom_fraction"]),
    )

    ds = cv_tbl["deep_scan"]
    deep_scan = DeepScanConfig(
        coarse_step_seconds=float(ds["coarse_step_seconds"]),
        match_threshold=float(ds["match_threshold"]),
        provisional_match_threshold=_opt_float(ds, "provisional_match_threshold", 0.45),
        # Falls back to the (required) match_threshold when not set explicitly.
        coarse_candidate_threshold=_opt_float(ds, "coarse_candidate_threshold", ds["match_threshold"]),
        sequence_score_weight=_opt_float(ds, "sequence_score_weight", 0.55),
        span_score_weight=_opt_float(ds, "span_score_weight", 0.15),
        coarse_score_weight=_opt_float(ds, "coarse_score_weight", 0.10),
        duration_score_weight=_opt_float(ds, "duration_score_weight", 0.20),
        duration_tie_break_score_delta=_opt_float(ds, "duration_tie_break_score_delta", 0.03),
        min_duration_coverage=_opt_float(ds, "min_duration_coverage", 0.65),
        continuity_seed_offsets_s=tuple(
            float(offset)
            for offset in ds.get(
                "continuity_seed_offsets_s",
                [-1.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0],
            )
        ),
        scene_seed_top_k=_opt_int(ds, "scene_seed_top_k", 30),
        scene_seed_points_per_scene=_opt_int(ds, "scene_seed_points_per_scene", 6),
        content_rerank_candidate_count=_opt_int(ds, "content_rerank_candidate_count", 100),
        skip_coarse_scan_with_weighted_seeds=_opt_bool(ds, "skip_coarse_scan_with_weighted_seeds", False),
        max_refine_candidates=_opt_int(ds, "max_refine_candidates", 6),
        match_method=int(ds["match_method"]),
        refine_window_seconds=_opt_float(ds, "refine_window_seconds", 0.6),
        refine_step_seconds=float(ds["refine_step_seconds"]),
        content_align_window_seconds=_opt_float(ds, "content_align_window_seconds", 0.48),
        content_align_sample_step_s=_opt_float(ds, "content_align_sample_step_s", 0.28),
        content_validation_weight=_opt_float(ds, "content_validation_weight", 0.35),
        provisional_content_threshold=_opt_float(ds, "provisional_content_threshold", 0.42),
        start_tie_break_score_delta=_opt_float(ds, "start_tie_break_score_delta", 0.015),
        start_preroll_frames=_opt_int(ds, "start_preroll_frames", 0),
        sequence_candidate_count=_opt_int(ds, "sequence_candidate_count", 240),
        sequence_min_distance_s=_opt_float(ds, "sequence_min_distance_s", 1.0),
        span_sample_step_s=_opt_float(ds, "span_sample_step_s", 0.08),
        trim_tail_frames=_opt_int(ds, "trim_tail_frames", 2),
        scene_boundary_epsilon_s=_opt_float(ds, "scene_boundary_epsilon_s", 0.12),
        scoreable_luma_mean_min=_opt_float(ds, "scoreable_luma_mean_min", 24.0),
        scoreable_luma_p90_min=_opt_float(ds, "scoreable_luma_p90_min", 58.0),
        scoreable_contrast_min=_opt_float(ds, "scoreable_contrast_min", 24.0),
    )

    scene_detection = SceneDetectionConfig(
        content_threshold=float(scene_tbl["content_threshold"]),
        min_scene_duration_s=float(scene_tbl["min_scene_duration_s"]),
    )

    whisper = WhisperConfig(
        model=whisper_tbl["model"],
        language=whisper_tbl["language"],
        device=whisper_tbl["device"],
        compute_type=whisper_tbl["compute_type"],
    )

    # LLM key lookup:
    #   OPENROUTER_API_KEY  -> provider "openrouter"
    #   OPENAI_API_KEY      -> provider "openai"
    #   LLM_API_KEY         -> fallback for any provider
    llm_provider = llm_tbl["provider"]
    if llm_provider == "openrouter":
        provider_key = os.environ.get("OPENROUTER_API_KEY", "")
    elif llm_provider == "openai":
        provider_key = os.environ.get("OPENAI_API_KEY", "")
    else:
        provider_key = ""
    llm_api_key = provider_key or os.environ.get("LLM_API_KEY", "")

    llm = LLMConfig(
        provider=llm_provider,
        base_url=llm_tbl["base_url"],
        model=llm_tbl["model"],
        timeout_seconds=int(llm_tbl["timeout_seconds"]),
        temperature=float(llm_tbl["temperature"]),
        max_tokens=int(llm_tbl["max_tokens"]),
        api_key=llm_api_key,
    )

    # Vision inherits the LLM provider only when that provider is one the
    # vision section supports; otherwise it falls back to "openrouter".
    inherited_provider = llm_provider if llm_provider in ("openai", "openrouter") else "openrouter"
    vision_provider = vision_tbl.get("provider", inherited_provider)
    if vision_provider == "openrouter":
        vision_provider_key = os.environ.get("OPENROUTER_API_KEY", "")
    else:
        vision_provider_key = os.environ.get("OPENAI_API_KEY", "")
    vision_api_key = (
        vision_provider_key
        or os.environ.get("VISION_API_KEY", "")
        or os.environ.get("LLM_API_KEY", "")
    )

    vision = VisionConfig(
        enabled=_opt_bool(vision_tbl, "enabled", False),
        provider=vision_provider,
        base_url=str(vision_tbl.get("base_url", llm.base_url)),
        model=str(vision_tbl.get("model", llm.model)),
        timeout_seconds=_opt_int(vision_tbl, "timeout_seconds", llm.timeout_seconds),
        temperature=_opt_float(vision_tbl, "temperature", 0.0),
        max_tokens=_opt_int(vision_tbl, "max_tokens", 350),
        scene_candidate_top_k=_opt_int(vision_tbl, "scene_candidate_top_k", 8),
        max_new_descriptions_per_run=_opt_int(vision_tbl, "max_new_descriptions_per_run", 12),
        max_seed_scenes=_opt_int(vision_tbl, "max_seed_scenes", 3),
        seed_points_per_scene=_opt_int(vision_tbl, "seed_points_per_scene", 12),
        seed_score=_opt_float(vision_tbl, "seed_score", 0.88),
        max_refine_candidates=_opt_int(vision_tbl, "max_refine_candidates", 6),
        local_scan_step_s=_opt_float(vision_tbl, "local_scan_step_s", 0.12),
        local_scan_max_points_per_scene=_opt_int(vision_tbl, "local_scan_max_points_per_scene", 180),
        local_scan_top_candidates=_opt_int(vision_tbl, "local_scan_top_candidates", 18),
        local_scan_tie_break_score_delta=_opt_float(vision_tbl, "local_scan_tie_break_score_delta", 0.08),
        multi_shot_cut_corr_threshold=_opt_float(vision_tbl, "multi_shot_cut_corr_threshold", 0.20),
        multi_shot_boundary_tolerance_s=_opt_float(vision_tbl, "multi_shot_boundary_tolerance_s", 0.20),
        fullscan_fallback=_opt_bool(vision_tbl, "fullscan_fallback", False),
        content_threshold=_opt_float(vision_tbl, "content_threshold", 0.22),
        similarity_threshold=_opt_float(vision_tbl, "similarity_threshold", 0.18),
        api_key=vision_api_key,
    )

    export = ExportConfig(
        fcpxml_version=str(export_tbl["fcpxml_version"]),
        edl_frame_rate=float(export_tbl["edl_frame_rate"]),
        output_format=export_tbl["output_format"],
    )

    cv = CVConfig(vibe_check=vibe_check, deep_scan=deep_scan)

    return AppConfig(
        project_name=project_tbl["name"],
        version=project_tbl["version"],
        log_level=project_tbl["log_level"],
        paths=paths,
        video=video,
        cv=cv,
        scene_detection=scene_detection,
        whisper=whisper,
        llm=llm,
        vision=vision,
        export=export,
    )
|
||||
@@ -0,0 +1,287 @@
|
||||
"""
|
||||
src/core/models.py — Canonical data models for AI Trailer Generator v2
|
||||
|
||||
Rules:
|
||||
- Every model is a frozen dataclass (immutable after creation).
|
||||
- All fields are strictly typed; no bare dicts or untyped lists.
|
||||
- Seconds are always float; frame numbers are always int.
|
||||
- Confidence scores live in [0.0, 1.0].
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum, auto
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Enumerations
|
||||
# ===========================================================================
|
||||
|
||||
class MatchMethod(Enum):
    """CV template matching method (mirrors cv2.TM_* constants)."""
    # Member values are the integer codes of the like-named cv2 constants, so
    # MatchMethod(n) maps an integer method code back to a name.
    TM_SQDIFF = 0
    TM_SQDIFF_NORMED = 1
    TM_CCORR = 2
    TM_CCORR_NORMED = 3
    TM_CCOEFF = 4
    TM_CCOEFF_NORMED = 5
|
||||
|
||||
|
||||
class BeatType(Enum):
    """Narrative role of a trailer beat (for dramaturgy / LLM use only)."""
    # auto() numbers the members 1..6 in declaration order, so reordering
    # them changes their values.
    HOOK = auto()        # Opening attention grabber
    SETUP = auto()       # World / character introduction
    CONFLICT = auto()    # Inciting incident / rising tension
    CLIMAX = auto()      # Peak action / emotion
    RESOLUTION = auto()  # Cool-down / tagline
    UNKNOWN = auto()     # No role assigned
|
||||
|
||||
|
||||
class ExportFormat(Enum):
    """Export target selector.

    The member values are the strings accepted by ``output_format`` in the
    ``[export]`` table of config.toml (fcpxml | edl | both).
    """
    FCPXML = "fcpxml"
    EDL = "edl"
    BOTH = "both"  # write both FCPXML and EDL
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Phase 0 — Source-movie scene index
|
||||
# ===========================================================================
|
||||
|
||||
@dataclass(frozen=True)
class DialogueLine:
    """One line of speech as transcribed by Whisper."""
    start_s: float                  # when the line begins, in seconds
    end_s: float                    # when the line ends, in seconds
    text: str                       # transcript text, verbatim
    speaker: Optional[str] = None   # diarisation label, when one is available

    @property
    def duration_s(self) -> float:
        """Length of the line in seconds (end minus start)."""
        onset, offset = self.start_s, self.end_s
        return offset - onset
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Scene:
    """
    A single detected scene of the source movie.

    Created from PySceneDetect output, then enriched with Whisper dialogue
    and, optionally, with perceptual fingerprints during the Vibe Check phase.
    """
    scene_id: int                   # zero-based position within the source movie
    source_path: Path               # absolute path of the source video file
    start_s: float                  # where the scene starts, in seconds
    end_s: float                    # where the scene ends, in seconds
    start_frame: int                # number of the first frame
    end_frame: int                  # number of the last frame

    # Filled in by the Vibe Check fingerprinting step
    luma_hist: Optional[bytes] = None    # serialised np.ndarray (pickle)
    sat_hist: Optional[bytes] = None
    phash: Optional[str] = None          # 64-bit hex string

    # Filled in by the Whisper pass
    dialogue: tuple[DialogueLine, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Scene length in seconds."""
        return self.end_s - self.start_s

    @property
    def midpoint_s(self) -> float:
        """Timestamp halfway through the scene, in seconds."""
        half_span = self.duration_s / 2.0
        return self.start_s + half_span

    def __repr__(self) -> str:
        """Compact summary: id, time window and duration, two decimals each."""
        return "Scene(id={}, {:.2f}s–{:.2f}s, dur={:.2f}s)".format(
            self.scene_id,
            self.start_s,
            self.end_s,
            self.duration_s,
        )
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Phase 1 — Reference-trailer beat
|
||||
# ===========================================================================
|
||||
|
||||
@dataclass(frozen=True)
class TrailerBeat:
    """
    A single cut / segment of the reference trailer.

    The beat is the trailer's atomic unit: each one corresponds to exactly
    one clip that is later sourced from the original movie.
    """

    beat_id: int
    trailer_path: Path
    start_s: float
    end_s: float
    start_frame: int
    end_frame: int

    # Narrative role; assigned by the LLM dramaturgy pass.
    beat_type: BeatType = BeatType.UNKNOWN

    # Fingerprints of the *middle* frame (filled in by the CV pipeline).
    luma_hist: Optional[bytes] = None
    sat_hist: Optional[bytes] = None
    phash: Optional[str] = None

    # Dialogue extracted from this beat.
    dialogue: tuple[DialogueLine, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Beat length in seconds."""
        return self.end_s - self.start_s

    @property
    def midpoint_s(self) -> float:
        """Timestamp halfway between ``start_s`` and ``end_s``."""
        half_span = self.duration_s / 2.0
        return self.start_s + half_span

    def __repr__(self) -> str:
        return "TrailerBeat(id={}, {}, {:.2f}s–{:.2f}s)".format(
            self.beat_id, self.beat_type.name, self.start_s, self.end_s
        )
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Phase 2 — CV match result
|
||||
# ===========================================================================
|
||||
|
||||
@dataclass(frozen=True)
class VibeHit:
    """
    Coarse-filter result of the Vibe Check (histogram / pHash) stage.

    Describes a *candidate* scene that survived the coarse filter. It is
    not a confirmed match yet; it is handed on to Deep Scan.
    """

    beat_id: int
    scene_id: int
    # Histogram similarity, documented as [0.0, 1.0] (CORREL method).
    hist_score: float
    # Hamming distance in [0, 64]; a lower value means more similar.
    phash_distance: int
    # Weighted aggregate used for ranking candidates.
    combined_score: float
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MatchSegment:
    """
    A source-backed visual island within one trailer beat.

    A trailer beat may hold several shots separated by fades or title
    frames; a single continuous source in/out range cannot describe such a
    beat accurately, so it is broken into segments.
    """

    trailer_offset_s: float
    duration_s: float
    scene_id: int
    in_point_s: float
    out_point_s: float
    match_score: float
    is_confirmed: bool = True
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MatchResult:
    """
    Confirmed outcome of Phase 2 (Deep Scan — template matching).

    There is one MatchResult per TrailerBeat: the best frame-accurate hit
    located inside the source movie.
    """

    beat_id: int  # the trailer beat that was matched
    scene_id: int  # the source scene containing the match
    source_path: Path  # absolute path of the source video

    # Frame-accurate in/out points, expressed in SOURCE-movie time.
    in_point_s: float  # onset of the matched frame, in seconds
    out_point_s: float  # computed out-point (in_point + beat duration)
    in_point_frame: int  # matched frame number in the source movie

    # Match quality.
    match_score: float  # cv2.matchTemplate peak value
    # (x, y) pixel position of the best match within the source frame.
    match_location: tuple[int, int] = field(default_factory=lambda: (0, 0))

    # Provenance.
    vibe_hit: Optional[VibeHit] = None  # candidate that led to this result
    is_confirmed: bool = True
    segments: tuple[MatchSegment, ...] = field(default_factory=tuple)

    @property
    def duration_s(self) -> float:
        """Length of the matched source range in seconds."""
        return self.out_point_s - self.in_point_s

    def __repr__(self) -> str:
        return "MatchResult(beat={} → scene={}, in={:.3f}s, score={:.3f})".format(
            self.beat_id, self.scene_id, self.in_point_s, self.match_score
        )
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Phase 3 — Edit timeline (pre-export)
|
||||
# ===========================================================================
|
||||
|
||||
@dataclass(frozen=True)
class EditClip:
    """
    A single clip on the final edit timeline, ready for FCPXML / EDL export.

    Bundles the beat's dramaturgy with the CV-confirmed source in/out points.
    """

    clip_index: int  # zero-based position on the timeline
    beat: TrailerBeat
    match: MatchResult

    # Placement within the OUTPUT trailer.
    timeline_start_s: float
    timeline_end_s: float
    source_duration_s: float | None = None
    trailer_tail_s: float = 0.0

    # Optional audio override (e.g. VO or music).
    audio_path: Optional[Path] = None
    audio_offset_s: float = 0.0

    @property
    def timeline_duration_s(self) -> float:
        """Length of the clip's slot on the output timeline, in seconds."""
        return self.timeline_end_s - self.timeline_start_s

    @property
    def source_timeline_duration_s(self) -> float:
        """Explicit source duration clamped at zero, else the timeline duration."""
        if self.source_duration_s is None:
            return self.timeline_duration_s
        return max(0.0, self.source_duration_s)

    def __repr__(self) -> str:
        return "EditClip(#{}, tl={:.2f}s–{:.2f}s, src={:.3f}s)".format(
            self.clip_index,
            self.timeline_start_s,
            self.timeline_end_s,
            self.match.in_point_s,
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EditTimeline:
    """
    The full, ordered run of EditClips that makes up the trailer.

    Handed to the export layer (FCPXML / EDL writer).
    """

    title: str
    frame_rate: float  # e.g. 23.976
    clips: tuple[EditClip, ...]  # ordered by clip_index

    @property
    def total_duration_s(self) -> float:
        """Largest ``timeline_end_s`` over all clips, or 0.0 without clips."""
        if self.clips:
            return max(self.clips, key=lambda clip: clip.timeline_end_s).timeline_end_s
        return 0.0

    @property
    def clip_count(self) -> int:
        """Number of clips on the timeline."""
        return len(self.clips)
|
||||
@@ -0,0 +1 @@
|
||||
# src.cv package — Computer Vision engine
|
||||
@@ -0,0 +1,240 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image, ImageFilter, ImageOps
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import TrailerBeat
|
||||
|
||||
|
||||
def _run(cmd: list[str]) -> None:
    """Execute *cmd* with captured output and fail on a non-zero exit code.

    Raises:
        RuntimeError: Carrying the process's stderr, decoded with
            replacement of undecodable bytes.
    """
    completed = subprocess.run(cmd, capture_output=True)
    if completed.returncode == 0:
        return
    raise RuntimeError(completed.stderr.decode(errors="replace"))
|
||||
|
||||
|
||||
def _extract_frames(
    video_path: Path,
    start_s: float,
    duration_s: float,
    fps: float,
    out_dir: Path,
    prefix: str,
) -> None:
    """Dump a time range of *video_path* as numbered PNGs via FFmpeg.

    Frames are scaled to 640x360, resampled to *fps* and written to
    ``out_dir/<prefix>_%04d.png``. The start is clamped to be non-negative
    and the duration to at least 0.04 seconds. *out_dir* is created when
    missing.

    Raises:
        RuntimeError: Propagated from ``_run`` when FFmpeg exits non-zero.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    seek_s = max(0.0, start_s)
    span_s = max(0.04, duration_s)
    frame_pattern = out_dir / f"{prefix}_%04d.png"

    command = ["ffmpeg", "-y", "-loglevel", "error"]
    command += ["-ss", str(seek_s)]
    command += ["-i", str(video_path)]
    command += ["-t", str(span_s)]
    command += ["-vf", f"scale=640:360,fps={fps}"]
    command.append(str(frame_pattern))
    _run(command)
|
||||
|
||||
|
||||
def _cropped_image(path: Path, cfg: AppConfig) -> Image.Image:
    """Load *path* as greyscale, trim dark borders and shave 5% top and bottom.

    *cfg* is accepted for signature parity with the other feature helpers
    but is not read here.
    """
    trimmed = _trim_dark_borders(Image.open(path).convert("L"))
    width, height = trimmed.size
    # Final validation should see the composition. The broader text-safe crop
    # used for coarse search can remove bodies, furniture and lower-frame
    # spatial cues that distinguish otherwise similar face/window shots.
    upper = int(height * 0.05)
    lower = int(height * 0.95)
    return trimmed.crop((0, upper, width, lower))
|
||||
|
||||
|
||||
def _trim_dark_borders(image: Image.Image) -> Image.Image:
    """Remove encoded black matte/pillarbox borders before content scoring.

    A row or column counts as "active" when its 90th-percentile luma
    exceeds 18. Along each axis the image is cut to the first..last active
    line (with a small safety margin) provided enough active lines exist;
    otherwise that axis is left untouched. If the resulting box would be
    smaller than 35% of either dimension, the input is returned unchanged.
    """
    luma = np.asarray(image.convert("L"), dtype=np.float32)
    if luma.size == 0:
        return image
    height, width = luma.shape[:2]

    def _bright_span(axis: int, extent: int) -> tuple[int, int]:
        # axis=0 collapses rows (one value per column); axis=1 the reverse.
        profile = np.percentile(luma, 90, axis=axis)
        lit = np.where(profile > 18.0)[0]
        if lit.size < max(8, int(extent * 0.35)):
            return 0, extent
        return max(0, int(lit[0]) - 2), min(extent, int(lit[-1]) + 3)

    x0, x1 = _bright_span(0, width)
    y0, y1 = _bright_span(1, height)

    too_narrow = x1 - x0 < int(width * 0.35)
    too_short = y1 - y0 < int(height * 0.35)
    if too_narrow or too_short:
        return image
    return image.crop((x0, y0, x1, y1))
|
||||
|
||||
|
||||
def _feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """Return a standardised edge map of the frame stored at *path*.

    The cropped greyscale frame loses a further 10% on every side, is
    histogram-equalised, edge-filtered and resized to 160x62. The array is
    then shifted to zero mean and scaled by its standard deviation.
    """
    frame = _cropped_image(path, cfg)
    width, height = frame.size
    inner_box = (
        int(width * 0.10),
        int(height * 0.10),
        int(width * 0.90),
        int(height * 0.90),
    )
    inner = frame.crop(inner_box)
    edges = ImageOps.equalize(inner).filter(ImageFilter.FIND_EDGES)
    arr = np.asarray(edges.resize((160, 62)), dtype=np.float32)
    # The epsilon keeps a perfectly flat frame from dividing by zero.
    return (arr - arr.mean()) / (arr.std() + 1e-6)
|
||||
|
||||
|
||||
def _luma_feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """Return a standardised 160x80 equalised-luma map of the frame at *path*."""
    frame = _cropped_image(path, cfg)
    equalised = ImageOps.equalize(frame)
    arr = np.asarray(equalised.resize((160, 80)), dtype=np.float32)
    # The epsilon keeps a perfectly flat frame from dividing by zero.
    return (arr - arr.mean()) / (arr.std() + 1e-6)
|
||||
|
||||
|
||||
def _hist_feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """Return concatenated 32-bin histograms of the three RGB channels.

    The frame is border-trimmed, cropped by 5% top and bottom and resized
    to 160x80 first. Each channel histogram is normalised to sum to ~1.
    *cfg* is not read here.
    """
    rgb = _trim_dark_borders(Image.open(path).convert("RGB"))
    width, height = rgb.size
    upper = int(height * 0.05)
    lower = int(height * 0.95)
    band = rgb.crop((0, upper, width, lower)).resize((160, 80))
    pixels = np.asarray(band, dtype=np.float32)

    def _channel_hist(index: int) -> np.ndarray:
        counts, _ = np.histogram(pixels[:, :, index], bins=32, range=(0, 255))
        counts = counts.astype(np.float32)
        return counts / (counts.sum() + 1e-6)

    return np.concatenate([_channel_hist(index) for index in range(3)])
|
||||
|
||||
|
||||
def _spatial_hist_feature(path: Path, cfg: AppConfig) -> np.ndarray:
    """Return per-cell RGB histograms over a 4x4 grid of the frame.

    The frame is border-trimmed, cropped by 5% top and bottom and resized
    to 160x80. For every grid cell (row-major) and every channel a 16-bin
    histogram normalised to sum to ~1 is produced; all are concatenated.
    *cfg* is not read here.
    """
    rgb = _trim_dark_borders(Image.open(path).convert("RGB"))
    width, height = rgb.size
    upper = int(height * 0.05)
    lower = int(height * 0.95)
    band = rgb.crop((0, upper, width, lower)).resize((160, 80))
    pixels = np.asarray(band, dtype=np.float32)

    rows = 4
    cols = 4
    tile_h = pixels.shape[0] // rows
    tile_w = pixels.shape[1] // cols

    parts = []
    for row in range(rows):
        y_lo, y_hi = row * tile_h, (row + 1) * tile_h
        for col in range(cols):
            x_lo, x_hi = col * tile_w, (col + 1) * tile_w
            tile = pixels[y_lo:y_hi, x_lo:x_hi, :]
            for index in range(3):
                counts, _ = np.histogram(tile[:, :, index], bins=16, range=(0, 255))
                counts = counts.astype(np.float32)
                parts.append(counts / (counts.sum() + 1e-6))
    return np.concatenate(parts)
|
||||
|
||||
|
||||
def _is_dark(path: Path, cfg: AppConfig) -> bool:
    """Tell whether the frame at *path* is essentially black.

    After border trimming and a 5% top/bottom crop, the frame is dark when
    its mean luma is below 28 and its 90th-percentile luma is below 58.
    *cfg* is not read here.
    """
    grey = _trim_dark_borders(Image.open(path).convert("L"))
    width, height = grey.size
    upper = int(height * 0.05)
    lower = int(height * 0.95)
    luma = np.asarray(grey.crop((0, upper, width, lower)), dtype=np.float32)

    mean_is_low = float(luma.mean()) < 28.0
    return mean_is_low and float(np.percentile(luma, 90)) < 58.0
|
||||
|
||||
|
||||
def _corr(a: np.ndarray, b: np.ndarray) -> float:
    """Return the mean of the element-wise product of *a* and *b*."""
    product = a * b
    return float(product.mean())
|
||||
|
||||
|
||||
def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float:
    """Return sum(min(a, b)) / sum(max(a, b)), guarded against a zero divisor."""
    overlap = np.minimum(a, b).sum()
    envelope = np.maximum(a, b).sum()
    return float(overlap / (envelope + 1e-6))
|
||||
|
||||
|
||||
def _paired_frame_score(ref_path: Path, src_path: Path, cfg: AppConfig) -> float:
    """Blend four similarity cues between a reference and a source frame.

    Weights: edge correlation 0.24, luma correlation 0.24, global colour
    histogram overlap 0.14, spatial (grid) histogram overlap 0.38.
    """
    edge_term = _corr(_feature(ref_path, cfg), _feature(src_path, cfg)) * 0.24
    luma_term = _corr(_luma_feature(ref_path, cfg), _luma_feature(src_path, cfg)) * 0.24
    hist_term = _hist_intersection(
        _hist_feature(ref_path, cfg),
        _hist_feature(src_path, cfg),
    ) * 0.14
    spatial_term = _hist_intersection(
        _spatial_hist_feature(ref_path, cfg),
        _spatial_hist_feature(src_path, cfg),
    ) * 0.38
    # Left-to-right addition keeps floating-point results unchanged.
    return edge_term + luma_term + hist_term + spatial_term
|
||||
|
||||
|
||||
def align_cached_match_by_content(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
    fps: float = 25.0,
) -> tuple[float, float]:
    """
    Measure the local source offset directly from rendered frame content.

    This is intentionally independent from the global OpenCV matcher: it only
    needs FFmpeg, Pillow and numpy, and it scans a small window around an
    already plausible candidate.

    Args:
        beat: Trailer beat whose frames act as the reference.
        estimated_in_point_s: Plausible source in-point to search around.
        cfg: Application configuration (paths and deep-scan settings).
        search_window_s: Half-width of the search window in seconds; falls
            back to ``cfg.cv.deep_scan.content_align_window_seconds``.
        fps: Rate at which both clips are rendered to frames.

    Returns:
        ``(in_point_s, score)``. The score is clamped to be non-negative.
        ``(estimated_in_point_s, 0.0)`` is returned when no frames could be
        extracted; when no candidate gathers enough frame scores the
        estimate (snapped to the frame grid) is returned with score 0.0.

    Raises:
        RuntimeError: Propagated from frame extraction when FFmpeg fails.
    """
    window_s = (
        search_window_s
        if search_window_s is not None
        else cfg.cv.deep_scan.content_align_window_seconds
    )
    sample_step_s = max(1.0 / fps, cfg.cv.deep_scan.content_align_sample_step_s)
    source_start_s = max(0.0, estimated_in_point_s - window_s)
    source_duration_s = beat.duration_s + (2.0 * window_s) + 0.5

    tmp = cfg.paths.output_dir / "align_tmp" / f"beat_{beat.beat_id:03d}"
    shutil.rmtree(tmp, ignore_errors=True)
    tmp.mkdir(parents=True, exist_ok=True)
    try:
        ref_dir = tmp / "ref"
        src_dir = tmp / "src"
        _extract_frames(beat.trailer_path, beat.start_s, beat.duration_s, fps, ref_dir, "ref")
        _extract_frames(cfg.paths.source_movie, source_start_s, source_duration_s, fps, src_dir, "src")

        ref_frames = sorted(ref_dir.glob("ref_*.png"))
        src_frames = sorted(src_dir.glob("src_*.png"))
        if not ref_frames or not src_frames:
            return estimated_in_point_s, 0.0

        sample_frame_step = max(1, int(round(sample_step_s * fps)))
        # The last ~0.24 s of the beat is excluded from matching.
        min_matchable_frames = max(1, len(ref_frames) - int(round(0.24 * fps)))
        sampled_offsets = list(range(0, min_matchable_frames, sample_frame_step))

        # Prefer non-dark reference frames; fall back to every sampled frame
        # when fewer than three usable ones remain. Only the offsets are
        # needed later, so no per-template feature is computed here.
        template_offsets = [
            idx for idx in sampled_offsets if not _is_dark(ref_frames[idx], cfg)
        ]
        if len(template_offsets) < 3:
            template_offsets = sampled_offsets

        search_end_frame = max(0, len(src_frames) - min_matchable_frames)
        estimated_frame = int(round((estimated_in_point_s - source_start_s) * fps))
        best_frame = estimated_frame
        best_score = -1.0

        # A candidate needs at least this many frame scores to count.
        required_scores = max(3, math.ceil(len(template_offsets) * 0.65))

        # With fewer templates than required no candidate can qualify, so
        # the (expensive) sweep is skipped; the outcome is the same.
        if len(template_offsets) >= required_scores:
            for candidate_frame in range(0, search_end_frame + 1):
                scores: list[float] = []
                for offset_frame in template_offsets:
                    src_idx = candidate_frame + offset_frame
                    if src_idx < 0 or src_idx >= len(src_frames):
                        break
                    scores.append(
                        _paired_frame_score(ref_frames[offset_frame], src_frames[src_idx], cfg)
                    )
                if len(scores) < required_scores:
                    continue

                avg_score = sum(scores) / len(scores)
                min_score = min(scores)
                # Blend mean and worst frame so one bad frame penalises.
                score = (avg_score * 0.68) + (min_score * 0.32)
                if score > best_score + 0.003:
                    best_score = score
                    best_frame = candidate_frame
                elif (
                    score >= best_score - 0.003
                    and abs(candidate_frame - estimated_frame) < abs(best_frame - estimated_frame)
                ):
                    # Near-tie: prefer the candidate closer to the estimate.
                    best_frame = candidate_frame

        return source_start_s + (best_frame / fps), max(0.0, best_score)
    finally:
        # Rendered frames are scratch data; always remove them.
        shutil.rmtree(tmp, ignore_errors=True)
|
||||
@@ -0,0 +1,253 @@
|
||||
"""
|
||||
src/cv/deep_scan.py — Phase 2: Frame-accurate template matching (Deep Scan)
|
||||
|
||||
Responsibility:
|
||||
Given a TrailerBeat and a ranked list of VibeHit candidates, open the
|
||||
source video and scan each candidate scene in two passes:
|
||||
|
||||
1. Coarse pass: step through at coarse_step_seconds intervals,
|
||||
comparing via cv2.matchTemplate.
|
||||
2. Refine pass: if coarse score > threshold, zoom in ± refine_window_seconds
|
||||
at refine_step_seconds resolution to pin the exact in-point.
|
||||
|
||||
Returns a MatchResult if a confident hit is found, otherwise None.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Sequence
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import MatchResult, Scene, TrailerBeat, VibeHit
|
||||
from src.cv.fingerprinting import text_safe_crop
|
||||
from src.cv.frame_extractor import (
|
||||
grab_frame_at,
|
||||
grab_frame_at_path,
|
||||
iter_frames_stepped,
|
||||
open_video,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Template preparation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _prepare_template(
    trailer_beat: TrailerBeat,
    cfg: AppConfig,
    proxy_w: int,
    proxy_h: int,
) -> np.ndarray | None:
    """
    Extract, crop, and resize the representative frame from the trailer beat.

    This frame becomes the cv2.matchTemplate "needle".

    Args:
        trailer_beat: Beat whose midpoint frame is used.
        cfg: Application configuration; supplies the text-safe crop fractions.
        proxy_w: Proxy frame width in pixels.
        proxy_h: Proxy frame height in pixels.

    Returns:
        The template image, or None when the midpoint frame cannot be decoded.
    """
    vc = cfg.cv.vibe_check

    beat_frame = grab_frame_at_path(
        trailer_beat.trailer_path,
        trailer_beat.midpoint_s,
    )
    if beat_frame is None:
        logger.warning("Beat %d: cannot decode midpoint frame.", trailer_beat.beat_id)
        return None

    cropped = text_safe_crop(beat_frame, vc.crop_top_fraction, vc.crop_bottom_fraction)
    resized = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA)

    # Crop the template by 10% on all sides to allow sliding window (translation invariance)
    # when matching against the source movie, which might have slight pan/scan shifts.
    margin_y = int(proxy_h * 0.10)
    margin_x = int(proxy_w * 0.10)
    return resized[margin_y : proxy_h - margin_y, margin_x : proxy_w - margin_x]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Single-frame match
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _match_frame(
    source_frame: np.ndarray,
    template: np.ndarray,
    method: int,
    proxy_w: int,
    proxy_h: int,
    crop_top: float,
    crop_bottom: float,
) -> tuple[float, tuple[int, int]]:
    """
    Run cv2.matchTemplate between *source_frame* and *template*.

    The source frame receives the same text-safe crop and proxy resize as
    the template did, then the (slightly smaller) template is slid over it.

    Args:
        source_frame: Decoded source-movie frame.
        template: Needle image; must not be larger than the proxy frame.
        method: A ``cv2.TM_*`` constant.
        proxy_w: Proxy frame width in pixels.
        proxy_h: Proxy frame height in pixels.
        crop_top: Fraction of height removed from the top.
        crop_bottom: Fraction of height removed from the bottom.

    Returns:
        ``(score, (x, y))`` where a higher score always means a better
        match and ``(x, y)`` is the best-match location. For the
        correlation modes the score is the response maximum. For
        ``TM_SQDIFF_NORMED`` it is ``1 - minimum``; for ``TM_SQDIFF`` it is
        the negated minimum.
    """
    cropped = text_safe_crop(source_frame, crop_top, crop_bottom)
    haystack = cv2.resize(cropped, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA)

    # Match the slightly smaller template inside the full proxy frame
    result = cv2.matchTemplate(haystack, template, method)
    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)

    if method in (cv2.TM_SQDIFF, cv2.TM_SQDIFF_NORMED):
        # Squared-difference modes: the best match is the minimum response.
        # Flip it so callers can keep comparing with "higher is better".
        score = (1.0 - min_val) if method == cv2.TM_SQDIFF_NORMED else -min_val
        location = min_loc
    else:
        score = max_val
        location = max_loc
    return float(score), (int(location[0]), int(location[1]))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Deep Scan core
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def scan_scene(
    beat: TrailerBeat,
    scene: Scene,
    template: np.ndarray,
    cfg: AppConfig,
) -> tuple[float, float, tuple[int, int]] | None:
    """
    Template-match one source scene in a coarse sweep followed by a refine sweep.

    The coarse sweep walks the whole scene at ``coarse_step_seconds``. If its
    best score falls below ``match_threshold`` the scene is rejected. Otherwise
    a window of ``refine_window_seconds`` around the coarse hit, clipped to the
    scene, is re-scanned at ``refine_step_seconds``.

    Returns:
        (best_timestamp_s, best_score, best_location) or None if no hit.
    """
    deep = cfg.cv.deep_scan
    vibe = cfg.cv.vibe_check
    width = cfg.video.proxy_width
    height = cfg.video.proxy_height

    def _sweep(cap, t_from, t_to, step, seed):
        # Keep the strictly best-scoring frame, starting from *seed*.
        top_t, top_score, top_loc = seed
        for stamp, frame in iter_frames_stepped(cap, t_from, t_to, step):
            score, loc = _match_frame(
                frame, template, deep.match_method,
                width, height, vibe.crop_top_fraction, vibe.crop_bottom_fraction,
            )
            if score > top_score:
                top_t, top_score, top_loc = stamp, score, loc
        return top_t, top_score, top_loc

    with open_video(scene.source_path) as cap:
        # ---- Coarse pass ----------------------------------------------------
        best_t, best_score, best_loc = _sweep(
            cap, scene.start_s, scene.end_s, deep.coarse_step_seconds,
            (scene.start_s, 0.0, (0, 0)),
        )
        if best_score < deep.match_threshold:
            return None  # scene doesn't contain a match worth refining

        # ---- Refine pass ----------------------------------------------------
        window_lo = max(scene.start_s, best_t - deep.refine_window_seconds)
        window_hi = min(scene.end_s, best_t + deep.refine_window_seconds)
        refined_t, refined_score, refined_loc = _sweep(
            cap, window_lo, window_hi, deep.refine_step_seconds,
            (best_t, best_score, best_loc),
        )

        logger.debug(
            "Beat %d → Scene %d: coarse=%.3f refined=%.3f @%.3fs",
            beat.beat_id, scene.scene_id, best_score, refined_score, refined_t,
        )
        return refined_t, refined_score, refined_loc
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_deep_scan(
    beat: TrailerBeat,
    candidates: Sequence[VibeHit],
    scenes_by_id: dict[int, Scene],
    cfg: AppConfig,
) -> MatchResult | None:
    """
    Phase 2 Deep Scan: iterate over Vibe Check candidates and template-match.

    Args:
        beat: The trailer beat to source.
        candidates: Ranked VibeHit list from Phase 1 (best first).
        scenes_by_id: Lookup dict: scene_id → Scene.
        cfg: Application configuration.

    Returns:
        The best MatchResult above threshold, or None if no match found
        (including when the beat's template frame cannot be prepared).
    """
    # Imported once per call rather than once per candidate.
    from src.cv.frame_extractor import get_video_info

    proxy_w = cfg.video.proxy_width
    proxy_h = cfg.video.proxy_height

    template = _prepare_template(beat, cfg, proxy_w, proxy_h)
    if template is None:
        return None

    # A score at or above this ends the search early.
    early_exit_score = 0.90
    # Probe each source file at most once; candidates usually share a file.
    fps_by_source: dict[Path, float] = {}
    best_result: MatchResult | None = None

    for vibe_hit in candidates:
        scene = scenes_by_id.get(vibe_hit.scene_id)
        if scene is None:
            logger.warning("VibeHit references unknown scene_id=%d", vibe_hit.scene_id)
            continue

        hit = scan_scene(beat, scene, template, cfg)
        if hit is None:
            continue

        in_point_s, match_score, match_loc = hit

        # Frame number: approximate via FPS (refined later if needed)
        fps = fps_by_source.get(scene.source_path)
        if fps is None:
            info = get_video_info(scene.source_path)
            # A reported rate of 0 falls back to 24 fps.
            fps = float(info["fps"]) or 24.0
            fps_by_source[scene.source_path] = fps
        in_point_frame = int(in_point_s * fps)

        candidate_result = MatchResult(
            beat_id=beat.beat_id,
            scene_id=scene.scene_id,
            source_path=scene.source_path,
            in_point_s=in_point_s,
            out_point_s=in_point_s + beat.duration_s,
            in_point_frame=in_point_frame,
            match_score=match_score,
            match_location=match_loc,
            vibe_hit=vibe_hit,
        )

        if best_result is None or match_score > best_result.match_score:
            best_result = candidate_result

        # Early exit: if score is very high, no need to check other candidates
        if match_score >= early_exit_score:
            logger.info(
                "Beat %d: early-exit match (score=%.3f) in scene %d @%.3fs",
                beat.beat_id, match_score, scene.scene_id, in_point_s,
            )
            break

    if best_result is not None:
        logger.info("Beat %d → MATCH scene=%d score=%.3f in=%.3fs",
                    beat.beat_id, best_result.scene_id,
                    best_result.match_score, best_result.in_point_s)
    else:
        logger.warning("Beat %d → NO MATCH found in %d candidates.",
                       beat.beat_id, len(candidates))

    return best_result
|
||||
@@ -0,0 +1,228 @@
|
||||
"""
|
||||
src/cv/fingerprinting.py — Image fingerprinting for the Vibe Check phase
|
||||
|
||||
Responsibilities (Single Responsibility Principle):
|
||||
- Text-Safe Crop: strip top/bottom fractions to hide logos & letterbox
|
||||
- Luma + Saturation histogram extraction (scale-invariant)
|
||||
- Perceptual hash (pHash) via imagehash
|
||||
|
||||
This module is PURELY functional — no file I/O, no video decoding,
|
||||
no search logic. It takes numpy arrays and returns numeric descriptors.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pickle
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
import imagehash
|
||||
from PIL import Image as PilImage
|
||||
_HAS_IMAGEHASH = True
|
||||
except ImportError:
|
||||
_HAS_IMAGEHASH = False
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from src.core.config import VibeCheckConfig
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Text-Safe Crop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def text_safe_crop(
    frame: np.ndarray,
    crop_top: float,
    crop_bottom: float,
) -> np.ndarray:
    """
    Slice away the top and bottom fractions of a frame.

    Used to drop title cards / logos (top) and letterbox / subtitles
    (bottom) before colour analysis, which would otherwise cause false
    positives.

    Args:
        frame: BGR or greyscale frame as (H, W[, C]) ndarray.
        crop_top: Fraction [0, 1) of the height removed from the top.
        crop_bottom: Fraction [0, 1) of the height removed from the bottom.

    Returns:
        A row-sliced view of *frame* (no copy is made).

    Raises:
        ValueError: If a fraction lies outside [0, 1) or the two fractions
            together reach 1.0 or more.
    """
    for label, fraction in (("crop_top", crop_top), ("crop_bottom", crop_bottom)):
        # Written as a negated chain so that NaN is rejected as well.
        if not (0.0 <= fraction < 1.0):
            raise ValueError(f"{label} must be in [0, 1); got {fraction}")
    if crop_top + crop_bottom >= 1.0:
        raise ValueError(
            f"crop_top ({crop_top}) + crop_bottom ({crop_bottom}) must be < 1.0"
        )

    height = frame.shape[0]
    first_row = int(height * crop_top)
    stop_row = int(height * (1.0 - crop_bottom))
    return frame[first_row:stop_row]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Histogram extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def extract_hs_histograms(
    frame_bgr: np.ndarray,
    bins_luma: int | None = None,
    bins_sat: int | None = None,
    *,
    bins_hue: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute normalised luma and saturation histograms from a BGR frame.

    Hue is deliberately ignored because it is highly sensitive to colour
    grading differences between the trailer and the source movie. Luma is
    taken from the greyscale conversion, saturation from HSV channel 1.

    Args:
        frame_bgr: BGR frame (H, W, 3) uint8.
        bins_luma: Number of histogram bins for the luma channel.
        bins_sat: Number of histogram bins for the saturation channel.
        bins_hue: Backwards-compatible alias for bins_luma.

    Returns:
        (luma_hist, sat_hist) — each a 1-D float32 ndarray, L2-normalised.

    Raises:
        ValueError: If bins_luma and bins_hue are both given and disagree.
        TypeError: If the luma bin count or bins_sat is missing.
    """
    if bins_luma is None:
        bins_luma = bins_hue
    elif bins_hue is not None and bins_hue != bins_luma:
        raise ValueError("bins_hue is an alias for bins_luma; pass only one value")
    if bins_luma is None or bins_sat is None:
        raise TypeError("bins_luma/bins_hue and bins_sat are required")

    def _histogram(image: np.ndarray, channel: int, bins: int) -> np.ndarray:
        raw = cv2.calcHist([image], [channel], None, [bins], [0, 256])
        return raw.flatten().astype(np.float32)

    hsv = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV)
    grey = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)

    # Use perceptual grayscale luma rather than HSV Value. Value would make
    # saturated red and blue look identical, weakening the scene-level filter.
    luma_hist = _histogram(grey, 0, bins_luma)
    sat_hist = _histogram(hsv, 1, bins_sat)

    # L2-normalise in place so scene size doesn't affect scores.
    for hist in (luma_hist, sat_hist):
        cv2.normalize(hist, hist, alpha=1.0, norm_type=cv2.NORM_L2)

    return luma_hist, sat_hist
|
||||
|
||||
|
||||
def compare_histograms(
    hist_a: np.ndarray,
    hist_b: np.ndarray,
    method: int,
) -> float:
    """
    Score two histograms against each other with cv2.compareHist.

    Args:
        hist_a, hist_b: 1-D float32 ndarrays of identical shape.
        method: cv2.HISTCMP_* constant (e.g. cv2.HISTCMP_CORREL = 0).

    Returns:
        The raw cv2.compareHist score as a Python float. Its range and
        direction depend on the method:
        CORREL yields [-1, 1] with higher = more similar;
        BHATTACHARYYA yields [0, 1] with lower = more similar.
    """
    raw_score = cv2.compareHist(hist_a, hist_b, method)
    return float(raw_score)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Perceptual Hash
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def compute_phash(frame_bgr: np.ndarray, hash_size: int = 8) -> str:
    """
    Compute a perceptual hash (pHash) of a BGR frame.

    The frame is converted BGR → RGB, wrapped in a PIL image and hashed with
    imagehash.phash. Because the hash is derived from a small, fixed-size
    reduction of the image, it stays comparable when the trailer proxy and
    the source movie differ in resolution. It is NOT rotation-invariant:
    rotated or mirrored frames produce unrelated hashes.

    Args:
        frame_bgr: BGR frame (H, W, 3) uint8.
        hash_size: Side length of the hash grid; the hash has
                   hash_size ** 2 bits, so the default 8 gives 64 bits.

    Returns:
        Hex string representation of the hash (16 hex characters for the
        default 64-bit hash, e.g. "f8e0e0e0...").

    Raises:
        RuntimeError: If imagehash is not installed.
    """
    if not _HAS_IMAGEHASH:
        raise RuntimeError(
            "imagehash is not installed. Run: pip install imagehash"
        )
    # PIL expects RGB channel order, OpenCV delivers BGR.
    rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    pil = PilImage.fromarray(rgb)
    phash = imagehash.phash(pil, hash_size=hash_size)
    return str(phash)
|
||||
|
||||
|
||||
def phash_distance(hash_a: str, hash_b: str) -> int:
    """
    Return the Hamming distance between two pHash hex strings.

    Both strings are parsed back into imagehash objects; subtracting them
    yields the number of differing bits.

    Args:
        hash_a, hash_b: Hex strings as returned by compute_phash().

    Returns:
        Integer Hamming distance; 0 = identical. For the default 64-bit
        hashes the range is [0, 64].

    Raises:
        RuntimeError: If imagehash is not installed.
    """
    if not _HAS_IMAGEHASH:
        raise RuntimeError("imagehash is not installed.")
    left = imagehash.hex_to_hash(hash_a)
    right = imagehash.hex_to_hash(hash_b)
    return int(left - right)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Serialisation helpers (histograms ↔ bytes for caching)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def hist_to_bytes(hist: np.ndarray) -> bytes:
    """Pickle a numpy histogram so it can be stored on a Scene/Beat model.

    The highest pickle protocol available to the running interpreter is
    used; bytes_to_hist() is the inverse.
    """
    protocol = pickle.HIGHEST_PROTOCOL
    payload = pickle.dumps(hist, protocol=protocol)
    return payload
|
||||
|
||||
|
||||
def bytes_to_hist(data: bytes) -> np.ndarray:
    """Rebuild a numpy histogram from bytes produced by hist_to_bytes().

    NOTE(review): this unpickles its input. Only feed it data written by
    this application — unpickling bytes from a tampered cache file can
    execute arbitrary code.
    """
    restored = pickle.loads(data)  # noqa: S301 (trusted internal cache only)
    return restored
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# High-level convenience: fingerprint one frame using config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fingerprint_frame(
    frame_bgr: np.ndarray,
    cfg: "VibeCheckConfig",
) -> tuple[bytes, bytes, str]:
    """
    Fingerprint a single frame: Text-Safe Crop → histograms → pHash.

    The crop is applied first, so both the histograms and the pHash are
    computed from the same cropped region.

    Args:
        frame_bgr: Full BGR frame (H, W, 3) uint8.
        cfg:       VibeCheckConfig supplying crop_top_fraction,
                   crop_bottom_fraction, hist_bins_hue and
                   hist_bins_saturation.

    Returns:
        (luma_hist_bytes, sat_hist_bytes, phash_hex) — the two histograms
        serialised via hist_to_bytes() plus the pHash hex string.
    """
    safe_region = text_safe_crop(
        frame_bgr, cfg.crop_top_fraction, cfg.crop_bottom_fraction
    )
    luma, saturation = extract_hs_histograms(
        safe_region, cfg.hist_bins_hue, cfg.hist_bins_saturation
    )
    digest = compute_phash(safe_region)

    luma_blob = hist_to_bytes(luma)
    sat_blob = hist_to_bytes(saturation)
    return luma_blob, sat_blob, digest
|
||||
@@ -0,0 +1,172 @@
|
||||
"""
|
||||
src/cv/frame_extractor.py — Low-level video frame access
|
||||
|
||||
Responsibility:
|
||||
Provide a thin, testable wrapper around cv2.VideoCapture for:
|
||||
- seeking to an exact timestamp and returning one BGR frame
|
||||
- iterating frames with a configurable step size
|
||||
- extracting the "representative" middle frame of a Scene / TrailerBeat
|
||||
|
||||
No fingerprinting, no matching — only raw frame delivery.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Generator, Iterator
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Context-managed VideoCapture
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@contextmanager
def open_video(path: Path) -> Generator[cv2.VideoCapture, None, None]:
    """
    Context manager that opens a VideoCapture and guarantees release.

    The capture is released on every exit path: normal completion, an
    exception raised inside the ``with`` body, and also when OpenCV fails
    to open the file.

    Args:
        path: Absolute path to the video file.

    Yields:
        The opened cv2.VideoCapture.

    Raises:
        FileNotFoundError: If the file does not exist.
        RuntimeError:      If OpenCV cannot open the file.
    """
    if not path.exists():
        raise FileNotFoundError(f"Video not found: {path}")

    cap = cv2.VideoCapture(str(path))
    try:
        # The open check lives inside the try so a failed open still
        # releases the capture object.
        if not cap.isOpened():
            raise RuntimeError(f"OpenCV could not open video: {path}")
        yield cap
    finally:
        cap.release()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Video metadata
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_video_info(path: Path) -> dict[str, float | int]:
    """
    Probe basic metadata; the file is closed again before returning.

    Args:
        path: Path to the video file (opened via open_video()).

    Returns:
        dict with keys: fps, frame_count, duration_s, width, height.
        duration_s is frame_count / fps, or 0.0 when the reported fps is
        not positive.
    """
    with open_video(path) as cap:
        fps = cap.get(cv2.CAP_PROP_FPS)
        n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Guard against containers that report no usable frame rate.
    if fps > 0:
        duration_s = n_frames / fps
    else:
        duration_s = 0.0

    info: dict[str, float | int] = {}
    info["fps"] = fps
    info["frame_count"] = n_frames
    info["duration_s"] = duration_s
    info["width"] = frame_w
    info["height"] = frame_h
    return info
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Single frame extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def grab_frame_at(cap: cv2.VideoCapture, timestamp_s: float) -> np.ndarray | None:
    """
    Position *cap* at *timestamp_s* and read one BGR frame.

    The position is set through CAP_PROP_POS_MSEC (milliseconds) and the
    next frame is read. NOTE(review): how precisely the seek lands on the
    requested time depends on the OpenCV backend/codec — confirm before
    relying on exact-frame accuracy.

    Args:
        cap:         An already-open VideoCapture.
        timestamp_s: Target time in seconds.

    Returns:
        BGR ndarray (H, W, 3), or None if the read failed (a debug message
        is logged in that case).
    """
    position_ms = timestamp_s * 1000.0
    cap.set(cv2.CAP_PROP_POS_MSEC, position_ms)

    ok, frame = cap.read()
    if ok and frame is not None:
        return frame

    logger.debug("grab_frame_at: failed at %.3fs", timestamp_s)
    return None
|
||||
|
||||
|
||||
def grab_frame_at_path(path: Path, timestamp_s: float) -> np.ndarray | None:
    """
    Open *path*, grab the frame at *timestamp_s*, and close the file again.

    One-shot convenience around open_video() + grab_frame_at(). When several
    frames are needed from the same file, open it once with open_video()
    instead of calling this repeatedly.

    Returns:
        BGR frame, or None if grab_frame_at() could not read one.
    """
    with open_video(path) as cap:
        frame = grab_frame_at(cap, timestamp_s)
    return frame
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Middle-frame extraction (representative frame for fingerprinting)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def grab_midpoint_frame(
    cap: cv2.VideoCapture,
    start_s: float,
    end_s: float,
) -> np.ndarray | None:
    """
    Grab the frame halfway between *start_s* and *end_s*.

    Args:
        cap:     Open VideoCapture for the source video.
        start_s: Interval start in seconds.
        end_s:   Interval end in seconds.

    Returns:
        BGR frame from grab_frame_at(), or None if decoding failed.
    """
    half_span = (end_s - start_s) / 2.0
    return grab_frame_at(cap, start_s + half_span)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stepped-frame iterator (used by Deep Scan coarse pass)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def iter_frames_stepped(
    cap: cv2.VideoCapture,
    start_s: float,
    end_s: float,
    step_s: float,
) -> Iterator[tuple[float, np.ndarray]]:
    """
    Yield (timestamp_s, frame) for every *step_s* increment in [start_s, end_s].

    Frames that fail to decode are silently skipped.

    Sample times are derived from the sample index
    (``start_s + i * step_s``, rounded to microseconds) instead of repeatedly
    adding a rounded step. This keeps rounding error from accumulating over
    long scans and guarantees progress even when *step_s* is smaller than the
    rounding resolution (the old ``round(t + step_s, 6)`` update could stay
    on the same value forever).

    Args:
        cap:     Open VideoCapture.
        start_s: Scan window start in seconds.
        end_s:   Scan window end in seconds (inclusive).
        step_s:  Step between samples in seconds.

    Yields:
        (timestamp_s, bgr_frame)

    Raises:
        ValueError: If step_s is not > 0 (raised when iteration starts,
                    because this is a generator).
    """
    if step_s <= 0:
        raise ValueError(f"step_s must be > 0; got {step_s}")

    index = 0
    t = start_s  # first sample is the unrounded start, as before
    while t <= end_s:
        frame = grab_frame_at(cap, t)
        if frame is not None:
            yield t, frame
        index += 1
        # Recompute from the index: no drift, and always advances.
        t = round(start_s + index * step_s, 6)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,229 @@
|
||||
"""
|
||||
src/cv/scene_indexer.py — Source-movie scene segmentation + fingerprinting
|
||||
|
||||
Responsibility:
|
||||
1. Run PySceneDetect on the source movie → list of raw scene boundaries
|
||||
2. For each scene, extract the midpoint frame and fingerprint it
|
||||
3. Optionally run Whisper dialogue on each scene (injected as dependency)
|
||||
4. Persist results to .cache/ as JSON for fast re-runs
|
||||
|
||||
Returns: list[Scene] with luma_hist, sat_hist, phash populated.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from typing import Callable, Sequence
|
||||
|
||||
import numpy as np
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import Scene
|
||||
from src.cv.fingerprinting import fingerprint_frame
|
||||
from src.cv.frame_extractor import grab_midpoint_frame, open_video
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Type alias for an optional dialogue-injection callback
|
||||
DialogueCallback = Callable[[Scene], Scene]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cache helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cache_path(cfg: AppConfig) -> Path:
    """Return the scene-index cache file path, creating its directory.

    The file is ``scene_index.json`` inside ``cfg.paths.cache_dir``; the
    parent directory is created (including missing ancestors) as a side
    effect, so callers can write to the returned path immediately.
    """
    index_file = cfg.paths.cache_dir / "scene_index.json"
    index_file.parent.mkdir(parents=True, exist_ok=True)
    return index_file
|
||||
|
||||
|
||||
def _scene_to_dict(s: Scene) -> dict:
    """Convert a Scene into a JSON-serialisable dict.

    The path is stored as a string and the histogram byte blobs as hex
    strings (None when a blob is missing or empty), because JSON cannot
    hold raw bytes. _scene_from_dict() is the inverse.
    """
    record = {
        "scene_id": s.scene_id,
        "source_path": str(s.source_path),
        "start_s": s.start_s,
        "end_s": s.end_s,
        "start_frame": s.start_frame,
        "end_frame": s.end_frame,
    }
    # histograms serialised as hex so JSON can hold them
    record["luma_hist"] = s.luma_hist.hex() if s.luma_hist else None
    record["sat_hist"] = s.sat_hist.hex() if s.sat_hist else None
    record["phash"] = s.phash
    return record
|
||||
|
||||
|
||||
def _scene_from_dict(d: dict) -> Scene:
    """Rebuild a Scene from a dict produced by _scene_to_dict().

    Required keys: scene_id, source_path, start_s, end_s, start_frame,
    end_frame (a missing one raises KeyError). The fingerprint keys
    luma_hist, sat_hist and phash are optional; hex-encoded histograms are
    decoded back to bytes, and absent/empty values become None.
    """
    fields = {
        "scene_id": d["scene_id"],
        "source_path": Path(d["source_path"]),
        "start_s": d["start_s"],
        "end_s": d["end_s"],
        "start_frame": d["start_frame"],
        "end_frame": d["end_frame"],
    }
    for key in ("luma_hist", "sat_hist"):
        encoded = d.get(key)
        fields[key] = bytes.fromhex(encoded) if encoded else None
    fields["phash"] = d.get("phash")
    return Scene(**fields)
|
||||
|
||||
|
||||
def _save_cache(scenes: list[Scene], cfg: AppConfig) -> None:
    """Write the scene index to the JSON cache file.

    Each scene is converted with _scene_to_dict() and the list is written as
    indented UTF-8 JSON to the path returned by _cache_path(). The path is
    resolved once and reused for the log line, so the cache directory is not
    re-created a second time just for logging.
    """
    target = _cache_path(cfg)
    data = [_scene_to_dict(s) for s in scenes]
    target.write_text(json.dumps(data, indent=2), encoding="utf-8")
    logger.info("Scene index cached → %s (%d scenes)", target, len(scenes))
|
||||
|
||||
|
||||
def _load_cache(cfg: AppConfig) -> list[Scene] | None:
    """Load the cached scene index, or return None if it cannot be used.

    None is returned (so the caller re-indexes) when:
      * the cache file does not exist,
      * the file cannot be read or parsed into Scene objects, or
      * any cached scene belongs to a different source movie than
        ``cfg.paths.source_movie`` — the cache file name does not encode
        the movie, so without this check a stale index from a previous
        project would be reused silently.
    """
    p = _cache_path(cfg)
    if not p.exists():
        return None

    try:
        data = json.loads(p.read_text(encoding="utf-8"))
        scenes = [_scene_from_dict(d) for d in data]
    except Exception as exc:
        # Deliberate best-effort: any unreadable cache just triggers a re-index.
        logger.warning("Cache corrupt, re-indexing: %s", exc)
        return None

    current_source = Path(cfg.paths.source_movie)
    if any(Path(s.source_path) != current_source for s in scenes):
        logger.warning(
            "Cache %s was built for a different source movie, re-indexing.", p
        )
        return None

    logger.info("Loaded %d scenes from cache (%s)", len(scenes), p)
    return scenes
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# PySceneDetect integration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _detect_scenes_pyscenedetect(cfg: AppConfig) -> list[tuple[float, float, int, int]]:
    """
    Run PySceneDetect ContentDetector on the source movie.

    The detector threshold comes from ``cfg.scene_detection.content_threshold``
    and the minimum scene length is ``min_scene_duration_s`` converted to
    frames with the video's frame rate.

    Returns:
        List of (start_s, end_s, start_frame, end_frame) tuples, one per
        detected scene, in the order PySceneDetect reports them.

    Raises:
        ImportError: If scenedetect is not installed (chained to the
                     original import failure).
    """
    try:
        from scenedetect import open_video as sd_open_video, SceneManager
        from scenedetect.detectors import ContentDetector
    except ImportError as exc:
        # Chain the cause so the underlying import failure stays visible.
        raise ImportError(
            "scenedetect is not installed. Run: pip install scenedetect[opencv]"
        ) from exc

    video = sd_open_video(str(cfg.paths.source_movie))
    manager = SceneManager()
    manager.add_detector(
        ContentDetector(
            threshold=cfg.scene_detection.content_threshold,
            # seconds → frames
            min_scene_len=int(
                cfg.scene_detection.min_scene_duration_s
                * video.frame_rate
            ),
        )
    )

    logger.info("Detecting scenes in %s …", cfg.paths.source_movie.name)
    manager.detect_scenes(video=video, show_progress=True)

    result: list[tuple[float, float, int, int]] = [
        (
            start_tc.get_seconds(),
            end_tc.get_seconds(),
            start_tc.get_frames(),
            end_tc.get_frames(),
        )
        for start_tc, end_tc in manager.get_scene_list()
    ]

    logger.info("PySceneDetect found %d scenes.", len(result))
    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fingerprint enrichment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _fingerprint_scenes(
    raw_scenes: list[tuple[float, float, int, int]],
    cfg: AppConfig,
) -> list[Scene]:
    """
    Turn raw scene boundaries into Scene objects with fingerprints.

    The source movie is opened once. For every boundary the midpoint frame
    is grabbed and fingerprinted; if that frame cannot be decoded, the scene
    is still emitted but without luma_hist / sat_hist / phash. Scene ids are
    the positions in *raw_scenes*.

    Args:
        raw_scenes: (start_s, end_s, start_frame, end_frame) tuples.
        cfg:        Application configuration (source path, vibe-check
                    settings).

    Returns:
        One Scene per entry of *raw_scenes*, in the same order.
    """
    total = len(raw_scenes)
    vibe_cfg = cfg.cv.vibe_check
    indexed: list[Scene] = []

    logger.info("Fingerprinting %d scenes …", total)

    with open_video(cfg.paths.source_movie) as cap:
        for idx, boundary in enumerate(raw_scenes):
            start_s, end_s, start_frame, end_frame = boundary
            midpoint = grab_midpoint_frame(cap, start_s, end_s)

            # Fields shared by fingerprinted and un-fingerprinted scenes.
            base_fields = dict(
                scene_id=idx,
                source_path=cfg.paths.source_movie,
                start_s=start_s,
                end_s=end_s,
                start_frame=start_frame,
                end_frame=end_frame,
            )

            if midpoint is None:
                logger.warning("Scene %d: midpoint frame decode failed, skipping fingerprint.", idx)
                indexed.append(Scene(**base_fields))
                continue

            luma_bytes, sat_bytes, phash_hex = fingerprint_frame(midpoint, vibe_cfg)
            indexed.append(Scene(
                **base_fields,
                luma_hist=luma_bytes,
                sat_hist=sat_bytes,
                phash=phash_hex,
            ))

            done = idx + 1
            if done % 50 == 0:
                logger.info(" … %d / %d scenes fingerprinted", done, total)

    return indexed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_scene_index(
    cfg: AppConfig,
    force_reindex: bool = False,
    dialogue_callback: DialogueCallback | None = None,
) -> list[Scene]:
    """
    Return the scene index for the source movie, from cache when possible.

    Flow:
        1. Unless force_reindex is set, try the JSON cache.
        2. On a cache miss (or forced re-index): detect scenes with
           PySceneDetect, fingerprint them, and write the cache.
        3. If a dialogue_callback is given, apply it to every scene. This
           happens after caching, so callback results are not persisted.

    Args:
        cfg:               Application configuration.
        force_reindex:     Ignore cache and re-run detection + fingerprinting.
        dialogue_callback: Optional function Scene → Scene that adds dialogue.
                           Injected here so this module stays audio-free.

    Returns:
        List of Scene objects with fingerprints populated.
    """
    scenes = None if force_reindex else _load_cache(cfg)

    if scenes is None:
        boundaries = _detect_scenes_pyscenedetect(cfg)
        scenes = _fingerprint_scenes(boundaries, cfg)
        _save_cache(scenes, cfg)

    if dialogue_callback:
        scenes = [dialogue_callback(scene) for scene in scenes]

    return scenes
|
||||
@@ -0,0 +1,190 @@
|
||||
"""
|
||||
src/cv/vibe_check.py — Phase 1: Scene-level histogram / pHash filter
|
||||
|
||||
Responsibility:
|
||||
Given ONE TrailerBeat (with pre-computed fingerprints) and a list of
|
||||
source Scenes (also fingerprinted), return the Top-K candidates ranked
|
||||
by a combined histogram + pHash score.
|
||||
|
||||
This module contains ZERO file I/O and ZERO frame decoding — those live
|
||||
in the pipeline layer. Input = model objects, output = sorted VibeHit list.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import replace
|
||||
from typing import Sequence
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from src.core.models import Scene, TrailerBeat, VibeHit
|
||||
from src.cv.fingerprinting import bytes_to_hist, phash_distance
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scoring
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Weight applied to histogram score vs pHash score in the combined metric.
|
||||
# pHash gets less weight because it's sensitive to text overlays on source.
|
||||
# The two weights sum to 1.0.
_HIST_WEIGHT = 0.70
|
||||
_PHASH_WEIGHT = 0.30
|
||||
_PHASH_MAX_BITS = 64 # maximum possible Hamming distance for a 64-bit hash (compute_phash's default hash_size=8)
|
||||
|
||||
|
||||
def _hist_combined_score(
    beat: TrailerBeat,
    scene: Scene,
    hist_method: int,
) -> float:
    """
    Mean of the luma and saturation histogram comparison scores.

    Each pair of stored histograms is deserialised and compared with
    cv2.compareHist using *hist_method*. For cv2.HISTCMP_BHATTACHARYYA the
    distance is turned into a similarity (1 - distance) so that higher means
    more similar; scores of every other method are used as returned by
    OpenCV.

    Returns:
        0.0 if either side lacks a luma or saturation histogram, otherwise
        the average of the two channel scores ([-1, 1] for CORREL, [0, 1]
        for inverted BHATTACHARYYA).
    """
    if beat.luma_hist is None or scene.luma_hist is None:
        return 0.0
    if beat.sat_hist is None or scene.sat_hist is None:
        return 0.0

    channel_pairs = (
        (beat.luma_hist, scene.luma_hist),
        (beat.sat_hist, scene.sat_hist),
    )
    raw_scores = [
        cv2.compareHist(bytes_to_hist(beat_blob), bytes_to_hist(scene_blob), hist_method)
        for beat_blob, scene_blob in channel_pairs
    ]
    luma_score, sat_score = raw_scores

    # Normalise BHATTACHARYYA to [0, 1] similarity (invert distance)
    if hist_method == cv2.HISTCMP_BHATTACHARYYA:
        luma_score = 1.0 - float(luma_score)
        sat_score = 1.0 - float(sat_score)

    mean_score = (luma_score + sat_score) / 2.0
    return float(mean_score)
|
||||
|
||||
|
||||
def _phash_score(beat: TrailerBeat, scene: Scene) -> float:
    """
    Map the pHash Hamming distance between beat and scene onto [0, 1].

    distance 0               → 1.0 (identical)
    distance _PHASH_MAX_BITS → 0.0 (completely different)

    Returns 0.0 when either side has no pHash.
    """
    both_hashed = beat.phash is not None and scene.phash is not None
    if not both_hashed:
        return 0.0

    hamming = phash_distance(beat.phash, scene.phash)
    return 1.0 - (hamming / _PHASH_MAX_BITS)
|
||||
|
||||
|
||||
def _combined_score(
    beat: TrailerBeat,
    scene: Scene,
    hist_method: int,
) -> float:
    """Blend histogram and pHash similarity into one weighted score.

    The histogram score is weighted by _HIST_WEIGHT and the pHash score by
    _PHASH_WEIGHT; the result is their sum.
    """
    weighted_hist = _HIST_WEIGHT * _hist_combined_score(beat, scene, hist_method)
    weighted_phash = _PHASH_WEIGHT * _phash_score(beat, scene)
    return weighted_hist + weighted_phash
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_vibe_check(
    beat: TrailerBeat,
    scenes: Sequence[Scene],
    top_k: int,
    hist_method: int,
    phash_max_distance: int,
) -> list[VibeHit]:
    """
    Phase 1: Score all source scenes against one trailer beat and return
    the top-K candidates for Deep Scan.

    The pHash Hamming distance is computed once per scene and reused for the
    hard filter, the pHash similarity score and the VibeHit record. A pHash
    that is missing or empty on either side is treated uniformly: the hard
    filter is skipped, the pHash score is 0.0 and the recorded distance is
    _PHASH_MAX_BITS.

    Args:
        beat:               The trailer beat to match (must have fingerprints).
        scenes:             All detected scenes from the source movie.
        top_k:              Maximum number of candidates to return. Values
                            below 0 are treated as 0.
        hist_method:        cv2.HISTCMP_* constant (e.g. 0 = CORREL).
        phash_max_distance: Scenes with pHash Hamming distance > this value
                            are excluded before ranking (hard filter).

    Returns:
        List of VibeHit, sorted by combined_score descending, length ≤ top_k.
        Empty list if beat has no fingerprints or no scenes pass the filter.
    """
    if beat.luma_hist is None and beat.phash is None:
        logger.warning(
            "Beat %d has no fingerprints — skipping Vibe Check.", beat.beat_id
        )
        return []

    candidates: list[VibeHit] = []

    for scene in scenes:
        # One Hamming-distance computation per scene; None = not comparable.
        dist = (
            phash_distance(beat.phash, scene.phash)
            if beat.phash and scene.phash
            else None
        )

        # Hard pHash filter: skip scenes that are too visually distant
        if dist is not None and dist > phash_max_distance:
            continue  # fast rejection — avoids full histogram compare

        hist = _hist_combined_score(beat, scene, hist_method)
        phash = 0.0 if dist is None else 1.0 - (dist / _PHASH_MAX_BITS)
        combined = _HIST_WEIGHT * hist + _PHASH_WEIGHT * phash

        candidates.append(VibeHit(
            beat_id=beat.beat_id,
            scene_id=scene.scene_id,
            hist_score=round(hist, 4),
            phash_distance=_PHASH_MAX_BITS if dist is None else dist,
            combined_score=round(combined, 4),
        ))

    # Sort by combined score, descending; return top-K
    candidates.sort(key=lambda h: h.combined_score, reverse=True)
    # Clamp so a negative top_k cannot turn into an "all but the last N" slice.
    top = candidates[:max(top_k, 0)]

    logger.info(
        "Vibe Check beat=%d: %d scenes scored, %d candidates forwarded to Deep Scan. "
        "Best score: %.3f (scene %s)",
        beat.beat_id,
        len(candidates),
        len(top),
        top[0].combined_score if top else 0.0,
        top[0].scene_id if top else "—",
    )

    return top
|
||||
|
||||
|
||||
def batch_vibe_check(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    top_k: int,
    hist_method: int,
    phash_max_distance: int,
) -> dict[int, list[VibeHit]]:
    """
    Run run_vibe_check() for each beat and collect the results by beat_id.

    Convenience wrapper for the pipeline layer. If two beats share a
    beat_id, the later one's hits replace the earlier one's.

    Returns:
        Mapping beat_id → [VibeHit] in the order the beats were given.
    """
    hits_by_beat: dict[int, list[VibeHit]] = {}
    for beat in beats:
        key = beat.beat_id
        hits_by_beat[key] = run_vibe_check(
            beat, scenes, top_k, hist_method, phash_max_distance
        )
    return hits_by_beat
|
||||
@@ -0,0 +1 @@
|
||||
# src.export package — FCPXML / EDL export
|
||||
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
src/export/edl_writer.py — EditTimeline → CMX 3600 EDL
|
||||
|
||||
Generates a standard CMX 3600 Edit Decision List compatible with
|
||||
Avid, DaVinci Resolve, Premiere Pro, and most NLEs.
|
||||
|
||||
CMX 3600 format reference:
|
||||
https://en.wikipedia.org/wiki/Edit_decision_list#CMX_3600
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import EditClip, EditTimeline
|
||||
from src.export.timecode import seconds_to_smpte
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# EDL line builders
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _edl_header(title: str) -> str:
    """Return the two-line EDL header: TITLE plus a non-drop-frame FCM line.

    The result ends with a newline.
    """
    header_lines = (f"TITLE: {title}", "FCM: NON-DROP FRAME")
    return "\n".join(header_lines) + "\n"
|
||||
|
||||
|
||||
def _edl_event(
    event_num: int,
    clip: EditClip,
    fps: float,
) -> str:
    """
    Render one CMX 3600 event block for a single clip.

    Layout:
        NNN AX V C <SRC_IN> <SRC_OUT> <REC_IN> <REC_OUT>
        * FROM CLIP NAME: ...
        * BEAT ... | <beat type> | score=...

    Source and record ranges both span clip.source_timeline_duration_s,
    starting at the match in-point and the clip's timeline start
    respectively. The returned text ends with a newline.
    """
    src_start = clip.match.in_point_s
    span = clip.source_timeline_duration_s
    rec_start = clip.timeline_start_s

    src_in, src_out, rec_in, rec_out = (
        seconds_to_smpte(moment, fps)
        for moment in (src_start, src_start + span, rec_start, rec_start + span)
    )

    rows = [
        f"{event_num:03d} AX V C {src_in} {src_out} {rec_in} {rec_out}",
        f"* FROM CLIP NAME: {clip.match.source_path.name}",
        (
            f"* BEAT {clip.beat.beat_id:03d} | {clip.beat.beat_type.name} | "
            f"score={clip.match.match_score:.3f}"
        ),
        "",
    ]
    return "\n".join(rows)
|
||||
|
||||
|
||||
def _edl_black_tail_event(event_num: int, clip: EditClip, fps: float) -> str:
    """Render a black (BL) event covering the trailer-only tail of a clip.

    The record range runs from the end of the matched source part
    (timeline start + source_timeline_duration_s) to clip.timeline_end_s;
    the source timecodes are fixed at 00:00:00:00. The returned text ends
    with a newline.
    """
    tail_start_s = clip.timeline_start_s + clip.source_timeline_duration_s
    rec_in = seconds_to_smpte(tail_start_s, fps)
    rec_out = seconds_to_smpte(clip.timeline_end_s, fps)

    rows = [
        f"{event_num:03d} BL V C 00:00:00:00 00:00:00:00 {rec_in} {rec_out}",
        "* FROM CLIP NAME: BLACK",
        (
            f"* BEAT {clip.beat.beat_id:03d} TRAILER-ONLY TAIL | "
            "add fade/dissolve to black"
        ),
        "",
    ]
    return "\n".join(rows)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def write_edl(
    timeline: EditTimeline,
    cfg: AppConfig,
    output_path: Path | None = None,
) -> Path:
    """
    Write the EditTimeline as a CMX 3600 EDL file.

    Clips are emitted in clip_index order. A clip with trailer_tail_s > 0 is
    followed by an extra black (BL) event for the trailer-only tail, so the
    number of events written can exceed the number of clips; the log line
    reports the events actually written.

    Args:
        timeline:    EditTimeline from build_timeline().
        cfg:         Application configuration.
        output_path: Override destination. Defaults to
                     <output_dir>/<timeline.title>.edl.

    Returns:
        Path to the written .edl file.
    """
    if output_path is None:
        output_path = cfg.paths.output_dir / f"{timeline.title}.edl"

    output_path.parent.mkdir(parents=True, exist_ok=True)

    fps = timeline.frame_rate
    lines = [_edl_header(timeline.title), "\n"]

    event_num = 1  # next event number to assign
    for clip in sorted(timeline.clips, key=lambda c: c.clip_index):
        lines.append(_edl_event(event_num, clip, fps))
        event_num += 1
        if clip.trailer_tail_s > 0:
            lines.append("\n")
            lines.append(_edl_black_tail_event(event_num, clip, fps))
            event_num += 1
        lines.append("\n")

    edl_text = "\n".join(lines)
    output_path.write_text(edl_text, encoding="utf-8")

    # event_num is one past the last number used.
    events_written = event_num - 1
    logger.info("EDL written → %s (%d events)", output_path, events_written)
    return output_path
|
||||
@@ -0,0 +1,222 @@
|
||||
"""
|
||||
src/export/fcpxml_writer.py — EditTimeline → Final Cut Pro XML (FCPXML 1.10)
|
||||
|
||||
Generates a standards-compliant FCPXML file that can be imported directly
|
||||
into Final Cut Pro X, DaVinci Resolve, or Premiere Pro (via FCPXML plugin).
|
||||
|
||||
Spec reference: https://developer.apple.com/documentation/professional_video_applications/fcpxml_reference
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
from xml.etree import ElementTree as ET
|
||||
from xml.etree.ElementTree import Element, SubElement
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import EditClip, EditTimeline
|
||||
from src.export.timecode import (
|
||||
fcpxml_format_name,
|
||||
fcpxml_frame_duration,
|
||||
seconds_to_fcpxml,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Asset registry — one <asset> per unique source file
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _AssetRegistry:
    """Hands out one FCPXML resource id ("r2", "r3", …) per source path.

    Ids are assigned in first-seen order; "r1" is never issued because it
    is reserved for the <format> resource.
    """

    def __init__(self) -> None:
        self._assets: dict[Path, str] = {}  # path → asset id
        self._counter = 2  # r1 reserved for format

    def get_or_create(self, path: Path) -> str:
        """Return the id registered for *path*, allocating one if needed."""
        known = self._assets.get(path)
        if known is not None:
            return known

        new_id = f"r{self._counter}"
        self._counter += 1
        self._assets[path] = new_id
        return new_id

    @property
    def items(self) -> dict[Path, str]:
        """A copy of the path → id mapping; mutating it does not affect the registry."""
        return self._assets.copy()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Builder
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _path_to_url(path: Path) -> str:
    """Build the file:// URL that FCPXML expects for an absolute path.

    The path is rendered with forward slashes and percent-encoded, keeping
    "/", ":" and "@" literal. Paths whose POSIX form does not start with
    "/" (e.g. a Windows drive path) get a leading "/" first.
    """
    location = path.as_posix()
    has_root_slash = location.startswith("/")
    if not has_root_slash:
        # Windows drive letter: C:/foo → /C:/foo
        location = f"/{location}"
    encoded = quote(location, safe="/:@")
    return f"file://{encoded}"
|
||||
|
||||
|
||||
def build_fcpxml(
    timeline: EditTimeline,
    cfg: AppConfig,
    source_duration_s: float = 7200.0,  # 2-hour fallback if not probed
) -> ET.ElementTree:
    """
    Build a complete FCPXML ElementTree from an EditTimeline.

    Document layout: one <format> plus one <asset> per unique source path
    under <resources>, followed by library → event → project → sequence →
    spine. Every EditClip becomes a <clip> in the spine; a clip with a
    positive ``trailer_tail_s`` is followed by a <gap> carrying a marker.

    Args:
        timeline: Ordered sequence of EditClips.
        cfg: Application configuration.
        source_duration_s: Fallback duration (seconds) written to an <asset>
            whose real duration cannot be probed via ``get_video_info``.

    Returns:
        xml.etree.ElementTree.ElementTree — call .write() to serialise.
    """
    fps = timeline.frame_rate

    # ---- root ---------------------------------------------------------------
    root = Element("fcpxml", version=cfg.export.fcpxml_version)
    root.set("xmlns", "http://www.apple.com/dt/FCPXML/1_10")

    # ---- resources ----------------------------------------------------------
    resources = SubElement(root, "resources")

    format_id = "r1"
    SubElement(
        resources,
        "format",
        id=format_id,
        name=fcpxml_format_name(fps),
        frameDuration=fcpxml_frame_duration(fps),
        width="1920",
        height="1080",
        colorSpace="1-1-1 (Rec. 709)",
    )

    registry = _AssetRegistry()

    # Register every source path up front so all <asset> elements are emitted
    # inside <resources>, before the <library> block is created.
    for clip in timeline.clips:
        registry.get_or_create(clip.match.source_path)

    for path, rid in registry.items.items():
        # Probe the real duration when possible; any failure (import, decode,
        # missing key) deliberately degrades to the caller-supplied fallback.
        try:
            from src.cv.frame_extractor import get_video_info

            asset_duration_s = float(get_video_info(path)["duration_s"])
        except Exception as exc:
            logger.warning(
                "Could not probe duration of %s (%s); using fallback of %.1f s.",
                path,
                exc,
                source_duration_s,
            )
            asset_duration_s = source_duration_s

        SubElement(
            resources,
            "asset",
            id=rid,
            name=path.stem,
            src=_path_to_url(path),
            start="0s",
            duration=seconds_to_fcpxml(asset_duration_s, fps),
            hasVideo="1",
            hasAudio="1",
            format=format_id,
        )

    # ---- library / event / project ------------------------------------------
    library = SubElement(root, "library")
    event = SubElement(library, "event", name=timeline.title)
    project = SubElement(event, "project", name=timeline.title)
    sequence = SubElement(
        project,
        "sequence",
        duration=seconds_to_fcpxml(timeline.total_duration_s, fps),
        format=format_id,
        tcStart="0s",
        tcFormat="NDF",
        audioLayout="stereo",
        audioRate="48k",
    )
    spine = SubElement(sequence, "spine")

    # ---- clips --------------------------------------------------------------
    for clip in sorted(timeline.clips, key=lambda c: c.clip_index):
        asset_id = registry.get_or_create(clip.match.source_path)

        # Kept in its own name so the ``source_duration_s`` fallback parameter
        # is never overwritten by per-clip values.
        clip_duration_s = clip.source_timeline_duration_s
        clip_elem = SubElement(
            spine,
            "clip",
            name=f"Beat_{clip.beat.beat_id:03d}_{clip.beat.beat_type.name}",
            ref=asset_id,
            # offset = position on the timeline
            offset=seconds_to_fcpxml(clip.timeline_start_s, fps),
            # duration = matched source part only; trailer-only tails become gaps.
            duration=seconds_to_fcpxml(clip_duration_s, fps),
            # start = in-point inside the source asset
            start=seconds_to_fcpxml(clip.match.in_point_s, fps),
        )

        # Inline audio role
        SubElement(
            clip_elem,
            "audio",
            role="dialogue",
            srcCh="1, 2",
            outCh="L, R",
        )

        if clip.trailer_tail_s > 0:
            # The gap starts where the matched source material ends.
            gap = SubElement(
                spine,
                "gap",
                name=f"Beat_{clip.beat.beat_id:03d}_TRAILER_TAIL_BLACK_FADE",
                offset=seconds_to_fcpxml(clip.timeline_start_s + clip_duration_s, fps),
                duration=seconds_to_fcpxml(clip.trailer_tail_s, fps),
                start="0s",
            )
            SubElement(
                gap,
                "marker",
                start="0s",
                value="Trailer-only tail: add fade/dissolve to black here",
                completed="0",
            )

    return ET.ElementTree(root)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Writer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def write_fcpxml(
    timeline: EditTimeline,
    cfg: AppConfig,
    output_path: Path | None = None,
) -> Path:
    """
    Serialise the EditTimeline to a .fcpxml file.

    Args:
        timeline: EditTimeline to export.
        cfg: Application configuration.
        output_path: Override destination. When omitted the file is written to
            ``cfg.paths.output_dir / "<timeline.title>.fcpxml"``.

    Returns:
        Path to the written .fcpxml file.
    """
    destination = (
        output_path
        if output_path is not None
        else cfg.paths.output_dir / f"{timeline.title}.fcpxml"
    )
    destination.parent.mkdir(parents=True, exist_ok=True)

    document_root = build_fcpxml(timeline, cfg).getroot()

    # ElementTree cannot emit a DOCTYPE, so the prolog is prepended by hand and
    # the tree itself is serialised without its own XML declaration.
    xml_text = ET.tostring(document_root, encoding="unicode", xml_declaration=False)
    prolog = '<?xml version="1.0" encoding="UTF-8"?>\n' + '<!DOCTYPE fcpxml>\n'
    destination.write_text(prolog + xml_text, encoding="utf-8")

    logger.info("FCPXML written → %s (%d clips)", destination, timeline.clip_count)
    return destination
|
||||
@@ -0,0 +1,146 @@
|
||||
"""
|
||||
src/export/timecode.py — Timecode / rational-time conversion helpers
|
||||
|
||||
FCPXML uses rational fractions ("1001/24000s") for all time values.
|
||||
EDL uses SMPTE timecode strings ("HH:MM:SS:FF").
|
||||
|
||||
All conversion functions are pure — no I/O, no state.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from fractions import Fraction
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Common frame-rate denominators
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Nominal frame rate → exact (numerator, denominator) of that rate.
_FPS_RATIONAL: dict[float, tuple[int, int]] = {
    23.976: (24000, 1001),
    24.0: (24, 1),
    25.0: (25, 1),
    29.97: (30000, 1001),
    30.0: (30, 1),
    50.0: (50, 1),
    59.94: (60000, 1001),
    60.0: (60, 1),
}

_TOLERANCE = 0.01  # fps match tolerance


def _fps_to_rational(fps: float) -> tuple[int, int]:
    """
    Return (numerator, denominator) such that fps ≈ numerator / denominator.

    The first table entry within ``_TOLERANCE`` of *fps* wins. Any other rate
    is approximated by the closest fraction whose denominator is at most 1001.
    """
    table_hit = next(
        (
            ratio
            for nominal, ratio in _FPS_RATIONAL.items()
            if abs(fps - nominal) < _TOLERANCE
        ),
        None,
    )
    if table_hit is not None:
        return table_hit

    # Not a known rate: fall back to a bounded rational approximation.
    approximation = Fraction(fps).limit_denominator(1001)
    return approximation.numerator, approximation.denominator
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Seconds → FCPXML rational string
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def seconds_to_fcpxml(seconds: float, fps: float) -> str:
    """
    Convert *seconds* to an FCPXML rational time string.

    The time is first snapped to the nearest whole frame at *fps* and then
    written as a reduced fraction of seconds, so repeated conversions cannot
    accumulate floating-point drift.

    Example: 10.0s @23.976fps → 240 frames → "1001/100s".

    Args:
        seconds: Time in seconds (float).
        fps: Project frame rate.

    Returns:
        FCPXML time string, e.g. "1001/100s". Any value that snaps to zero
        frames is returned as "0s".
    """
    if seconds == 0.0:
        return "0s"

    num, den = _fps_to_rational(fps)  # frames per second = num/den
    # seconds × (num/den) = frames (float); round to nearest frame
    frames = round(seconds * num / den)
    if frames == 0:
        # Sub-half-frame inputs would otherwise serialise as "0/1s", which is
        # inconsistent with the exact-zero case above.
        return "0s"

    # frames ÷ (num/den) = frames × den/num → rational seconds
    total_num = frames * den
    total_den = num
    # Reduce fraction
    g = math.gcd(total_num, total_den)
    return f"{total_num // g}/{total_den // g}s"
|
||||
|
||||
|
||||
def seconds_to_frame_count(seconds: float, fps: float) -> int:
    """
    Return the whole-frame count closest to *seconds* at *fps*.

    Uses the built-in ``round``, so exact .5 values go to the even neighbour.
    """
    exact_frames = seconds * fps
    return round(exact_frames)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Seconds → SMPTE timecode (for EDL)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def seconds_to_smpte(seconds: float, fps: float, drop_frame: bool = False) -> str:
    """
    Convert *seconds* to a SMPTE timecode string "HH:MM:SS:FF".

    Frames are counted at the real *fps* and then laid out on the rounded
    (nominal) rate, i.e. non-drop-frame timecode. Drop-frame (;) output is
    not implemented.

    Args:
        seconds: Time in float seconds.
        fps: Frame rate (23.976, 24, 25, etc.).
        drop_frame: Ignored; placeholder for future DF support.

    Returns:
        "HH:MM:SS:FF" string.
    """
    frame_total = seconds_to_frame_count(seconds, fps)
    timebase = round(fps)  # e.g. 23.976 → 24

    # Peel off frames, seconds and minutes from the right; hours keep the rest.
    second_total, ff = divmod(frame_total, timebase)
    minute_total, ss = divmod(second_total, 60)
    hh, mm = divmod(minute_total, 60)

    return f"{hh:02d}:{mm:02d}:{ss:02d}:{ff:02d}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FCPXML format ID helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fcpxml_format_name(fps: float, width: int = 1920, height: int = 1080) -> str:
    """
    Return an FCPXML format name string for a given frame rate and resolution.

    Known frame rates are matched with a 0.01 fps tolerance, so probed values
    such as 24000/1001 (23.976023…) map to the same tag as the nominal 23.976.
    Any other rate falls back to the rate × 100, rounded to the nearest
    integer.

    Example: fps=23.976, 1080p → "FFVideoFormat1080p2398"

    Args:
        fps: Project frame rate.
        width: Accepted for interface compatibility; not part of the name.
        height: Frame height in pixels, used as the "<height>p" component.

    Returns:
        Name of the form "FFVideoFormat<height>p<fps tag>".
    """
    known_tags = (
        (23.976, "2398"),
        (24.0, "24"),
        (25.0, "25"),
        (29.97, "2997"),
        (30.0, "30"),
        (50.0, "50"),
        (59.94, "5994"),
        (60.0, "60"),
    )
    fps_tag = next(
        (tag for nominal, tag in known_tags if abs(fps - nominal) < 0.01),
        None,
    )
    if fps_tag is None:
        # round() rather than int(): 14.99 * 100 is 1498.999… in binary floats.
        fps_tag = str(round(fps * 100))
    return f"FFVideoFormat{height}p{fps_tag}"
|
||||
|
||||
|
||||
def fcpxml_frame_duration(fps: float) -> str:
    """
    Return the FCPXML frameDuration attribute for a given fps.

    One frame lasts 1/fps seconds. With fps = num/den that is den/num seconds,
    written here as a reduced fraction.

    Example: 23.976fps → fps = 24000/1001 → frame duration "1001/24000s"
    """
    fps_num, fps_den = _fps_to_rational(fps)  # fps = fps_num / fps_den

    # Invert the rate, then cancel any common factor.
    common = math.gcd(fps_den, fps_num)
    seconds_num = fps_den // common
    seconds_den = fps_num // common
    return f"{seconds_num}/{seconds_den}s"
|
||||
@@ -0,0 +1 @@
|
||||
# src.llm package — Thematic segmentation / dramaturgy (NO vision matching)
|
||||
@@ -0,0 +1,202 @@
|
||||
"""
|
||||
src/llm/dramaturg.py — LLM-based thematic beat classification (OpenRouter)
|
||||
|
||||
Responsibility:
|
||||
- Receive a list of TrailerBeat objects (with dialogue lines attached)
|
||||
- Send a single structured prompt to the LLM
|
||||
- Parse the JSON response to assign BeatType to each beat
|
||||
|
||||
IMPORTANT: This module does ZERO visual analysis.
|
||||
It classifies narrative dramaturgy from dialogue text only.
|
||||
Visual matching is handled exclusively by the CV engine.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import replace
|
||||
from typing import Sequence
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import BeatType, TrailerBeat
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prompt builder
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_SYSTEM_PROMPT = """You are a film trailer editor and narrative analyst.
|
||||
Your task is to classify each beat of a trailer into one of these dramatic roles:
|
||||
HOOK - Opening attention grabber (first impression, shocking image, logo)
|
||||
SETUP - World/character introduction
|
||||
CONFLICT - Inciting incident, rising tension, threat revealed
|
||||
CLIMAX - Peak action/emotion, highest stakes
|
||||
RESOLUTION - Cool-down, tagline, final title card
|
||||
|
||||
You will receive a JSON array of beats with their index and dialogue text.
|
||||
Respond ONLY with a valid JSON array, one object per beat, with keys:
|
||||
"beat_id" (int) and "beat_type" (one of the strings above).
|
||||
Do NOT include any explanation or markdown fences."""
|
||||
|
||||
_USER_TEMPLATE = """Classify the following {n} trailer beats:
|
||||
|
||||
{beats_json}"""
|
||||
|
||||
|
||||
def _build_beats_payload(beats: Sequence[TrailerBeat]) -> str:
    """
    Serialise *beats* to the JSON array that is embedded in the user prompt.

    Each beat contributes its ``beat_id``, its duration rounded to two
    decimals, and its dialogue lines joined with " / " (or "(no dialogue)"
    when the beat has none).
    """
    entries = [
        {
            "beat_id": beat.beat_id,
            "duration": round(beat.duration_s, 2),
            "dialogue": " / ".join(line.text for line in beat.dialogue) or "(no dialogue)",
        }
        for beat in beats
    ]
    return json.dumps(entries, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OpenRouter / OpenAI-compatible HTTP client
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _call_llm(prompt_user: str, cfg: AppConfig) -> str:
    """
    Send a chat completion request to the configured LLM provider.

    Supports: openrouter, openai, ollama (all use the OpenAI-compatible API).

    Args:
        prompt_user: Content of the user message.
        cfg: Application configuration; only ``cfg.llm`` is read.

    Returns:
        The raw text content of the first assistant message.

    Raises:
        RuntimeError: On a missing API key, an HTTP error status, a failed
            connection or timeout, a non-JSON body, or a response that does
            not contain a text message.
    """
    import urllib.error
    import urllib.request

    llm = cfg.llm

    if llm.provider in ("openrouter", "openai") and not llm.api_key:
        raise RuntimeError(
            f"LLM provider is '{llm.provider}' but no API key found. "
            "Set OPENROUTER_API_KEY (or OPENAI_API_KEY) in your .env file."
        )

    headers = {"Content-Type": "application/json"}
    if llm.api_key:
        # Key-less providers must not receive a bogus "Bearer None" header.
        headers["Authorization"] = f"Bearer {llm.api_key}"
    if llm.provider == "openrouter":
        headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
        headers["X-Title"] = "AI Trailer Generator v2"

    body = json.dumps({
        "model": llm.model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": prompt_user},
        ],
        "temperature": llm.temperature,
        "max_tokens": llm.max_tokens,
    }).encode("utf-8")

    url = f"{llm.base_url.rstrip('/')}/chat/completions"
    req = urllib.request.Request(url, data=body, headers=headers, method="POST")

    try:
        with urllib.request.urlopen(req, timeout=llm.timeout_seconds) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        body_text = exc.read().decode(errors="replace")
        raise RuntimeError(
            f"LLM HTTP {exc.code} from {url}:\n{body_text}"
        ) from exc
    except OSError as exc:
        # URLError, refused connections and socket timeouts are all OSErrors.
        raise RuntimeError(f"LLM request to {url} failed: {exc}") from exc
    except ValueError as exc:
        # Covers both undecodable bytes and invalid JSON.
        raise RuntimeError(f"LLM response from {url} is not valid JSON: {exc}") from exc

    try:
        content = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"LLM response from {url} has no choices[0].message.content."
        ) from exc
    if not isinstance(content, str):
        raise RuntimeError(f"LLM response from {url} contains no text content.")
    return content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Response parser
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_BEAT_TYPE_MAP: dict[str, BeatType] = {bt.name: bt for bt in BeatType}
|
||||
|
||||
|
||||
def _parse_response(raw: str, beats: Sequence[TrailerBeat]) -> dict[int, BeatType]:
    """
    Parse the LLM JSON array response into a beat_id → BeatType mapping.

    Every beat in *beats* starts as BeatType.UNKNOWN. If the response is not a
    JSON array, that default mapping is returned unchanged. Malformed entries
    (not an object, missing or non-numeric "beat_id") are skipped one by one
    so a single bad entry cannot discard the valid ones that follow it.
    Unrecognised "beat_type" names map to BeatType.UNKNOWN.

    Args:
        raw: Text returned by the model, optionally wrapped in ``` fences.
        beats: Beats that were sent for classification.

    Returns:
        Mapping of beat_id to the parsed BeatType.
    """
    # Strip accidental markdown fences
    clean = raw.strip()
    if clean.startswith("```"):
        clean = "\n".join(clean.split("\n")[1:])
    if clean.endswith("```"):
        clean = clean[: clean.rfind("```")]
    clean = clean.strip()

    result: dict[int, BeatType] = {b.beat_id: BeatType.UNKNOWN for b in beats}

    try:
        parsed = json.loads(clean)
        if not isinstance(parsed, list):
            raise ValueError("Expected JSON array at top level.")
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        logger.warning("LLM response parse error (%s) — all beats → UNKNOWN.\nRaw: %s", exc, raw[:300])
        return result

    for item in parsed:
        try:
            bid = int(item["beat_id"])
            name = str(item.get("beat_type", "UNKNOWN")).upper()
        except (AttributeError, KeyError, TypeError, ValueError) as exc:
            logger.warning("Skipping malformed LLM beat entry %r (%s).", item, exc)
            continue
        result[bid] = _BEAT_TYPE_MAP.get(name, BeatType.UNKNOWN)

    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def classify_beats(
    beats: Sequence[TrailerBeat],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """
    Use the LLM to assign a BeatType to each TrailerBeat.

    Args:
        beats: TrailerBeat list (dialogue should be populated for best results).
        cfg: Application configuration (llm section + api key).

    Returns:
        New list of TrailerBeat copies with beat_type set from the LLM answer.
        If the LLM call raises, the error is logged and the input beats are
        returned unchanged (as a new list); no exception is raised for it.
    """
    if not beats:
        return list(beats)

    beat_total = len(beats)
    logger.info(
        "Classifying %d beats via %s / %s …",
        beat_total, cfg.llm.provider, cfg.llm.model,
    )

    beats_json = _build_beats_payload(beats)
    prompt = _USER_TEMPLATE.format(n=beat_total, beats_json=beats_json)

    try:
        raw_response = _call_llm(prompt, cfg)
    except Exception as exc:
        logger.error("LLM classification failed: %s — keeping BeatType.UNKNOWN.", exc)
        return list(beats)

    type_map = _parse_response(raw_response, beats)

    enriched: list[TrailerBeat] = []
    classified = 0
    for beat in beats:
        updated = replace(beat, beat_type=type_map.get(beat.beat_id, BeatType.UNKNOWN))
        enriched.append(updated)
        if updated.beat_type != BeatType.UNKNOWN:
            classified += 1

    logger.info("Beat classification done: %d / %d classified.", classified, len(beats))
    return enriched
|
||||
@@ -0,0 +1,316 @@
|
||||
"""
|
||||
Cached vision descriptions for ambiguous trailer/source matching.
|
||||
|
||||
This module is deliberately conservative: it never writes a final match and it
|
||||
does not replace CV. It describes a small number of 3-frame beat/scene samples,
|
||||
caches those descriptions, and returns extra source in-point seeds for the CV
|
||||
scanner to verify.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import Sequence
|
||||
|
||||
import cv2
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import Scene, TrailerBeat
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_CACHE_VERSION = 1
|
||||
_STOPWORDS = {
|
||||
"the", "and", "with", "from", "that", "this", "there", "their", "into",
|
||||
"scene", "frame", "image", "shot", "video", "visible", "looks", "appears",
|
||||
"eine", "einer", "einem", "einen", "und", "oder", "mit", "der", "die", "das",
|
||||
}
|
||||
|
||||
_SYSTEM_PROMPT = """You describe film shots for automatic matching.
|
||||
Return only compact JSON with these keys:
|
||||
subject, setting, composition, action_phase, distinctive_objects, lighting_color, negatives.
|
||||
Focus on stable visual facts and spatial layout. Ignore timecode overlays, subtitles, logos, compression, aspect ratio, and color grading differences."""
|
||||
|
||||
|
||||
def _cache_path(cfg: AppConfig) -> Path:
    """Return the vision-description cache file inside ``cfg.paths.cache_dir``."""
    cache_dir = cfg.paths.cache_dir
    return cache_dir / "vision_descriptions.json"
|
||||
|
||||
|
||||
def _load_cache(cfg: AppConfig) -> dict:
    """
    Load the vision-description cache, or return a fresh empty one.

    An empty cache (``{"version": _CACHE_VERSION, "items": {}}``) is returned
    when the file is missing, cannot be read or decoded, is not a JSON object,
    carries a different version, or has a non-dict "items" entry.

    Args:
        cfg: Application configuration; locates the cache file.

    Returns:
        Cache dict with "version" and "items" keys.
    """
    empty: dict = {"version": _CACHE_VERSION, "items": {}}

    path = _cache_path(cfg)
    if not path.exists():
        return empty

    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers invalid JSON as well as undecodable bytes.
        logger.warning("Vision cache is unreadable; rebuilding: %s", path)
        return empty

    if (
        not isinstance(data, dict)  # e.g. a top-level list or string
        or data.get("version") != _CACHE_VERSION
        or not isinstance(data.get("items"), dict)
    ):
        return empty
    return data
|
||||
|
||||
|
||||
def _save_cache(cfg: AppConfig, cache: dict) -> None:
    """Write *cache* as indented UTF-8 JSON, creating the cache directory if needed."""
    target = _cache_path(cfg)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(cache, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def _sample_times(start_s: float, end_s: float) -> list[float]:
    """
    Return three sample timestamps (near start, middle, near end) of a span.

    The span length is floored at 0.04. The first sample sits 12 % into the
    span, but never closer than 0.04 to its end; the last sample sits 12 % of
    the span before its end, with that margin capped at 0.20.
    """
    duration_s = max(0.04, end_s - start_s)
    margin = duration_s * 0.12

    head_offset = min(margin, max(0.0, duration_s - 0.04))
    middle_offset = duration_s * 0.50
    tail_offset = max(0.0, duration_s - min(margin, 0.20))

    return [start_s + offset for offset in (head_offset, middle_offset, tail_offset)]
|
||||
|
||||
|
||||
def _frame_data_url(video_path: Path, t_s: float) -> str | None:
    """
    Grab the frame at *t_s* seconds and return it as a JPEG ``data:`` URL.

    Frames wider than 640 px are downscaled to 640 px width before encoding
    at JPEG quality 72. Returns None when the video cannot be opened, the
    frame cannot be read, or JPEG encoding fails. The capture handle is
    always released.
    """
    capture = cv2.VideoCapture(str(video_path))
    try:
        if not capture.isOpened():
            return None

        position_ms = max(0.0, t_s) * 1000.0
        capture.set(cv2.CAP_PROP_POS_MSEC, position_ms)
        grabbed, image = capture.read()
        if not grabbed or image is None:
            return None

        height, width = image.shape[:2]
        if width > 640:
            # Keep the aspect ratio while limiting the request payload size.
            scaled_height = int(height * (640 / width))
            image = cv2.resize(image, (640, scaled_height), interpolation=cv2.INTER_AREA)

        encoded_ok, jpeg = cv2.imencode(".jpg", image, [int(cv2.IMWRITE_JPEG_QUALITY), 72])
        if not encoded_ok:
            return None

        b64_text = base64.b64encode(jpeg.tobytes()).decode("ascii")
        return f"data:image/jpeg;base64,{b64_text}"
    finally:
        capture.release()
|
||||
|
||||
|
||||
def _call_vision_model(label: str, image_urls: list[str], cfg: AppConfig) -> str:
    """
    Ask the configured vision model to describe a multi-frame sample.

    Args:
        label: Human-readable name of the sample, embedded in the prompt.
        image_urls: Image URLs (here: ``data:`` URLs) sent as low-detail images.
        cfg: Application configuration; only ``cfg.vision`` is read.

    Returns:
        The stripped text content of the first assistant message.

    Raises:
        RuntimeError: On a missing API key, an HTTP error status, a failed
            connection or timeout, a non-JSON body, or a response without
            message content.
    """
    vision = cfg.vision
    if vision.provider in ("openai", "openrouter") and not vision.api_key:
        raise RuntimeError(
            "Vision is enabled but no API key is available. Set VISION_API_KEY, "
            "OPENROUTER_API_KEY, OPENAI_API_KEY, or LLM_API_KEY."
        )

    content: list[dict] = [{
        "type": "text",
        "text": (
            f"Describe this 3-frame sample for matching. Label: {label}. "
            "The frames are start, middle, and end of the same beat/scene."
        ),
    }]
    content.extend({
        "type": "image_url",
        "image_url": {"url": image_url, "detail": "low"},
    } for image_url in image_urls)

    headers = {"Content-Type": "application/json"}
    if vision.api_key:
        # Key-less providers must not receive a bogus "Bearer None" header.
        headers["Authorization"] = f"Bearer {vision.api_key}"
    if vision.provider == "openrouter":
        headers["HTTP-Referer"] = "https://github.com/ai-trailer-2026"
        headers["X-Title"] = "AI Trailer Generator v2"

    body = json.dumps({
        "model": vision.model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": content},
        ],
        "temperature": vision.temperature,
        "max_tokens": vision.max_tokens,
    }).encode("utf-8")

    url = f"{vision.base_url.rstrip('/')}/chat/completions"
    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
    try:
        with urllib.request.urlopen(req, timeout=vision.timeout_seconds) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        body_text = exc.read().decode(errors="replace")
        raise RuntimeError(f"Vision HTTP {exc.code} from {url}:\n{body_text}") from exc
    except OSError as exc:
        # URLError, refused connections and socket timeouts are all OSErrors.
        raise RuntimeError(f"Vision request to {url} failed: {exc}") from exc
    except ValueError as exc:
        # Covers both undecodable bytes and invalid JSON.
        raise RuntimeError(f"Vision response from {url} is not valid JSON: {exc}") from exc

    try:
        message_content = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"Vision response from {url} has no choices[0].message.content."
        ) from exc
    if message_content is None:
        # str(None) would otherwise be cached as the description "None".
        raise RuntimeError(f"Vision response from {url} contains no text content.")
    return str(message_content).strip()
|
||||
|
||||
|
||||
def _description_key(kind: str, item_id: int, start_s: float, end_s: float, cfg: AppConfig) -> str:
    """
    Build the cache key for one described sample.

    The key combines kind, item id, the span rounded to milliseconds, the
    vision provider and model, and the integer mtime of the underlying video
    (reference trailer for kind "beat", source movie otherwise; 0 when the
    file cannot be stat'ed).
    """
    if kind == "beat":
        media_path = cfg.paths.reference_trailer
    else:
        media_path = cfg.paths.source_movie

    try:
        stamp = int(media_path.stat().st_mtime)
    except OSError:
        stamp = 0

    span = f"{start_s:.3f}:{end_s:.3f}"
    backend = f"{cfg.vision.provider}:{cfg.vision.model}"
    return f"{kind}:{item_id}:{span}:{backend}:{stamp}"
|
||||
|
||||
|
||||
def _describe_sample(
    *,
    kind: str,
    item_id: int,
    label: str,
    video_path: Path,
    start_s: float,
    end_s: float,
    cfg: AppConfig,
    cache: dict,
    budget: list[int],
) -> str | None:
    """
    Return a vision description for one beat/scene span, using the cache first.

    A truthy cache entry is returned without any model call. Otherwise a new
    description is requested only while ``budget[0]`` is positive and at least
    two sample frames could be extracted; the result is stored in *cache* and
    ``budget[0]`` is decremented (the one-element list serves as a counter
    shared with the caller).

    Returns:
        The description text, or None when nothing is cached and either the
        budget is exhausted or fewer than two frames were available.
    """
    key = _description_key(kind, item_id, start_s, end_s, cfg)
    entry = cache["items"].get(key)
    if entry:
        return str(entry.get("description", ""))

    if budget[0] <= 0:
        return None

    image_urls: list[str] = []
    for sample_t in _sample_times(start_s, end_s):
        data_url = _frame_data_url(video_path, sample_t)
        if data_url is not None:
            image_urls.append(data_url)
    if len(image_urls) < 2:
        return None

    description = _call_vision_model(label, image_urls, cfg)
    cache["items"][key] = {
        "kind": kind,
        "item_id": item_id,
        "start_s": start_s,
        "end_s": end_s,
        "label": label,
        "description": description,
    }
    budget[0] -= 1
    return description
|
||||
|
||||
|
||||
def _terms(text: str) -> set[str]:
    """Return the distinct lower-cased words of *text* (3+ chars, ASCII-letter start) minus stopwords."""
    tokens = set(re.findall(r"[a-zA-Z][a-zA-Z0-9_'-]{2,}", text.lower()))
    return tokens - _STOPWORDS
|
||||
|
||||
|
||||
def _text_similarity(a: str, b: str) -> float:
    """
    Score the term overlap of two descriptions.

    The number of shared terms is divided by the size of the smaller term set,
    with that divisor floored at 8. Returns 0.0 when either text yields no
    terms.
    """
    terms_a, terms_b = _terms(a), _terms(b)
    if not (terms_a and terms_b):
        return 0.0

    shared = len(terms_a & terms_b)
    divisor = max(8, min(len(terms_a), len(terms_b)))
    return float(shared / divisor)
|
||||
|
||||
|
||||
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
    """
    Return up to *max_points* evenly spaced in-point candidates inside *scene*.

    Points run from the scene start to 0.2 before the scene end. Only the
    scene start is returned when a single point is requested, the scene has
    no positive duration, or it is too short to leave that 0.2 margin.
    """
    first = scene.start_s
    if max_points <= 1 or scene.duration_s <= 0:
        return [first]

    last = max(first, scene.end_s - 0.2)
    if last <= first:
        return [first]

    spacing = (last - first) / max(1, max_points - 1)
    points: list[float] = []
    for index in range(max_points):
        points.append(first + spacing * index)
    return points
|
||||
|
||||
|
||||
def build_vision_seed_in_points(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    cfg: AppConfig,
) -> dict[int, list[tuple[float, float]]]:
    """
    Return extra in-point seeds from cached vision descriptions.

    The function is intentionally small-budget: for each beat it describes the
    beat once and only a few top scene-level candidates. Existing descriptions
    are read from cache and cost nothing.

    The description cache is written back even when a vision call raises, so
    descriptions obtained before the failure are not lost; the exception
    itself still propagates to the caller.

    Args:
        beats: Trailer beats to find seeds for.
        scenes: Indexed source scenes that may contain the beats.
        cfg: Application configuration (``vision`` and ``cv`` sections).

    Returns:
        Mapping of beat_id to a sorted list of (in_point_s, score) seeds.
        Empty when vision is disabled or there are no beats or scenes.
    """
    if not cfg.vision.enabled:
        return {}
    if not beats or not scenes:
        return {}

    from src.cv.vibe_check import run_vibe_check

    cache = _load_cache(cfg)
    # One-element list: a mutable counter shared with _describe_sample.
    budget = [cfg.vision.max_new_descriptions_per_run]
    scenes_by_id = {scene.scene_id: scene for scene in scenes}
    seeds: dict[int, list[tuple[float, float]]] = {}

    try:
        for beat in beats:
            beat_desc = _describe_sample(
                kind="beat",
                item_id=beat.beat_id,
                label=f"trailer beat {beat.beat_id}",
                video_path=beat.trailer_path,
                start_s=beat.start_s,
                end_s=beat.end_s,
                cfg=cfg,
                cache=cache,
                budget=budget,
            )
            if not beat_desc:
                continue

            hits = run_vibe_check(
                beat,
                scenes,
                top_k=cfg.vision.scene_candidate_top_k,
                hist_method=cfg.cv.vibe_check.hist_compare_method,
                phash_max_distance=64,
            )

            ranked: list[tuple[float, Scene]] = []
            for hit in hits:
                scene = scenes_by_id.get(hit.scene_id)
                if scene is None:
                    continue
                scene_desc = _describe_sample(
                    kind="scene",
                    item_id=scene.scene_id,
                    label=f"source scene {scene.scene_id}",
                    video_path=scene.source_path,
                    start_s=scene.start_s,
                    end_s=scene.end_s,
                    cfg=cfg,
                    cache=cache,
                    budget=budget,
                )
                if not scene_desc:
                    continue
                score = _text_similarity(beat_desc, scene_desc)
                if score >= cfg.vision.similarity_threshold:
                    ranked.append((score, scene))

            ranked.sort(key=lambda item: item[0], reverse=True)
            points: list[tuple[float, float]] = []
            for score, scene in ranked[:cfg.vision.max_seed_scenes]:
                logger.info(
                    "Beat %d: vision seed scene=%d score=%.3f",
                    beat.beat_id,
                    scene.scene_id,
                    score,
                )
                # Scale the configured seed score by text similarity, then
                # clamp it between the coarse-candidate threshold and 0.98.
                weighted_score = max(
                    cfg.cv.deep_scan.coarse_candidate_threshold,
                    min(0.98, cfg.vision.seed_score * (0.75 + min(1.0, score) * 0.25)),
                )
                points.extend(
                    (point, weighted_score)
                    for point in _scene_seed_points(scene, cfg.vision.seed_points_per_scene)
                )

            if points:
                # Deduplicate on millisecond-rounded time, keeping the best score.
                merged: dict[float, float] = {}
                for point, weighted_score in points:
                    key = round(max(0.0, point), 3)
                    merged[key] = max(weighted_score, merged.get(key, 0.0))
                seeds[beat.beat_id] = sorted(merged.items())
    finally:
        _save_cache(cfg, cache)

    return seeds
|
||||
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
src/pipeline/__init__.py — Orchestration layer
|
||||
"""
|
||||
@@ -0,0 +1,291 @@
|
||||
"""
|
||||
src/pipeline/matcher.py — Top-level CV matching orchestrator
|
||||
|
||||
This is the single entry point for the full 2-phase CV pipeline:
|
||||
|
||||
Phase 0: Load / build scene index (PySceneDetect + fingerprinting)
|
||||
Phase 1: Vibe Check — histogram + pHash filter → Top-K candidates per beat
|
||||
Phase 2: Deep Scan — template matching → frame-accurate MatchResult per beat
|
||||
|
||||
Usage:
|
||||
from src.core.config import load_config
|
||||
from src.pipeline.matcher import run_matching
|
||||
|
||||
cfg = load_config()
|
||||
beats = [...] # list[TrailerBeat] from trailer analysis
|
||||
results = run_matching(cfg, beats)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Sequence
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import MatchResult, Scene, TrailerBeat
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
SeedPoint = float | tuple[float, float]
|
||||
|
||||
|
||||
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
    """
    Spread up to *max_points* seed times evenly across *scene*.

    The last seed lies 0.2 before the scene end. Scenes that are too short
    for that margin, have no positive duration, or are asked for at most one
    point yield just the scene start.
    """
    only_start = [scene.start_s]
    if max_points <= 1 or scene.duration_s <= 0:
        return only_start

    usable_end = max(scene.start_s, scene.end_s - 0.2)
    if usable_end <= scene.start_s:
        return only_start

    intervals = max(1, max_points - 1)
    step = (usable_end - scene.start_s) / intervals
    return [scene.start_s + step * n for n in range(max_points)]
|
||||
|
||||
|
||||
def _build_scene_seed_in_points(
    beats: Sequence[TrailerBeat],
    scenes: Sequence[Scene],
    cfg: AppConfig,
) -> dict[int, list[float]]:
    """
    Derive scene-level in-point seeds for every beat from the vibe check.

    For each beat the top ``scene_seed_top_k`` scenes reported by
    ``run_vibe_check`` are expanded into evenly spaced seed times. Seeds are
    clamped to >= 0, rounded to milliseconds, deduplicated and sorted.

    Returns:
        Mapping of beat_id to its seed times; beats without any seed are
        omitted.
    """
    from src.cv.vibe_check import run_vibe_check

    deep_scan = cfg.cv.deep_scan
    scene_lookup = {scene.scene_id: scene for scene in scenes}
    seeds: dict[int, list[float]] = {}

    for beat in beats:
        hits = run_vibe_check(
            beat,
            scenes,
            top_k=deep_scan.scene_seed_top_k,
            hist_method=cfg.cv.vibe_check.hist_compare_method,
            phash_max_distance=64,
        )

        unique_points: set[float] = set()
        for hit in hits:
            scene = scene_lookup.get(hit.scene_id)
            if scene is None:
                continue
            unique_points.update(
                round(max(0.0, p), 3)
                for p in _scene_seed_points(scene, deep_scan.scene_seed_points_per_scene)
            )

        if not unique_points:
            continue

        seeds[beat.beat_id] = sorted(unique_points)
        logger.info(
            "Beat %d: added %d scene-level seed candidates from %d source scenes.",
            beat.beat_id,
            len(seeds[beat.beat_id]),
            len(hits),
        )
    return seeds
|
||||
|
||||
|
||||
def _merge_seed_in_points(
    *seed_maps: dict[int, Sequence[SeedPoint]] | None,
) -> dict[int, list[SeedPoint]]:
    """
    Merge several beat_id → seed-point maps into one.

    A seed is either a bare time or a (time, score) tuple. Times are clamped
    to >= 0 and rounded to milliseconds; seeds that land on the same time are
    merged, keeping the highest score seen (a scored seed wins over an
    unscored one). Empty or None maps are ignored.

    Returns:
        Per beat, the seeds sorted by time: (time, score) tuples where a score
        is known, bare times otherwise.
    """
    per_beat: dict[int, dict[float, float | None]] = {}

    for seed_map in seed_maps:
        if not seed_map:
            continue
        for beat_id, points in seed_map.items():
            bucket = per_beat.setdefault(beat_id, {})
            for point in points:
                scored = isinstance(point, tuple)
                raw_time = point[0] if scored else point
                t_sec = round(max(0.0, float(raw_time)), 3)
                score = float(point[1]) if scored else None

                previous = bucket.get(t_sec)
                if previous is None:
                    # Nothing (or only an unscored seed) recorded at this time yet.
                    bucket[t_sec] = score
                elif score is not None:
                    bucket[t_sec] = max(previous, score)

    return {
        beat_id: [
            t_sec if score is None else (t_sec, score)
            for t_sec, score in sorted(bucket.items())
        ]
        for beat_id, bucket in per_beat.items()
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Beat fingerprinting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fingerprint_beats(
    beats: Sequence[TrailerBeat],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """
    Enrich every TrailerBeat with its visual fingerprint (histogram + pHash).

    Extracts the midpoint frame of each beat from its trailer file and
    fingerprints it with the ``cfg.cv.vibe_check`` settings. Beats whose
    midpoint frame cannot be decoded are passed through unchanged.

    Args:
        beats: TrailerBeat list (fingerprints will be None initially).
        cfg: Application configuration.

    Returns:
        New list, in input order, of TrailerBeat objects with luma_hist,
        sat_hist and phash set where a frame was available.
    """
    from dataclasses import replace
    from src.cv.fingerprinting import fingerprint_frame
    from src.cv.frame_extractor import grab_frame_at_path

    vc_cfg = cfg.cv.vibe_check
    enriched: list[TrailerBeat] = []

    for beat in beats:
        frame = grab_frame_at_path(beat.trailer_path, beat.midpoint_s)
        if frame is None:
            logger.warning("Beat %d: cannot decode midpoint frame, leaving unfingerprinted.", beat.beat_id)
            enriched.append(beat)
            continue

        luma_b, sat_b, phash = fingerprint_frame(frame, vc_cfg)
        enriched.append(replace(beat, luma_hist=luma_b, sat_hist=sat_b, phash=phash))

    # Compare against None explicitly so a falsy but valid hash still counts
    # as fingerprinted.
    fingerprinted = sum(1 for b in enriched if b.phash is not None)
    logger.info("Fingerprinted %d / %d beats.", fingerprinted, len(beats))
    return enriched
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main pipeline entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_matching(
    cfg: AppConfig,
    beats: Sequence[TrailerBeat],
    force_reindex: bool = False,
    seed_in_points: dict[int, Sequence[SeedPoint]] | None = None,
) -> list[MatchResult]:
    """
    Execute the CV matching pipeline: scene index, beat fingerprints, global scan.

    Args:
        cfg:            Application configuration (loaded from config.toml).
        beats:          All trailer beats to source (must have trailer_path set).
        force_reindex:  Forwarded to build_scene_index(); if True, the scene
                        cache is meant to be ignored and detection re-run.
        seed_in_points: Optional caller-supplied seed points per beat id.
                        They are merged with the scene-level seeds and, when
                        ``cfg.vision.enabled`` is set, the vision seeds before
                        being handed to the global scan.

    Returns:
        The MatchResult list produced by run_global_scan(). Per the original
        contract, unmatched beats are omitted and input order is kept
        (NOTE(review): not verifiable from this function - confirm there).
    """
    from src.cv.scene_indexer import build_scene_index

    logger.info("=" * 60)
    logger.info("AI Trailer Generator v2 — CV Matching Pipeline")
    logger.info("Source : %s", cfg.paths.source_movie.name)
    logger.info("Trailer: %s", cfg.paths.reference_trailer.name)
    logger.info("Beats  : %d", len(beats))
    logger.info("=" * 60)

    # ------------------------------------------------------------------
    # Phase 0: Scene index
    # ------------------------------------------------------------------
    logger.info("[Phase 0] Building scene index …")
    scenes: list[Scene] = build_scene_index(cfg, force_reindex=force_reindex)
    logger.info("[Phase 0] %d scenes indexed.", len(scenes))

    # ------------------------------------------------------------------
    # Phase 0b: Fingerprint the beats
    # ------------------------------------------------------------------
    logger.info("[Phase 0b] Fingerprinting %d trailer beats …", len(beats))
    beats = fingerprint_beats(beats, cfg)

    # ------------------------------------------------------------------
    # Phase 1 & 2: Global Scan. The scene index is still used here, both to
    # derive seed points and as an argument to the scan itself.
    # ------------------------------------------------------------------
    logger.info("[Phase 1 & 2] Running FFmpeg Global Scan for %d beats ...", len(beats))
    from src.cv.global_scan import run_global_scan

    scene_seed_in_points = _build_scene_seed_in_points(beats, scenes, cfg)

    vision_seed_in_points: dict[int, Sequence[SeedPoint]] = {}
    if cfg.vision.enabled:
        # Vision seeding is best-effort: any failure (import or runtime) is
        # logged and the scan proceeds with the remaining seeds.
        try:
            from src.llm.vision_cache import build_vision_seed_in_points

            vision_seed_in_points = build_vision_seed_in_points(beats, scenes, cfg)
        except Exception as exc:
            logger.error("Vision seeding failed: %s — continuing with CV-only seeds.", exc)

    results = run_global_scan(
        beats,
        cfg,
        scenes=scenes,
        seed_in_points=_merge_seed_in_points(
            seed_in_points, scene_seed_in_points, vision_seed_in_points
        ),
    )

    logger.info("[Phase 1 & 2] Done. %d / %d beats matched.", len(results), len(beats))
    logger.info("=" * 60)

    return results
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Convenience: build an EditTimeline from match results
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_timeline(
    beats: Sequence[TrailerBeat],
    results: Sequence[MatchResult],
    cfg: AppConfig,
) -> "src.core.models.EditTimeline":  # type: ignore[name-defined]
    """
    Combine beats + match results into an ordered EditTimeline.

    The timeline cursor advances by each beat's own duration, matched or
    not, so an unmatched beat leaves a gap of its length and every clip
    occupies exactly its beat's slot. Within a slot, the usable source
    length is the match duration capped at the beat duration; any
    remainder is recorded as the clip's trailer tail.

    Args:
        beats:   All trailer beats (defines order + durations).
        results: MatchResult list from run_matching().
        cfg:     Application configuration.

    Returns:
        EditTimeline ready for FCPXML / EDL export.
    """
    from src.core.models import EditClip, EditTimeline

    # Later results win when several share a beat id.
    match_for: dict[int, MatchResult] = {}
    for result in results:
        match_for[result.beat_id] = result

    placed: list[EditClip] = []
    position_s = 0.0

    for beat in beats:
        slot_start_s = position_s
        position_s = slot_start_s + beat.duration_s

        match = match_for.get(beat.beat_id)
        if match is None:
            logger.warning("Beat %d has no match — gap in timeline.", beat.beat_id)
            continue

        usable_s = max(0.0, match.duration_s)
        if usable_s > 0:
            source_s = min(beat.duration_s, usable_s)
        else:
            # A non-positive match duration falls back to the full beat length.
            source_s = beat.duration_s

        tail_s = max(0.0, beat.duration_s - source_s)
        if tail_s > 0:
            logger.warning(
                "Beat %d uses %.2fs source + %.2fs generated trailer tail.",
                beat.beat_id,
                source_s,
                tail_s,
            )

        placed.append(
            EditClip(
                clip_index=len(placed),
                beat=beat,
                match=match,
                timeline_start_s=slot_start_s,
                timeline_end_s=position_s,
                source_duration_s=source_s,
                trailer_tail_s=tail_s,
            )
        )

    timeline = EditTimeline(
        title=cfg.paths.reference_trailer.stem,
        frame_rate=cfg.export.edl_frame_rate,
        clips=tuple(placed),
    )

    logger.info(
        "Timeline built: %d clips, total duration %.2fs",
        timeline.clip_count, timeline.total_duration_s,
    )
    return timeline
|
||||
@@ -0,0 +1,427 @@
|
||||
"""
|
||||
src/pipeline/reporter.py — Visual Match Report Generator
|
||||
|
||||
Generates an HTML file containing side-by-side video clips of:
|
||||
Left: The original beat from the reference trailer
|
||||
Right: The matched scene from the source movie
|
||||
|
||||
This allows instant visual verification of the CV pipeline's results.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from src.core.config import AppConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_clip(video_path: Path, start_s: float, duration_s: float, out_path: Path) -> None:
    """
    Cut a silent, 640px-wide preview clip out of ``video_path`` with ffmpeg.

    Failures are logged, not raised; the output directory is created on demand.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Two-stage seek: a coarse input seek up to 2 s before the target, then
    # an output-side seek for the remainder. A lone input seek can land on a
    # nearby keyframe and make the report look several frames out of sync.
    coarse_seek_s = max(0.0, start_s - (2.0 if start_s >= 2.0 else 0.0))
    fine_seek_s = start_s - coarse_seek_s

    seek_and_input = [
        "-ss", str(coarse_seek_s),
        "-i", str(video_path),
        "-ss", str(fine_seek_s),
        "-t", str(duration_s),
    ]
    encode = [
        "-map", "0:v:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "28",
        "-vf", "scale=640:-2",  # downscale to keep the report lightweight
        "-an",  # drop audio
        "-movflags", "+faststart",
    ]
    command = ["ffmpeg", "-y", "-loglevel", "error"] + seek_and_input + encode + [str(out_path)]

    completed = subprocess.run(command, capture_output=True)
    if completed.returncode == 0:
        return

    logger.error(
        "ffmpeg clip extraction failed for %s:\n%s",
        out_path.name, completed.stderr.decode(errors="replace")
    )
|
||||
|
||||
|
||||
def _extract_clip_with_black_tail(
    video_path: Path,
    start_s: float,
    source_duration_s: float,
    total_duration_s: float,
    out_path: Path,
) -> None:
    """
    Extract a source preview and append black frames for trailer-only tails.

    Behaviour by case:
      * tail (total - source) <= 0.02 s: plain ``_extract_clip`` of the source.
      * source <= 0.02 s: a black clip of ``total_duration_s`` is rendered
        directly, with no source encode and no concat.
      * otherwise: source part and black tail are rendered to temp files and
        concatenated into ``out_path``.

    ffmpeg failures are logged, not raised. Temp files are removed on every
    exit path, including failures.

    Args:
        video_path:        Source video to cut from.
        start_s:           In-point of the source part, in seconds.
        source_duration_s: Length of the source part, in seconds.
        total_duration_s:  Desired length of the finished preview, in seconds.
        out_path:          Destination .mp4 file.
    """
    tail_s = max(0.0, total_duration_s - source_duration_s)
    if tail_s <= 0.02:
        _extract_clip(video_path, start_s, source_duration_s, out_path)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)

    ffmpeg = ["ffmpeg", "-y", "-loglevel", "error"]
    x264 = ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28"]
    container = ["-an", "-movflags", "+faststart"]

    def run_step(cmd: list[str], failure_message: str) -> bool:
        """Run one ffmpeg command; log ``failure_message`` and return False on error."""
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            logger.error(
                failure_message,
                out_path.name,
                result.stderr.decode(errors="replace"),
            )
            return False
        return True

    if source_duration_s <= 0.02:
        # Nothing usable to show from the source (same 0.02 s cut-off the
        # segmented renderer applies): skip the zero-length encode and the
        # concat, and write a full-length black placeholder instead.
        run_step(
            [
                *ffmpeg,
                "-f", "lavfi",
                "-i", f"color=c=black:s=640x360:r=25:d={total_duration_s}",
                *x264,
                *container,
                str(out_path),
            ],
            "ffmpeg black tail render failed for %s:\n%s",
        )
        return

    source_tmp = out_path.with_name(f"{out_path.stem}_source_tmp.mp4")
    tail_tmp = out_path.with_name(f"{out_path.stem}_tail_tmp.mp4")
    preroll_s = 2.0 if start_s >= 2.0 else 0.0
    input_seek_s = max(0.0, start_s - preroll_s)
    accurate_seek_s = start_s - input_seek_s

    # Render the matched source portion with the same accurate seek path as
    # _extract_clip(). Using trim=start=... after an input seek is brittle
    # because FFmpeg may preserve non-zero packet timestamps around keyframes.
    source_cmd = [
        *ffmpeg,
        "-ss", str(input_seek_s),
        "-i", str(video_path),
        "-ss", str(accurate_seek_s),
        "-t", str(source_duration_s),
        "-map", "0:v:0",
        *x264,
        "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
        *container,
        str(source_tmp),
    ]
    tail_cmd = [
        *ffmpeg,
        "-f", "lavfi",
        "-i", f"color=c=black:s=640x360:r=25:d={tail_s}",
        *x264,
        *container,
        str(tail_tmp),
    ]
    concat_cmd = [
        *ffmpeg,
        "-i", str(source_tmp),
        "-i", str(tail_tmp),
        "-filter_complex", "[0:v][1:v]concat=n=2:v=1:a=0[v]",
        "-map", "[v]",
        *x264,
        *container,
        str(out_path),
    ]

    try:
        if not run_step(source_cmd, "ffmpeg source preview extraction failed for %s:\n%s"):
            return
        if not run_step(tail_cmd, "ffmpeg black tail render failed for %s:\n%s"):
            return
        run_step(concat_cmd, "ffmpeg tailed preview concat failed for %s:\n%s")
    finally:
        # Runs on the early returns above too, so a failed step no longer
        # leaves *_tmp.mp4 files in the report directory.
        for tmp in (source_tmp, tail_tmp):
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless
|
||||
|
||||
|
||||
def _extract_segmented_clip(
    video_path: Path,
    segments: list,
    total_duration_s: float,
    out_path: Path,
) -> None:
    """
    Render a beat-length source preview from multiple matched source islands.

    Segments are laid out in ``trailer_offset_s`` order. Gaps before, between
    and after them are filled with black so the preview stays aligned with
    the reference beat. Each part is rendered to a temp file and the parts
    are concatenated into ``out_path``.

    A source segment that fails to render is replaced by black of the same
    length, so later segments keep their position. ffmpeg failures are
    logged, not raised, and temp files are removed on every exit path.

    Args:
        video_path:       Source video to cut from.
        segments:         Objects exposing ``trailer_offset_s``,
                          ``in_point_s`` and ``duration_s``.
        total_duration_s: Desired length of the finished preview, in seconds.
        out_path:         Destination .mp4 file.
    """
    if not segments:
        _extract_clip_with_black_tail(video_path, 0.0, 0.0, total_duration_s, out_path)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_paths: list[Path] = []
    cursor = 0.0

    def discard(path: Path) -> None:
        """Best-effort removal of a temp file."""
        try:
            path.unlink(missing_ok=True)
        except OSError:
            pass

    def add_black(duration_s: float) -> None:
        """Append a black part of ``duration_s`` seconds; skipped at <= 0.02 s."""
        if duration_s <= 0.02:
            return
        tmp = out_path.with_name(f"{out_path.stem}_part_{len(tmp_paths):03d}_black.mp4")
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-f", "lavfi",
            "-i", f"color=c=black:s=640x360:r=25:d={duration_s}",
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-an", "-movflags", "+faststart",
            str(tmp),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode == 0:
            tmp_paths.append(tmp)
        else:
            logger.error("ffmpeg black segment render failed:\n%s", result.stderr.decode(errors="replace"))
            discard(tmp)  # a failed run may leave a partial file behind

    def add_source(start_s: float, duration_s: float) -> None:
        """Append a source part starting at ``start_s``; skipped at <= 0.02 s."""
        if duration_s <= 0.02:
            return
        tmp = out_path.with_name(f"{out_path.stem}_part_{len(tmp_paths):03d}_src.mp4")
        preroll_s = 2.0 if start_s >= 2.0 else 0.0
        input_seek_s = max(0.0, start_s - preroll_s)
        accurate_seek_s = start_s - input_seek_s
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", str(input_seek_s),
            "-i", str(video_path),
            "-ss", str(accurate_seek_s),
            "-t", str(duration_s),
            "-map", "0:v:0",
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-vf", "scale=640:360,setsar=1,fps=25,setpts=PTS-STARTPTS",
            "-an", "-movflags", "+faststart",
            str(tmp),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode == 0 and tmp.exists():
            tmp_paths.append(tmp)
            return
        logger.error("ffmpeg source segment render failed:\n%s", result.stderr.decode(errors="replace"))
        discard(tmp)
        # Keep the timeline intact: without a stand-in every later part would
        # shift earlier and the side-by-side compare would drift out of sync.
        add_black(duration_s)

    try:
        for segment in sorted(segments, key=lambda s: s.trailer_offset_s):
            offset_s = max(0.0, float(segment.trailer_offset_s))
            duration_s = max(0.0, float(segment.duration_s))
            add_black(offset_s - cursor)
            add_source(float(segment.in_point_s), duration_s)
            cursor = max(cursor, offset_s + duration_s)

        add_black(total_duration_s - cursor)

        if not tmp_paths:
            # Every part was skipped or failed; a concat with n=0 inputs
            # would only produce a confusing ffmpeg error.
            logger.error("No preview parts could be rendered for %s.", out_path.name)
            return

        if len(tmp_paths) == 1:
            tmp_paths[0].replace(out_path)
            return

        inputs: list[str] = []
        for tmp in tmp_paths:
            inputs.extend(["-i", str(tmp)])
        labels = "".join(f"[{idx}:v]" for idx in range(len(tmp_paths)))
        filter_complex = labels + f"concat=n={len(tmp_paths)}:v=1:a=0[v]"
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            *inputs,
            "-filter_complex", filter_complex,
            "-map", "[v]",
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-an", "-movflags", "+faststart",
            str(out_path),
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            logger.error("ffmpeg segmented preview concat failed:\n%s", result.stderr.decode(errors="replace"))
    finally:
        # After a single-part rename the path no longer exists; unlink with
        # missing_ok=True makes that a no-op.
        for tmp in tmp_paths:
            discard(tmp)
|
||||
|
||||
|
||||
def _build_frame_locked_compare(ref_path: Path, src_path: Path, out_path: Path) -> None:
    """
    Render reference (left) and source (right) into one side-by-side video.

    Both inputs go through the same filter chain (25 fps, fitted into
    640x360 with padding, square pixels, timestamps reset to zero) and are
    then stacked horizontally. Failures are logged, not raised.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    per_input_chain = ",".join(
        [
            "fps=25",
            "scale=640:360:force_original_aspect_ratio=decrease",
            "pad=640:360:(ow-iw)/2:(oh-ih)/2",
            "setsar=1",
            "setpts=PTS-STARTPTS",
        ]
    )
    graph = ";".join(
        [
            f"[0:v]{per_input_chain}[ref]",
            f"[1:v]{per_input_chain}[src]",
            "[ref][src]hstack=inputs=2[v]",
        ]
    )

    command = ["ffmpeg", "-y", "-loglevel", "error"]
    command += ["-i", str(ref_path), "-i", str(src_path)]
    command += ["-filter_complex", graph, "-map", "[v]"]
    command += ["-c:v", "libx264", "-preset", "ultrafast", "-crf", "28"]
    command += ["-an", "-movflags", "+faststart", str(out_path)]

    completed = subprocess.run(command, capture_output=True)
    if completed.returncode == 0:
        return

    logger.error(
        "ffmpeg compare render failed for %s:\n%s",
        out_path.name,
        completed.stderr.decode(errors="replace"),
    )
|
||||
|
||||
|
||||
def generate_report(beats: list, results: list, cfg: AppConfig) -> Path:
    """
    Build the HTML match report and render the preview clips it embeds.

    A reference clip is cut from the trailer for every beat. Matched beats
    also get a source preview (segmented, or a single range padded with
    black) and a side-by-side compare clip, which is what the page shows.
    Unmatched beats show the reference clip beside a "No Match" placeholder.

    Args:
        beats:   Trailer beats, in report order.
        results: Match results, looked up per beat via ``beat_id``.
        cfg:     Application configuration; clips and HTML are written to
                 ``cfg.paths.output_dir / "report"``.

    Returns:
        Path of the written ``match_report.html``.
    """
    report_dir = cfg.paths.output_dir / "report"
    report_dir.mkdir(parents=True, exist_ok=True)
    html_path = report_dir / "match_report.html"

    match_lookup = {}
    for result in results:
        match_lookup[result.beat_id] = result

    logger.info("Generating report clips in %s (this might take a moment) ...", report_dir)

    # Static page head: styles, title, summary line and the sync script.
    page = [
        "<!DOCTYPE html>",
        "<html><head><meta charset='utf-8'><title>AI Trailer Match Report</title>",
        "<style>",
        "body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: #0f0f0f; color: #e0e0e0; margin: 40px; }",
        "h1 { color: #fff; border-bottom: 1px solid #333; padding-bottom: 10px; }",
        ".stats { font-size: 1.2em; margin-bottom: 30px; color: #aaa; }",
        ".beat-row { display: flex; margin-bottom: 30px; background: #1a1a1a; padding: 20px; border-radius: 12px; border: 1px solid #333; }",
        ".info { width: 250px; padding-right: 20px; flex-shrink: 0; }",
        ".info h3 { margin-top: 0; color: #fff; }",
        ".video-container { display: flex; gap: 20px; flex-grow: 1; }",
        ".videos { flex-grow: 1; }",
        ".compare { margin-bottom: 18px; }",
        ".video-col { flex: 1; }",
        ".video-col p { margin-top: 0; font-weight: bold; color: #888; }",
        "video { width: 100%; border-radius: 6px; box-shadow: 0 4px 6px rgba(0,0,0,0.5); background: #000; }",
        ".status-match { color: #4ade80; font-weight: bold; font-size: 1.1em; }",
        ".status-miss { color: #f87171; font-weight: bold; font-size: 1.1em; }",
        ".score { font-family: monospace; font-size: 1.1em; color: #60a5fa; }",
        ".code-hint { background: #000; padding: 10px; border-radius: 4px; font-family: monospace; font-size: 0.9em; margin-top: 15px; color: #a3e635; }",
        "</style></head><body>",
        "<h1>AI Trailer Generator — Match Report</h1>",
        f"<div class='stats'>Total Beats: {len(beats)} | Matched: {len(results)}</div>",
        "<script>",
        "function syncBeat(row) {",
        " const vids = row.querySelectorAll('video');",
        " if (vids.length < 2) return;",
        " const ref = vids[0];",
        " const src = vids[1];",
        " let syncing = false;",
        " function align() {",
        " if (syncing) return;",
        " syncing = true;",
        " const target = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02));",
        " if (Math.abs(src.currentTime - target) > 0.035) src.currentTime = target;",
        " if (ref.paused && !src.paused) src.pause();",
        " if (!ref.paused && src.paused) src.play().catch(() => {});",
        " syncing = false;",
        " }",
        " ref.addEventListener('play', () => { src.currentTime = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02)); src.play().catch(() => {}); });",
        " ref.addEventListener('pause', () => src.pause());",
        " ref.addEventListener('seeked', () => { src.currentTime = Math.min(ref.currentTime, Math.max(0, (src.duration || ref.currentTime) - 0.02)); });",
        " ref.addEventListener('timeupdate', align);",
        "}",
        "document.addEventListener('DOMContentLoaded', () => document.querySelectorAll('.beat-row').forEach(syncBeat));",
        "</script>",
    ]
    emit = page.append

    for beat in beats:
        clip_stem = f"beat_{beat.beat_id:03d}"
        res = match_lookup.get(beat.beat_id)

        # Reference clip is rendered for every beat, matched or not.
        ref_mp4 = report_dir / f"{clip_stem}_ref.mp4"
        _extract_clip(beat.trailer_path, beat.start_s, beat.duration_s, ref_mp4)

        emit("<div class='beat-row'>")

        # --- info panel -------------------------------------------------
        emit("<div class='info'>")
        emit(f"<h3>Beat {beat.beat_id:03d}</h3>")
        emit(f"<p><b>Type:</b> {beat.beat_type.name}</p>")
        emit(f"<p><b>Trailer:</b> {beat.start_s:.2f}s → {beat.end_s:.2f}s</p>")

        compare_mp4 = None
        if not res:
            emit("<p class='status-miss'>NO MATCH</p>")
        else:
            segments = list(getattr(res, "segments", ()) or [])
            if segments:
                source_duration = sum(max(0.0, float(seg.duration_s)) for seg in segments)
            else:
                source_duration = max(0.0, res.out_point_s - res.in_point_s)

            if source_duration > 0:
                preview_duration = min(beat.duration_s, source_duration)
            else:
                preview_duration = beat.duration_s

            # The tail is whatever remains of the beat after the last
            # segment ends (or after the single-range preview).
            if segments:
                covered_until = max(
                    float(seg.trailer_offset_s) + float(seg.duration_s) for seg in segments
                )
            else:
                covered_until = preview_duration
            trailer_tail_s = max(0.0, beat.duration_s - covered_until)
            has_tail = trailer_tail_s > 0

            if getattr(res, "is_confirmed", True):
                emit("<p class='status-match'>MATCHED</p>")
            else:
                emit("<p style='color: #fbbf24; font-weight: bold; font-size: 1.1em;'>PROVISIONAL MATCH</p>")
            emit(f"<p><b>Scene ID:</b> {res.scene_id}</p>")
            emit(f"<p><b>Movie In:</b> {res.in_point_s:.2f}s</p>")
            emit(f"<p><b>Source Dur:</b> {source_duration:.2f}s</p>")
            if len(segments) > 1:
                emit(f"<p><b>Segments:</b> {len(segments)} matched visual islands</p>")
            if has_tail:
                emit(f"<p><b>Unmatched Tail:</b> {trailer_tail_s:.2f}s placeholder</p>")
            emit(f"<p><b>Score:</b> <span class='score'>{res.match_score:.3f}</span></p>")
            if has_tail:
                emit("<p style='color: #fbbf24; font-size: 0.9em;'>Some trailer frames are still unmatched; report fills only those gaps with placeholder black.</p>")

            # Flag weak matches for manual review.
            if res.match_score < 0.80:
                emit("<p style='color: #fbbf24; font-size: 0.9em;'>⚠️ Score below 0.80. Verify visually.</p>")

            # Source preview, then the side-by-side compare built from both clips.
            src_mp4 = report_dir / f"{clip_stem}_src.mp4"
            compare_mp4 = report_dir / f"{clip_stem}_compare.mp4"
            if segments:
                _extract_segmented_clip(res.source_path, segments, beat.duration_s, src_mp4)
            else:
                _extract_clip_with_black_tail(
                    res.source_path,
                    res.in_point_s,
                    preview_duration,
                    beat.duration_s,
                    src_mp4,
                )
            _build_frame_locked_compare(ref_mp4, src_mp4, compare_mp4)

        emit(f"<div class='code-hint'>python cli.py rematch --beat {beat.beat_id}</div>")
        emit("</div>")  # /info

        # --- video panel ------------------------------------------------
        emit("<div class='videos'>")
        if compare_mp4 is not None:
            emit(f"<div class='compare'><p>Frame-Locked Compare</p><video src='{compare_mp4.name}' controls loop muted autoplay></video></div>")
        else:
            emit("<div class='video-container'>")
            emit(f"<div class='video-col'><p>Reference Trailer</p><video src='{ref_mp4.name}' controls loop muted autoplay></video></div>")
            emit("<div class='video-col'><p>Matched Source</p><div style='width: 100%; aspect-ratio: 16/9; background: #222; display: flex; align-items: center; justify-content: center; border-radius: 6px; color: #555;'>No Match</div></div>")
            emit("</div>")  # /video-container
        emit("</div>")  # /videos
        emit("</div>")  # /beat-row

    emit("</body></html>")

    html_path.write_text("\n".join(page), encoding="utf-8")
    return html_path
|
||||
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
src/pipeline/trailer_analyzer.py — Reference trailer → list[TrailerBeat]
|
||||
|
||||
Responsibility:
|
||||
1. Run PySceneDetect on the REFERENCE TRAILER (not the source movie)
|
||||
to detect cut boundaries → raw beat intervals
|
||||
2. Fingerprint the midpoint frame of each beat (for Vibe Check)
|
||||
3. Transcribe dialogue per beat via Whisper (optional, injected)
|
||||
4. Optionally classify BeatType via the LLM dramaturg (injected)
|
||||
|
||||
Returns: list[TrailerBeat] ready to feed into run_matching().
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import replace
|
||||
from pathlib import Path
|
||||
from typing import Callable, Sequence
|
||||
|
||||
from src.core.config import AppConfig
|
||||
from src.core.models import BeatType, DialogueLine, TrailerBeat
|
||||
from src.cv.fingerprinting import fingerprint_frame
|
||||
from src.cv.frame_extractor import grab_midpoint_frame, open_video
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Injection type aliases — keeps this module free of hard audio/LLM imports
|
||||
TranscribeCallback = Callable[[Path, float, float, float], list[DialogueLine]]
|
||||
ClassifyCallback = Callable[[list[TrailerBeat]], list[TrailerBeat]]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 1: Scene detection on the reference trailer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _detect_trailer_beats(cfg: AppConfig) -> list[tuple[float, float, int, int]]:
    """
    Run PySceneDetect on the reference trailer.

    A ContentDetector is configured from ``cfg.scene_detection``: its
    threshold is ``content_threshold`` and its minimum scene length is
    ``min_scene_duration_s`` converted to frames at the trailer's frame rate.

    Returns:
        List of (start_s, end_s, start_frame, end_frame), one per detected scene.

    Raises:
        ImportError: If PySceneDetect is not installed; the message carries
                     the install hint and the original error is chained.
    """
    try:
        from scenedetect import open_video as sd_open_video, SceneManager
        from scenedetect.detectors import ContentDetector
    except ImportError as exc:
        # Chain explicitly so the traceback shows which import actually failed.
        raise ImportError("pip install scenedetect[opencv]") from exc

    trailer_path = cfg.paths.reference_trailer
    video = sd_open_video(str(trailer_path))
    manager = SceneManager()
    manager.add_detector(
        ContentDetector(
            threshold=cfg.scene_detection.content_threshold,
            min_scene_len=int(
                cfg.scene_detection.min_scene_duration_s * video.frame_rate
            ),
        )
    )

    logger.info("Detecting beats in reference trailer: %s …", trailer_path.name)
    manager.detect_scenes(video=video, show_progress=False)

    raw = manager.get_scene_list()
    result = [
        (s.get_seconds(), e.get_seconds(), s.get_frames(), e.get_frames())
        for s, e in raw
    ]
    logger.info("Detected %d beats in reference trailer.", len(result))
    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 2: Fingerprint beats
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _fingerprint_beats(
    raw_beats: list[tuple[float, float, int, int]],
    cfg: AppConfig,
) -> list[TrailerBeat]:
    """
    Turn raw beat intervals into TrailerBeat objects with fingerprints.

    The reference trailer is opened once. For each interval the midpoint
    frame is grabbed and fingerprinted with ``cfg.cv.vibe_check``. When the
    frame cannot be decoded the beat is still created, just without
    luma_hist / sat_hist / phash.
    """
    vibe_cfg = cfg.cv.vibe_check
    trailer = cfg.paths.reference_trailer
    collected: list[TrailerBeat] = []

    with open_video(trailer) as cap:
        for beat_id, (start_s, end_s, start_frame, end_frame) in enumerate(raw_beats):
            fields = {
                "beat_id": beat_id,
                "trailer_path": trailer,
                "start_s": start_s,
                "end_s": end_s,
                "start_frame": start_frame,
                "end_frame": end_frame,
            }

            frame = grab_midpoint_frame(cap, start_s, end_s)
            if frame is None:
                logger.warning("Beat %d: midpoint frame decode failed.", beat_id)
            else:
                luma_b, sat_b, phash = fingerprint_frame(frame, vibe_cfg)
                fields.update(luma_hist=luma_b, sat_hist=sat_b, phash=phash)

            collected.append(TrailerBeat(**fields))

    return collected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def analyze_reference_trailer(
    cfg: AppConfig,
    transcribe_callback: TranscribeCallback | None = None,
    classify_callback: ClassifyCallback | None = None,
) -> list[TrailerBeat]:
    """
    Full reference-trailer analysis pipeline.

    Steps: cut detection, midpoint fingerprinting, then (if the callbacks
    are supplied) per-beat transcription and beat classification. Both
    optional steps are best-effort: a failing callback is logged as a
    warning and the affected beat(s) are kept as they were.

    Args:
        cfg:                 Application configuration.
        transcribe_callback: Optional fn(path, start_s, end_s, offset_s)
                             → list[DialogueLine]. Injected to keep this
                             module free of faster-whisper imports.
        classify_callback:   Optional fn(beats) → beats with BeatType set.
                             Injected to keep this module LLM-free.

    Returns:
        List of TrailerBeat objects with fingerprints (and optionally
        dialogue + BeatType) populated.
    """
    # Steps 1 + 2: cut detection, then fingerprinting
    beats = _fingerprint_beats(_detect_trailer_beats(cfg), cfg)

    # Step 3: dialogue (optional)
    if transcribe_callback is not None:
        with_dialogue: list[TrailerBeat] = []
        for beat in beats:
            try:
                lines = transcribe_callback(
                    beat.trailer_path,
                    beat.start_s,
                    beat.end_s,
                    beat.start_s,  # time_offset so timestamps are absolute
                )
                updated = replace(beat, dialogue=tuple(lines))
            except Exception as exc:
                logger.warning("Beat %d transcription failed: %s", beat.beat_id, exc)
                updated = beat
            with_dialogue.append(updated)
        beats = with_dialogue

    # Step 4: LLM dramaturgy (optional)
    if classify_callback is not None:
        try:
            beats = classify_callback(beats)
        except Exception as exc:
            logger.warning("Beat classification failed: %s — keeping UNKNOWN.", exc)

    dialogue_count = sum(1 for b in beats if b.dialogue)
    classified_count = sum(1 for b in beats if b.beat_type != BeatType.UNKNOWN)
    logger.info(
        "Trailer analysis complete: %d beats, %d with dialogue, %d classified.",
        len(beats),
        dialogue_count,
        classified_count,
    )
    return beats
|
||||
@@ -0,0 +1 @@
|
||||
# tests package
|
||||
@@ -0,0 +1,144 @@
|
||||
"""
|
||||
tests/test_config.py — Smoke tests for config loading and model integrity.
|
||||
|
||||
Run with: pytest tests/test_config.py -v
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from src.core.config import load_config, AppConfig
|
||||
from src.core.models import (
|
||||
Scene, TrailerBeat, MatchResult, VibeHit,
|
||||
EditClip, EditTimeline, BeatType, DialogueLine,
|
||||
)
|
||||
|
||||
|
||||
CONFIG_PATH = Path(__file__).parents[1] / "config.toml"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config loader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestConfigLoader:
    """Smoke checks for loading the repository's config.toml."""

    def test_loads_without_error(self) -> None:
        """Loading the shipped config yields an AppConfig."""
        assert isinstance(load_config(CONFIG_PATH), AppConfig)

    def test_project_meta(self) -> None:
        """Version is pinned and the log level is one of the known names."""
        loaded = load_config(CONFIG_PATH)
        assert loaded.version == "2.0.0"
        assert loaded.log_level in ("DEBUG", "INFO", "WARNING", "ERROR")

    def test_cv_thresholds_in_range(self) -> None:
        """Deep-scan threshold is a strict fraction; the coarse step is positive."""
        deep_scan = load_config(CONFIG_PATH).cv.deep_scan
        assert 0.0 < deep_scan.match_threshold < 1.0
        assert deep_scan.coarse_step_seconds > 0

    def test_vibe_check_crop_fractions(self) -> None:
        """Each crop fraction is a strict fraction and together they leave some frame."""
        vibe = load_config(CONFIG_PATH).cv.vibe_check
        assert 0.0 < vibe.crop_top_fraction < 1.0
        assert 0.0 < vibe.crop_bottom_fraction < 1.0
        assert vibe.crop_top_fraction + vibe.crop_bottom_fraction < 1.0

    def test_missing_config_raises(self, tmp_path: Path) -> None:
        """A non-existent config path raises FileNotFoundError."""
        missing = tmp_path / "nonexistent.toml"
        with pytest.raises(FileNotFoundError):
            load_config(missing)

    def test_paths_are_path_objects(self) -> None:
        """Configured media paths are exposed as pathlib.Path."""
        paths = load_config(CONFIG_PATH).paths
        assert isinstance(paths.source_movie, Path)
        assert isinstance(paths.reference_trailer, Path)
|
||||
|
||||
def test_paths_are_path_objects(self) -> None:
|
||||
cfg = load_config(CONFIG_PATH)
|
||||
assert isinstance(cfg.paths.source_movie, Path)
|
||||
assert isinstance(cfg.paths.reference_trailer, Path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data models — construction & properties
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSceneModel:
    """Construction and derived-property checks for ``Scene``."""

    def test_duration(self) -> None:
        """A 10.0 s → 25.5 s scene reports a 15.5 s duration and 17.75 s midpoint."""
        scene = Scene(
            scene_id=0,
            source_path=Path("dummy.mp4"),
            start_s=10.0,
            end_s=25.5,
            start_frame=240,
            end_frame=612,
        )
        expected_duration = pytest.approx(15.5)
        expected_midpoint = pytest.approx(17.75)
        assert scene.duration_s == expected_duration
        assert scene.midpoint_s == expected_midpoint

    def test_immutable(self) -> None:
        """Assigning to a field of an existing ``Scene`` must raise."""
        scene = Scene(
            scene_id=0,
            source_path=Path("x.mp4"),
            start_s=0.0,
            end_s=1.0,
            start_frame=0,
            end_frame=24,
        )
        # Original author's note: the expected error is FrozenInstanceError.
        with pytest.raises(Exception):
            setattr(scene, "scene_id", 99)
|
||||
|
||||
|
||||
class TestTrailerBeatModel:
    """Default-value checks for ``TrailerBeat``."""

    def test_beat_type_default(self) -> None:
        """A beat built without ``beat_type`` reports ``BeatType.UNKNOWN``."""
        beat = TrailerBeat(
            beat_id=0,
            trailer_path=Path("trailer.mp4"),
            start_s=0.0,
            end_s=3.0,
            start_frame=0,
            end_frame=72,
        )
        assert beat.beat_type == BeatType.UNKNOWN
|
||||
|
||||
|
||||
class TestMatchResultModel:
    """Derived-property and ``repr`` checks for ``MatchResult``."""

    def test_duration_computed(self) -> None:
        """In/out points of 120.0 s and 123.5 s give a 3.5 s duration."""
        result = MatchResult(
            beat_id=0,
            scene_id=3,
            source_path=Path("movie.mp4"),
            in_point_s=120.0,
            out_point_s=123.5,
            in_point_frame=2880,
            match_score=0.87,
        )
        assert result.duration_s == pytest.approx(3.5)

    def test_repr_contains_key_info(self) -> None:
        """``repr`` mentions both the beat id and the scene id."""
        result = MatchResult(
            beat_id=1,
            scene_id=7,
            source_path=Path("movie.mp4"),
            in_point_s=60.0,
            out_point_s=63.0,
            in_point_frame=1440,
            match_score=0.91,
        )
        text = repr(result)
        assert "beat=1" in text
        assert "scene=7" in text
|
||||
|
||||
|
||||
class TestEditTimeline:
    """Aggregate-property checks for ``EditTimeline``."""

    def _make_clip(self, idx: int, t_start: float, t_end: float) -> EditClip:
        """Build an ``EditClip`` placed at [t_start, t_end] on the timeline.

        The beat uses the same time span; the matched source range starts at
        0.0 s and has the same length as the beat.
        """
        span = t_end - t_start
        trailer_beat = TrailerBeat(
            beat_id=idx,
            trailer_path=Path("t.mp4"),
            start_s=t_start,
            end_s=t_end,
            start_frame=0,
            end_frame=1,
        )
        source_match = MatchResult(
            beat_id=idx,
            scene_id=0,
            source_path=Path("m.mp4"),
            in_point_s=0.0,
            out_point_s=span,
            in_point_frame=0,
            match_score=0.9,
        )
        return EditClip(
            clip_index=idx,
            beat=trailer_beat,
            match=source_match,
            timeline_start_s=t_start,
            timeline_end_s=t_end,
        )

    def test_total_duration(self) -> None:
        """Two back-to-back clips ending at 9.0 s give a 9.0 s, two-clip timeline."""
        first = self._make_clip(0, 0.0, 5.0)
        second = self._make_clip(1, 5.0, 9.0)
        timeline = EditTimeline(
            title="Test Trailer", frame_rate=23.976, clips=(first, second)
        )
        assert timeline.total_duration_s == pytest.approx(9.0)
        assert timeline.clip_count == 2

    def test_empty_timeline(self) -> None:
        """A timeline without clips has zero total duration."""
        timeline = EditTimeline(title="Empty", frame_rate=24.0, clips=())
        assert timeline.total_duration_s == 0.0
|
||||
@@ -0,0 +1,140 @@
|
||||
"""
|
||||
tests/test_deep_scan.py — Unit tests for frame_extractor (plus a text_safe_crop sanity check)
|
||||
|
||||
Uses small synthetic videos written to a temp file (cv2.VideoWriter) so no real
|
||||
video files are required. Tests cover the pure logic, not hardware decoding.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from src.cv.frame_extractor import (
|
||||
get_video_info,
|
||||
grab_frame_at,
|
||||
iter_frames_stepped,
|
||||
open_video,
|
||||
)
|
||||
from src.cv.fingerprinting import text_safe_crop
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers: build a tiny synthetic video on disk
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Parameters of the synthetic clip written by the helper below.
FPS = 24                   # frames per second passed to the writer
WIDTH, HEIGHT = 320, 240   # frame size in pixels
SECS = 3                   # clip length in seconds
|
||||
|
||||
|
||||
def _make_synthetic_video(path: Path, color_bgr: tuple[int, int, int] = (0, 128, 255)) -> Path:
    """Write a single-colour video of ``SECS`` seconds to *path*.

    The clip has ``FPS * SECS`` identical frames of size ``WIDTH`` x ``HEIGHT``
    and is encoded with the ``mp4v`` FourCC.

    Args:
        path: Destination file for the video.
        color_bgr: Fill colour of every frame, in BGR channel order.

    Returns:
        *path*, so the helper can be used inline.

    Raises:
        RuntimeError: If OpenCV could not open the writer for *path*.
    """
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(str(path), fourcc, float(FPS), (WIDTH, HEIGHT))
    try:
        # A writer that failed to open drops every write() silently, which
        # would surface later as confusing decode failures in the tests.
        if not writer.isOpened():
            raise RuntimeError(f"cv2.VideoWriter could not open {path} with the mp4v codec")
        frame = np.full((HEIGHT, WIDTH, 3), color_bgr, dtype=np.uint8)
        for _ in range(FPS * SECS):
            writer.write(frame)
    finally:
        # Always release so the file handle is closed even on failure.
        writer.release()
    return path
|
||||
|
||||
|
||||
@pytest.fixture
def synthetic_video(tmp_path: Path) -> Path:
    """Provide the path of a synthetic clip written into pytest's ``tmp_path``."""
    video_path = tmp_path / "test.mp4"
    return _make_synthetic_video(video_path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# open_video
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestOpenVideo:
    """Checks for the ``open_video`` context manager."""

    def test_opens_valid_file(self, synthetic_video: Path) -> None:
        """An existing video yields a capture object that reports it is open."""
        with open_video(synthetic_video) as capture:
            assert capture.isOpened()

    def test_raises_on_missing_file(self, tmp_path: Path) -> None:
        """A non-existent path raises ``FileNotFoundError``."""
        missing = tmp_path / "ghost.mp4"
        with pytest.raises(FileNotFoundError), open_video(missing):
            pass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_video_info
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestGetVideoInfo:
    """Checks on the metadata mapping returned by ``get_video_info``."""

    def test_returns_correct_fps(self, synthetic_video: Path) -> None:
        """Reported fps is within 5 % of the rate the clip was written at."""
        reported_fps = get_video_info(synthetic_video)["fps"]
        assert reported_fps == pytest.approx(FPS, rel=0.05)

    def test_duration_approx(self, synthetic_video: Path) -> None:
        """Reported duration is within 10 % of the written clip length."""
        reported_duration = get_video_info(synthetic_video)["duration_s"]
        assert reported_duration == pytest.approx(SECS, rel=0.1)

    def test_resolution(self, synthetic_video: Path) -> None:
        """Reported width and height equal the written frame size."""
        meta = get_video_info(synthetic_video)
        assert meta["width"] == WIDTH
        assert meta["height"] == HEIGHT
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# grab_frame_at
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestGrabFrameAt:
    """Checks for ``grab_frame_at`` on the synthetic clip."""

    def test_returns_ndarray(self, synthetic_video: Path) -> None:
        """A timestamp inside the clip yields a full-size 3-channel array."""
        with open_video(synthetic_video) as capture:
            grabbed = grab_frame_at(capture, 1.0)
        assert grabbed is not None
        assert isinstance(grabbed, np.ndarray)
        assert grabbed.shape == (HEIGHT, WIDTH, 3)

    def test_returns_none_past_end(self, synthetic_video: Path) -> None:
        """A timestamp far beyond the end must not raise."""
        with open_video(synthetic_video) as capture:
            grabbed = grab_frame_at(capture, 9999.0)
        # Depending on the codec the result is either None or a repeated last
        # frame, so the only requirement is that the call completes.
        assert grabbed is None or isinstance(grabbed, np.ndarray)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# iter_frames_stepped
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestIterFramesStepped:
    """Checks for the stepped frame iterator ``iter_frames_stepped``."""

    def test_yields_correct_count(self, synthetic_video: Path) -> None:
        """Stepping 0.0 → 1.0 s in 0.5 s steps yields three items."""
        with open_video(synthetic_video) as capture:
            yielded = list(iter_frames_stepped(capture, 0.0, 1.0, 0.5))
        # Expected timestamps: 0.0, 0.5 and 1.0.
        assert len(yielded) == 3

    def test_timestamps_increasing(self, synthetic_video: Path) -> None:
        """The timestamps come out already in sorted order."""
        with open_video(synthetic_video) as capture:
            yielded = list(iter_frames_stepped(capture, 0.0, 2.0, 0.5))
        stamps = [stamp for stamp, _ in yielded]
        assert stamps == sorted(stamps)

    def test_invalid_step_raises(self, synthetic_video: Path) -> None:
        """A zero step is rejected with a ``ValueError`` mentioning ``step_s``."""
        with open_video(synthetic_video) as capture, pytest.raises(ValueError, match="step_s"):
            list(iter_frames_stepped(capture, 0.0, 1.0, 0.0))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# text_safe_crop integration (sanity: cropped height consistent)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCropSanity:
    """Sanity check of ``text_safe_crop`` on a frame decoded from the clip."""

    def test_crop_reduces_height(self, synthetic_video: Path) -> None:
        """Cropping top and bottom shrinks the height and keeps the width."""
        with open_video(synthetic_video) as capture:
            decoded = grab_frame_at(capture, 0.5)
        assert decoded is not None
        full_h, full_w = decoded.shape[0], decoded.shape[1]
        cropped = text_safe_crop(decoded, 0.15, 0.30)
        assert cropped.shape[0] < full_h
        assert cropped.shape[1] == full_w  # width unchanged
|
||||
@@ -0,0 +1,218 @@
|
||||
"""
|
||||
tests/test_export.py — Unit tests for timecode conversion and export writers
|
||||
|
||||
Tests use synthetic EditTimeline objects (no real video files needed).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from src.export.timecode import (
|
||||
seconds_to_fcpxml,
|
||||
seconds_to_smpte,
|
||||
fcpxml_frame_duration,
|
||||
fcpxml_format_name,
|
||||
seconds_to_frame_count,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Timecode helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSecondsToFcpxml:
    """Checks on the rational time strings produced by ``seconds_to_fcpxml``."""

    def test_zero(self) -> None:
        """Zero seconds is rendered as the bare string ``0s``."""
        assert seconds_to_fcpxml(0.0, 24.0) == "0s"

    def test_one_second_at_24fps(self) -> None:
        """1.0 s at 24 fps (24 frames) is rendered in lowest terms as ``1/1s``."""
        rendered = seconds_to_fcpxml(1.0, 24.0)
        assert rendered == "1/1s"

    def test_one_second_at_23976(self) -> None:
        """At 23.976 fps only the rational ``N/Ds`` shape is checked."""
        # The exact fraction is deliberately not pinned here.
        rendered = seconds_to_fcpxml(1.0, 23.976)
        assert rendered.endswith("s")
        assert "/" in rendered

    def test_ten_seconds_at_25fps(self) -> None:
        """10 s at 25 fps (250 frames) is rendered as ``10/1s``."""
        rendered = seconds_to_fcpxml(10.0, 25.0)
        assert rendered == "10/1s"

    def test_rational_is_reduced(self) -> None:
        """Numerator and denominator share no common factor (never ``24/24s``)."""
        import math

        rendered = seconds_to_fcpxml(1.0, 24.0)
        numerator, denominator = rendered.rstrip("s").split("/")
        assert math.gcd(int(numerator), int(denominator)) == 1
|
||||
|
||||
|
||||
class TestSecondsToSmpte:
    """Checks on the ``HH:MM:SS:FF`` strings produced by ``seconds_to_smpte``."""

    def test_zero(self) -> None:
        """Zero seconds maps to the all-zero timecode."""
        timecode = seconds_to_smpte(0.0, 24.0)
        assert timecode == "00:00:00:00"

    def test_one_minute(self) -> None:
        """60 s at 25 fps rolls over into the minutes field."""
        timecode = seconds_to_smpte(60.0, 25.0)
        assert timecode == "00:01:00:00"

    def test_one_hour(self) -> None:
        """3600 s at 24 fps rolls over into the hours field."""
        timecode = seconds_to_smpte(3600.0, 24.0)
        assert timecode == "01:00:00:00"

    def test_frames_overflow(self) -> None:
        """26 frames at 25 fps is one second plus one frame."""
        one_second_one_frame = 26 / 25
        timecode = seconds_to_smpte(one_second_one_frame, 25.0)
        assert timecode == "00:00:01:01"

    def test_format_length(self) -> None:
        """The timecode has four colon-separated fields of two characters each."""
        fields = seconds_to_smpte(123.456, 23.976).split(":")
        assert len(fields) == 4
        assert all(len(field) == 2 for field in fields)
|
||||
|
||||
|
||||
class TestFcpxmlHelpers:
    """Checks for the FCPXML frame-duration, format-name and frame-count helpers."""

    def test_frame_duration_24fps(self) -> None:
        """One frame at 24 fps lasts ``1/24s``."""
        assert fcpxml_frame_duration(24.0) == "1/24s"

    def test_frame_duration_23976(self) -> None:
        """One frame at 23.976 fps lasts ``1001/24000s``."""
        frame_duration = fcpxml_frame_duration(23.976)
        assert frame_duration == "1001/24000s"

    def test_format_name_1080p_2398(self) -> None:
        """The 1920x1080 @ 23.976 format name mentions ``1080`` and ``2398``."""
        format_name = fcpxml_format_name(23.976, 1920, 1080)
        assert "1080" in format_name
        assert "2398" in format_name

    def test_frame_count_roundtrip(self) -> None:
        """10 s at 25 fps corresponds to 250 frames."""
        seconds, fps = 10.0, 25.0
        assert seconds_to_frame_count(seconds, fps) == 250
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# EDL writer (string output)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEdlWriter:
    """Checks on the text file produced by ``write_edl`` for a one-clip timeline."""

    def _make_timeline(self) -> "src.core.models.EditTimeline":  # type: ignore
        """Assemble a 25 fps timeline holding a single 5-second HOOK clip."""
        from src.core.models import (
            BeatType, EditClip, EditTimeline, MatchResult, TrailerBeat,
        )

        hook_beat = TrailerBeat(
            beat_id=0,
            trailer_path=Path("trailer.mp4"),
            start_s=0.0,
            end_s=5.0,
            start_frame=0,
            end_frame=120,
            beat_type=BeatType.HOOK,
        )
        source_match = MatchResult(
            beat_id=0,
            scene_id=3,
            source_path=Path("movie.mp4"),
            in_point_s=30.0,
            out_point_s=35.0,
            in_point_frame=720,
            match_score=0.88,
        )
        only_clip = EditClip(
            clip_index=0,
            beat=hook_beat,
            match=source_match,
            timeline_start_s=0.0,
            timeline_end_s=5.0,
        )
        return EditTimeline(title="TestTrailer", frame_rate=25.0, clips=(only_clip,))

    def test_edl_contains_title(self, tmp_path: Path) -> None:
        """The written EDL carries a ``TITLE:`` line with the timeline title."""
        from src.core.config import load_config
        from src.export.edl_writer import write_edl

        config = load_config()
        timeline = self._make_timeline()
        edl_path = write_edl(timeline, config, output_path=tmp_path / "test.edl")

        contents = edl_path.read_text(encoding="utf-8")
        assert "TITLE: TestTrailer" in contents

    def test_edl_has_event_line(self, tmp_path: Path) -> None:
        """The written EDL contains the first event number and the reel name."""
        from src.core.config import load_config
        from src.export.edl_writer import write_edl

        config = load_config()
        timeline = self._make_timeline()
        edl_path = write_edl(timeline, config, output_path=tmp_path / "test.edl")

        contents = edl_path.read_text(encoding="utf-8")
        event_number, reel_name = "001", "AX"
        assert event_number in contents
        assert reel_name in contents
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FCPXML writer (XML structure)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFcpxmlWriter:
    """Structural checks on the XML document produced by ``write_fcpxml``."""

    def _make_timeline(self) -> "src.core.models.EditTimeline":  # type: ignore
        """Assemble a 25 fps timeline holding a single 5-second HOOK clip."""
        from src.core.models import (
            BeatType, EditClip, EditTimeline, MatchResult, TrailerBeat,
        )

        beat = TrailerBeat(
            beat_id=0, trailer_path=Path("trailer.mp4"),
            start_s=0.0, end_s=5.0, start_frame=0, end_frame=120,
            beat_type=BeatType.HOOK,
        )
        match = MatchResult(
            beat_id=0, scene_id=3,
            source_path=Path("B:/Proxy/movie.mp4"),
            in_point_s=30.0, out_point_s=35.0,
            in_point_frame=720, match_score=0.88,
        )
        clip = EditClip(
            clip_index=0, beat=beat, match=match,
            timeline_start_s=0.0, timeline_end_s=5.0,
        )
        return EditTimeline(
            title="TestTrailer", frame_rate=25.0, clips=(clip,)
        )

    def _export_and_parse(self, tmp_path: Path) -> "xml.etree.ElementTree.Element":  # type: ignore
        """Write the one-clip timeline to *tmp_path* and return the parsed XML root.

        Lines starting with ``<!DOCTYPE`` are removed from the written text
        before it is handed to ``ElementTree.fromstring``. Both tests share
        this sequence, so it lives in one place.
        """
        from xml.etree import ElementTree as ET
        from src.core.config import load_config
        from src.export.fcpxml_writer import write_fcpxml

        cfg = load_config()
        tl = self._make_timeline()
        out = write_fcpxml(tl, cfg, output_path=tmp_path / "test.fcpxml")

        text = out.read_text(encoding="utf-8")
        text_no_doctype = "\n".join(
            line for line in text.splitlines()
            if not line.strip().startswith("<!DOCTYPE")
        )
        return ET.fromstring(text_no_doctype)

    def test_fcpxml_is_valid_xml(self, tmp_path: Path) -> None:
        """The output parses as XML and its root element is ``fcpxml``."""
        root = self._export_and_parse(tmp_path)
        # Strip a "{namespace}" prefix, if present, before comparing the tag.
        local_tag = root.tag.split("}")[-1] if "}" in root.tag else root.tag
        assert local_tag == "fcpxml"

    def test_fcpxml_has_spine(self, tmp_path: Path) -> None:
        """The document contains a ``spine`` element with exactly one child."""
        root = self._export_and_parse(tmp_path)
        # Map a prefix to the FCPXML namespace so find() can resolve "fcp:spine".
        ns = {"fcp": "http://www.apple.com/dt/FCPXML/1_10"}
        spine = root.find(".//fcp:spine", ns)
        assert spine is not None
        clips = list(spine)
        assert len(clips) == 1
|
||||
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
tests/test_fingerprinting.py — Unit tests for src/cv/fingerprinting.py
|
||||
|
||||
Tests run WITHOUT requiring real video files.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from src.cv.fingerprinting import (
|
||||
text_safe_crop,
|
||||
extract_hs_histograms,
|
||||
compare_histograms,
|
||||
hist_to_bytes,
|
||||
bytes_to_hist,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture
def solid_blue_frame() -> np.ndarray:
    """Return a 256×256 uint8 frame filled with pure blue (BGR order)."""
    bgr_blue = (255, 0, 0)
    return np.full((256, 256, 3), bgr_blue, dtype=np.uint8)
|
||||
|
||||
|
||||
@pytest.fixture
def solid_red_frame() -> np.ndarray:
    """Return a 256×256 uint8 frame filled with pure red (BGR order)."""
    bgr_red = (0, 0, 255)
    return np.full((256, 256, 3), bgr_red, dtype=np.uint8)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# text_safe_crop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTextSafeCrop:
    """Checks for ``text_safe_crop`` row removal and argument validation."""

    def test_removes_correct_rows(self, solid_blue_frame: np.ndarray) -> None:
        """The cropped height equals the bottom cut row minus the top cut row."""
        result = text_safe_crop(solid_blue_frame, crop_top=0.15, crop_bottom=0.30)
        height = solid_blue_frame.shape[0]  # 256
        last_row = int(height * (1.0 - 0.30))
        first_row = int(height * 0.15)
        assert result.shape[0] == last_row - first_row

    def test_zero_crop_returns_same_size(self, solid_blue_frame: np.ndarray) -> None:
        """With both fractions at zero the shape is unchanged."""
        result = text_safe_crop(solid_blue_frame, crop_top=0.0, crop_bottom=0.0)
        assert result.shape == solid_blue_frame.shape

    def test_invalid_top_raises(self, solid_blue_frame: np.ndarray) -> None:
        """``crop_top=1.0`` is rejected with an error naming ``crop_top``."""
        rejects_top = pytest.raises(ValueError, match="crop_top")
        with rejects_top:
            text_safe_crop(solid_blue_frame, crop_top=1.0, crop_bottom=0.0)

    def test_invalid_bottom_raises(self, solid_blue_frame: np.ndarray) -> None:
        """A negative ``crop_bottom`` is rejected with an error naming it."""
        rejects_bottom = pytest.raises(ValueError, match="crop_bottom")
        with rejects_bottom:
            text_safe_crop(solid_blue_frame, crop_top=0.0, crop_bottom=-0.1)

    def test_overlapping_crops_raise(self, solid_blue_frame: np.ndarray) -> None:
        """Fractions summing to more than 1.0 are rejected."""
        rejects_overlap = pytest.raises(ValueError, match="must be < 1.0")
        with rejects_overlap:
            text_safe_crop(solid_blue_frame, crop_top=0.6, crop_bottom=0.5)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Histograms
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHistograms:
    """Checks for ``extract_hs_histograms`` and ``compare_histograms``."""

    def test_output_shape(self, solid_blue_frame: np.ndarray) -> None:
        """Each returned histogram is 1-D with the requested number of bins."""
        hue_hist, sat_hist = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60)
        assert hue_hist.shape == (50,)
        assert sat_hist.shape == (60,)

    def test_normalised(self, solid_blue_frame: np.ndarray) -> None:
        """Both histograms have an L2 norm of approximately 1.0."""
        hue_hist, sat_hist = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60)
        # np comes from the module-level import; no function-local import needed.
        assert np.linalg.norm(hue_hist) == pytest.approx(1.0, abs=1e-5)
        assert np.linalg.norm(sat_hist) == pytest.approx(1.0, abs=1e-5)

    def test_same_frame_correl_is_one(self, solid_blue_frame: np.ndarray) -> None:
        """Correlating a histogram with itself scores approximately 1.0."""
        import cv2  # not imported at module level in this test file

        hue_hist, _ = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60)
        score = compare_histograms(hue_hist, hue_hist, method=cv2.HISTCMP_CORREL)
        assert score == pytest.approx(1.0, abs=1e-5)

    def test_different_frames_correl_lower(
        self,
        solid_blue_frame: np.ndarray,
        solid_red_frame: np.ndarray,
    ) -> None:
        """Blue and red frames correlate below the self-correlation score of 1.0."""
        import cv2  # not imported at module level in this test file

        hue_blue, _ = extract_hs_histograms(solid_blue_frame, bins_hue=50, bins_sat=60)
        hue_red, _ = extract_hs_histograms(solid_red_frame, bins_hue=50, bins_sat=60)
        score = compare_histograms(hue_blue, hue_red, method=cv2.HISTCMP_CORREL)
        assert score < 1.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Serialisation round-trip
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSerialisation:
    """Round-trip check for ``hist_to_bytes`` / ``bytes_to_hist``."""

    def test_round_trip(self, solid_blue_frame: np.ndarray) -> None:
        """Serialising and restoring a histogram gives back (almost) equal values."""
        original, _ = extract_hs_histograms(solid_blue_frame, 50, 60)
        payload = hist_to_bytes(original)
        recovered = bytes_to_hist(payload)
        np.testing.assert_array_almost_equal(original, recovered)
|
||||
Reference in New Issue
Block a user