Initial project import

2026-05-02 09:07:41 +02:00
commit 8e1bcf142f
38 changed files with 7928 additions and 0 deletions
@@ -0,0 +1,182 @@
+"""
+src/audio/transcriber.py — Whisper transcription via faster-whisper
+
+Responsibility:
+  - Transcribe audio from a video file into a list of DialogueLine objects
+  - Optionally restrict to a time window [start_s, end_s] (for single beats)
+  - All model config (model name, device, compute_type) comes from AppConfig
+
+The LLM is NOT used here. This is pure audio-to-text.
+"""
+
+from __future__ import annotations
+
+import logging
+import tempfile
+from pathlib import Path
+from typing import Sequence
+
+from src.core.config import AppConfig
+from src.core.models import DialogueLine
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Audio extraction helper (video → wav via ffmpeg)
+# ---------------------------------------------------------------------------
+
+def _extract_audio_segment(
+    video_path: Path,
+    start_s: float | None,
+    end_s: float | None,
+    out_wav: Path,
+) -> None:
+    """
+    Use ffmpeg (subprocess) to extract a mono 16kHz WAV from *video_path*.
+
+    Args:
+        video_path: Source video.
+        start_s:    Start time in seconds (None = beginning of file).
+        end_s:      End time in seconds (None = end of file).
+        out_wav:    Destination WAV path.
+
+    Raises:
+        RuntimeError: If ffmpeg exits with a non-zero code.
+    """
+    import subprocess
+
+    cmd = ["ffmpeg", "-y", "-loglevel", "error"]
+
+    if start_s is not None:
+        cmd += ["-ss", str(start_s)]
+    if end_s is not None and start_s is not None:
+        cmd += ["-t", str(end_s - start_s)]
+    elif end_s is not None:
+        cmd += ["-to", str(end_s)]
+
+    cmd += [
+        "-i", str(video_path),
+        "-vn",                        # no video
+        "-ac", "1",                   # mono
+        "-ar", "16000",               # 16 kHz — Whisper native rate
+        "-f", "wav",
+        str(out_wav),
+    ]
+
+    result = subprocess.run(cmd, capture_output=True)
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"ffmpeg failed (code {result.returncode}):\n"
+            f"{result.stderr.decode(errors='replace')}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Core transcription
+# ---------------------------------------------------------------------------
+
+def transcribe_video(
+    video_path: Path,
+    cfg: AppConfig,
+    start_s: float | None = None,
+    end_s: float | None = None,
+    time_offset_s: float = 0.0,
+) -> list[DialogueLine]:
+    """
+    Transcribe dialogue from *video_path* using faster-whisper.
+
+    Args:
+        video_path:    Path to source or trailer video.
+        cfg:           Application configuration (whisper section).
+        start_s:       Clip start in video-file seconds (None = beginning).
+        end_s:         Clip end   in video-file seconds (None = end of file).
+        time_offset_s: Added to every transcript timestamp so that beat-level
+                       transcripts align with absolute movie time.
+
+    Returns:
+        List of DialogueLine ordered by start time.
+    """
+    try:
+        from faster_whisper import WhisperModel
+    except ImportError:
+        raise ImportError("faster-whisper not installed. Run: pip install faster-whisper")
+
+    w = cfg.whisper
+
+    logger.info(
+        "Transcribing %s [%.1f–%s] with %s on %s …",
+        video_path.name,
+        start_s or 0.0,
+        f"{end_s:.1f}s" if end_s else "end",
+        w.model,
+        w.device,
+    )
+
+    with tempfile.TemporaryDirectory() as tmp:
+        wav = Path(tmp) / "audio.wav"
+        _extract_audio_segment(video_path, start_s, end_s, wav)
+
+        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
+        segments, _ = model.transcribe(
+            str(wav),
+            language=w.language if w.language else None,
+            beam_size=5,
+        )
+
+        lines: list[DialogueLine] = []
+        for seg in segments:
+            lines.append(DialogueLine(
+                start_s=seg.start + time_offset_s,
+                end_s=seg.end   + time_offset_s,
+                text=seg.text.strip(),
+            ))
+
+    logger.info("Transcription done: %d segments.", len(lines))
+    return lines
+
+
+# ---------------------------------------------------------------------------
+# Convenience: transcribe a whole file, then group the lines by scene
+# ---------------------------------------------------------------------------
+
+def transcribe_full_movie(
+    cfg: AppConfig,
+) -> list[DialogueLine]:
+    """
+    Transcribe the entire source movie. Use this result to enrich Scenes
+    via a dialogue_callback passed to build_scene_index().
+    """
+    return transcribe_video(cfg.paths.source_movie, cfg)
+
+
+def assign_dialogue_to_scenes(
+    all_dialogue: Sequence[DialogueLine],
+    scenes: list["src.core.models.Scene"],  # type: ignore[name-defined]
+) -> list["src.core.models.Scene"]:  # type: ignore[name-defined]
+    """
+    Distribute pre-transcribed DialogueLines into their respective Scenes.
+
+    A line is assigned to the scene whose window contains its midpoint.
+
+    Args:
+        all_dialogue: Full-movie transcript as flat list.
+        scenes:       Scene list (will be replaced with enriched copies).
+
+    Returns:
+        New list of Scene objects with dialogue tuples populated.
+    """
+    from dataclasses import replace
+    from src.core.models import Scene
+
+    enriched: list[Scene] = []
+    for scene in scenes:
+        matched = tuple(
+            line for line in all_dialogue
+            if scene.start_s <= (line.start_s + line.end_s) / 2.0 < scene.end_s
+        )
+        enriched.append(replace(scene, dialogue=matched))
+
+    total_assigned = sum(len(s.dialogue) for s in enriched)
+    logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched))
+    return enriched