""" src/audio/transcriber.py — Whisper transcription via faster-whisper Responsibility: - Transcribe audio from a video file into a list of DialogueLine objects - Optionally restrict to a time window [start_s, end_s] (for single beats) - All model config (model name, device, compute_type) comes from AppConfig The LLM is NOT used here. This is pure audio-to-text. """ from __future__ import annotations import logging import tempfile from pathlib import Path from typing import Sequence from src.core.config import AppConfig from src.core.models import DialogueLine logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Audio extraction helper (video → wav via ffmpeg) # --------------------------------------------------------------------------- def _extract_audio_segment( video_path: Path, start_s: float | None, end_s: float | None, out_wav: Path, ) -> None: """ Use ffmpeg (subprocess) to extract a mono 16kHz WAV from *video_path*. Args: video_path: Source video. start_s: Start time in seconds (None = beginning of file). end_s: End time in seconds (None = end of file). out_wav: Destination WAV path. Raises: RuntimeError: If ffmpeg exits with a non-zero code. """ import subprocess cmd = ["ffmpeg", "-y", "-loglevel", "error"] if start_s is not None: cmd += ["-ss", str(start_s)] if end_s is not None and start_s is not None: cmd += ["-t", str(end_s - start_s)] elif end_s is not None: cmd += ["-to", str(end_s)] cmd += [ "-i", str(video_path), "-vn", # no video "-ac", "1", # mono "-ar", "16000", # 16 kHz — Whisper native rate "-f", "wav", str(out_wav), ] result = subprocess.run(cmd, capture_output=True) if result.returncode != 0: raise RuntimeError( f"ffmpeg failed (code {result.returncode}):\n" f"{result.stderr.decode(errors='replace')}" ) # --------------------------------------------------------------------------- # Core transcription # --------------------------------------------------------------------------- def transcribe_video( video_path: Path, cfg: AppConfig, start_s: float | None = None, end_s: float | None = None, time_offset_s: float = 0.0, ) -> list[DialogueLine]: """ Transcribe dialogue from *video_path* using faster-whisper. Args: video_path: Path to source or trailer video. cfg: Application configuration (whisper section). start_s: Clip start in video-file seconds (None = beginning). end_s: Clip end in video-file seconds (None = end of file). time_offset_s: Added to every transcript timestamp so that beat-level transcripts align with absolute movie time. Returns: List of DialogueLine ordered by start time. """ try: from faster_whisper import WhisperModel except ImportError: raise ImportError("faster-whisper not installed. Run: pip install faster-whisper") w = cfg.whisper logger.info( "Transcribing %s [%.1f–%s] with %s on %s …", video_path.name, start_s or 0.0, f"{end_s:.1f}s" if end_s else "end", w.model, w.device, ) with tempfile.TemporaryDirectory() as tmp: wav = Path(tmp) / "audio.wav" _extract_audio_segment(video_path, start_s, end_s, wav) model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type) segments, _ = model.transcribe( str(wav), language=w.language if w.language else None, beam_size=5, ) lines: list[DialogueLine] = [] for seg in segments: lines.append(DialogueLine( start_s=seg.start + time_offset_s, end_s=seg.end + time_offset_s, text=seg.text.strip(), )) logger.info("Transcription done: %d segments.", len(lines)) return lines # --------------------------------------------------------------------------- # Convenience: transcribe a whole file and return grouped by scene # --------------------------------------------------------------------------- def transcribe_full_movie( cfg: AppConfig, ) -> list[DialogueLine]: """ Transcribe the entire source movie. Use this result to enrich Scenes via a dialogue_callback passed to build_scene_index(). """ return transcribe_video(cfg.paths.source_movie, cfg) def assign_dialogue_to_scenes( all_dialogue: Sequence[DialogueLine], scenes: list["src.core.models.Scene"], # type: ignore[name-defined] ) -> list["src.core.models.Scene"]: # type: ignore[name-defined] """ Distribute pre-transcribed DialogueLines into their respective Scenes. A line is assigned to the scene whose window contains its midpoint. Args: all_dialogue: Full-movie transcript as flat list. scenes: Scene list (will be replaced with enriched copies). Returns: New list of Scene objects with dialogue tuples populated. """ from dataclasses import replace from src.core.models import Scene enriched: list[Scene] = [] for scene in scenes: matched = tuple( line for line in all_dialogue if scene.start_s <= (line.start_s + line.end_s) / 2.0 < scene.end_s ) enriched.append(replace(scene, dialogue=matched)) total_assigned = sum(len(s.dialogue) for s in enriched) logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched)) return enriched