# aitrailer/src/audio/transcriber.py

"""
src/audio/transcriber.py — Whisper transcription via faster-whisper

Responsibility:
  - Transcribe audio from a video file into a list of DialogueLine objects
  - Optionally restrict to a time window [start_s, end_s] (for single beats)
  - All model config (model name, device, compute_type) comes from AppConfig

The LLM is NOT used here. This is pure audio-to-text.
"""

from __future__ import annotations

import logging
import tempfile
from pathlib import Path
from typing import Sequence

from src.core.config import AppConfig
from src.core.models import DialogueLine

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Audio extraction helper (video → wav via ffmpeg)
# ---------------------------------------------------------------------------

def _extract_audio_segment(
    video_path: Path,
    start_s: float | None,
    end_s: float | None,
    out_wav: Path,
) -> None:
    """
    Decode the audio of *video_path* into a mono 16 kHz WAV file via ffmpeg.

    The optional time window is passed to ffmpeg as options placed before
    ``-i``: ``-ss`` for the start, plus ``-t`` (window length) when both
    bounds are given; when only an end bound is given, ``-to`` is used.

    Args:
        video_path: Source video.
        start_s:    Window start in seconds (None = beginning of file).
        end_s:      Window end in seconds (None = end of file).
        out_wav:    Destination WAV path.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero code; the message
                      carries ffmpeg's stderr output.
    """
    import subprocess

    # Work out the seek/duration options first; they depend only on which
    # of the two bounds were supplied.
    window_args: list[str] = []
    if start_s is None:
        if end_s is not None:
            window_args = ["-to", str(end_s)]
    else:
        window_args = ["-ss", str(start_s)]
        if end_s is not None:
            window_args += ["-t", str(end_s - start_s)]

    command = [
        "ffmpeg", "-y", "-loglevel", "error",
        *window_args,
        "-i", str(video_path),
        "-vn",            # drop the video stream
        "-ac", "1",       # single (mono) channel
        "-ar", "16000",   # 16 kHz — Whisper native rate
        "-f", "wav",
        str(out_wav),
    ]

    completed = subprocess.run(command, capture_output=True)
    if completed.returncode == 0:
        return

    stderr_text = completed.stderr.decode(errors="replace")
    raise RuntimeError(
        f"ffmpeg failed (code {completed.returncode}):\n{stderr_text}"
    )


# ---------------------------------------------------------------------------
# Core transcription
# ---------------------------------------------------------------------------

def transcribe_video(
    video_path: Path,
    cfg: AppConfig,
    start_s: float | None = None,
    end_s: float | None = None,
    time_offset_s: float = 0.0,
) -> list[DialogueLine]:
    """
    Transcribe dialogue from *video_path* using faster-whisper.

    The requested window is first extracted to a temporary mono 16 kHz WAV
    with ffmpeg, then transcribed with the model described by
    ``cfg.whisper`` (``model``, ``device``, ``compute_type``, ``language``).
    A falsy ``language`` is passed to the model as ``None``.

    Args:
        video_path:    Path to source or trailer video (a ``str`` is
                       accepted and converted to ``Path``).
        cfg:           Application configuration (whisper section).
        start_s:       Clip start in video-file seconds (None = beginning).
        end_s:         Clip end   in video-file seconds (None = end of file).
        time_offset_s: Added to every transcript timestamp so that beat-level
                       transcripts align with absolute movie time.

    Returns:
        List of DialogueLine in the order the model produced the segments,
        with ``time_offset_s`` applied to start and end and the text
        stripped of surrounding whitespace.

    Raises:
        ImportError:  If faster-whisper is not installed (the original
                      import failure is attached as ``__cause__``).
        RuntimeError: If the ffmpeg audio extraction fails.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as exc:
        # Chain the original failure so the real reason (missing package,
        # broken native dependency, ...) stays visible in the traceback.
        raise ImportError(
            "faster-whisper not installed. Run: pip install faster-whisper"
        ) from exc

    video_path = Path(video_path)  # tolerate plain-string paths from config
    w = cfg.whisper

    logger.info(
        "Transcribing %s [%.1f–%s] with %s on %s …",
        video_path.name,
        start_s if start_s is not None else 0.0,
        # Explicit None test: an end of 0.0 is a bound, not "end of file".
        f"{end_s:.1f}s" if end_s is not None else "end",
        w.model,
        w.device,
    )

    with tempfile.TemporaryDirectory() as tmp:
        wav = Path(tmp) / "audio.wav"
        _extract_audio_segment(video_path, start_s, end_s, wav)

        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
        segments, _ = model.transcribe(
            str(wav),
            language=w.language if w.language else None,
            beam_size=5,
        )

        # faster-whisper yields segments lazily, so they must be consumed
        # here, while the temporary WAV still exists.
        lines: list[DialogueLine] = [
            DialogueLine(
                start_s=seg.start + time_offset_s,
                end_s=seg.end + time_offset_s,
                text=seg.text.strip(),
            )
            for seg in segments
        ]

    logger.info("Transcription done: %d segments.", len(lines))
    return lines


# ---------------------------------------------------------------------------
# Convenience: transcribe the whole movie, then distribute its lines into scenes
# ---------------------------------------------------------------------------

def transcribe_full_movie(
    cfg: AppConfig,
) -> list[DialogueLine]:
    """
    Produce the transcript of the complete source movie.

    The movie location is read from ``cfg.paths.source_movie``; no time
    window and no timestamp offset are applied. The result can be used to
    enrich Scenes via a dialogue_callback passed to build_scene_index().
    """
    source_movie = cfg.paths.source_movie
    return transcribe_video(source_movie, cfg)


def assign_dialogue_to_scenes(
    all_dialogue: Sequence[DialogueLine],
    scenes: list["src.core.models.Scene"],  # type: ignore[name-defined]
) -> list["src.core.models.Scene"]:  # type: ignore[name-defined]
    """
    Distribute pre-transcribed DialogueLines into their respective Scenes.

    A line is assigned to every scene whose half-open window
    ``[scene.start_s, scene.end_s)`` contains the line's midpoint
    ``(start_s + end_s) / 2``. Lines whose midpoint falls in no scene window
    are left out. Within a scene, lines keep the order they have in
    *all_dialogue*.

    Args:
        all_dialogue: Full-movie transcript as flat list.
        scenes:       Scene list; the input objects are not modified.

    Returns:
        New list of Scene objects (built with ``dataclasses.replace``), in
        the same order as *scenes*, with ``dialogue`` set to a tuple of the
        matching lines.
    """
    from dataclasses import replace

    # A line's midpoint does not depend on the scene, so compute it once
    # per line instead of once per (scene, line) pair.
    timed_lines = [
        ((line.start_s + line.end_s) / 2.0, line) for line in all_dialogue
    ]

    enriched = [
        replace(
            scene,
            dialogue=tuple(
                line
                for midpoint, line in timed_lines
                if scene.start_s <= midpoint < scene.end_s
            ),
        )
        for scene in scenes
    ]

    total_assigned = sum(len(s.dialogue) for s in enriched)
    logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched))
    return enriched