Files
aitrailer/src/audio/transcriber.py
T
2026-05-02 09:07:41 +02:00

183 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
src/audio/transcriber.py — Whisper transcription via faster-whisper
Responsibility:
- Transcribe audio from a video file into a list of DialogueLine objects
- Optionally restrict to a time window [start_s, end_s] (for single beats)
- All model config (model name, device, compute_type) comes from AppConfig
The LLM is NOT used here. This is pure audio-to-text.
"""
from __future__ import annotations
import logging
import tempfile
from pathlib import Path
from typing import Sequence
from src.core.config import AppConfig
from src.core.models import DialogueLine
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Audio extraction helper (video → wav via ffmpeg)
# ---------------------------------------------------------------------------
def _extract_audio_segment(
    video_path: Path,
    start_s: float | None,
    end_s: float | None,
    out_wav: Path,
) -> None:
    """
    Write the audio track of *video_path* to *out_wav* as mono 16 kHz WAV.

    The work is delegated to the ``ffmpeg`` executable, run as a subprocess.
    An optional time window can be selected with *start_s* / *end_s*.

    Args:
        video_path: Source video.
        start_s: Window start in seconds (None = beginning of file).
        end_s: Window end in seconds (None = end of file).
        out_wav: Destination WAV path (overwritten if present, via ``-y``).

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero code; the message
            carries ffmpeg's stderr output.
    """
    import subprocess

    # Seek/trim options are placed before "-i" so they apply to the input.
    # With a start time the window length is passed as a duration ("-t");
    # without one the end time is passed directly ("-to").
    window_args: list[str] = []
    if start_s is None:
        if end_s is not None:
            window_args = ["-to", str(end_s)]
    else:
        window_args = ["-ss", str(start_s)]
        if end_s is not None:
            window_args += ["-t", str(end_s - start_s)]

    argv = [
        "ffmpeg", "-y", "-loglevel", "error",
        *window_args,
        "-i", str(video_path),
        "-vn",            # drop the video stream
        "-ac", "1",       # single channel
        "-ar", "16000",   # 16 kHz sample rate
        "-f", "wav",
        str(out_wav),
    ]

    proc = subprocess.run(argv, capture_output=True)
    if proc.returncode == 0:
        return

    stderr_text = proc.stderr.decode(errors="replace")
    raise RuntimeError(f"ffmpeg failed (code {proc.returncode}):\n{stderr_text}")
# ---------------------------------------------------------------------------
# Core transcription
# ---------------------------------------------------------------------------
def transcribe_video(
    video_path: Path,
    cfg: AppConfig,
    start_s: float | None = None,
    end_s: float | None = None,
    time_offset_s: float = 0.0,
) -> list[DialogueLine]:
    """
    Transcribe dialogue from *video_path* using faster-whisper.

    The requested window is first extracted to a temporary WAV file, which is
    then passed to a Whisper model built from the ``cfg.whisper`` settings
    (``model``, ``device``, ``compute_type``, ``language``).

    Args:
        video_path: Path to source or trailer video.
        cfg: Application configuration (whisper section).
        start_s: Clip start in video-file seconds (None = beginning).
        end_s: Clip end in video-file seconds (None = end of file).
        time_offset_s: Added to every transcript timestamp so that beat-level
            transcripts align with absolute movie time.

    Returns:
        List of DialogueLine in the order the model yielded the segments,
        with surrounding whitespace stripped from each text.

    Raises:
        ImportError: If faster-whisper is not installed (chained to the
            original import failure).
        RuntimeError: Propagated from the audio extraction step when ffmpeg
            fails.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as exc:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError(
            "faster-whisper not installed. Run: pip install faster-whisper"
        ) from exc

    w = cfg.whisper
    logger.info(
        "Transcribing %s [%.1fs-%s] with %s on %s",
        video_path.name,
        start_s or 0.0,
        # Compare against None: an explicit end of 0.0 is not "end of file".
        f"{end_s:.1f}s" if end_s is not None else "end",
        w.model,
        w.device,
    )

    with tempfile.TemporaryDirectory() as tmp:
        wav = Path(tmp) / "audio.wav"
        _extract_audio_segment(video_path, start_s, end_s, wav)

        # NOTE: the model is constructed on every call.
        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
        segments, _ = model.transcribe(
            str(wav),
            language=w.language or None,  # empty/unset language -> auto-detect
            beam_size=5,
        )

        # Consume the segments while the temporary WAV still exists, since
        # faster-whisper produces them lazily during iteration.
        lines = [
            DialogueLine(
                start_s=seg.start + time_offset_s,
                end_s=seg.end + time_offset_s,
                text=seg.text.strip(),
            )
            for seg in segments
        ]

    logger.info("Transcription done: %d segments.", len(lines))
    return lines
# ---------------------------------------------------------------------------
# Convenience: transcribe the whole movie, then distribute the lines into scenes
# ---------------------------------------------------------------------------
def transcribe_full_movie(
    cfg: AppConfig,
) -> list[DialogueLine]:
    """
    Transcribe the complete source movie named in the configuration.

    The movie path is read from ``cfg.paths.source_movie`` and handed to
    transcribe_video() with its default window and offset, so the whole file
    is processed. The returned flat list can be used to enrich Scenes via a
    dialogue_callback passed to build_scene_index().
    """
    movie_path = cfg.paths.source_movie
    return transcribe_video(movie_path, cfg)
def assign_dialogue_to_scenes(
    all_dialogue: Sequence[DialogueLine],
    scenes: list["src.core.models.Scene"],  # type: ignore[name-defined]
) -> list["src.core.models.Scene"]:  # type: ignore[name-defined]
    """
    Distribute pre-transcribed DialogueLines into their respective Scenes.

    A line is assigned to every scene whose half-open window
    ``[scene.start_s, scene.end_s)`` contains the line's midpoint. Lines whose
    midpoint falls in no scene window are left out of the result.

    Args:
        all_dialogue: Full-movie transcript as flat list.
        scenes: Scene list; the input objects are not modified.

    Returns:
        New list of Scene copies (same order as *scenes*) with their
        ``dialogue`` field set to a tuple of the matching lines.
    """
    from dataclasses import replace

    # A line's midpoint does not depend on the scene, so compute it once per
    # line instead of once per (scene, line) pair. Materialising the pairs
    # also means the transcript is only iterated a single time.
    timed_lines = [
        (line, (line.start_s + line.end_s) / 2.0) for line in all_dialogue
    ]

    enriched = []
    for scene in scenes:
        lo, hi = scene.start_s, scene.end_s
        matched = tuple(line for line, mid in timed_lines if lo <= mid < hi)
        enriched.append(replace(scene, dialogue=matched))

    total_assigned = sum(len(s.dialogue) for s in enriched)
    logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched))
    return enriched