183 lines
5.7 KiB
Python
183 lines
5.7 KiB
Python
"""
|
||
src/audio/transcriber.py — Whisper transcription via faster-whisper
|
||
|
||
Responsibility:
|
||
- Transcribe audio from a video file into a list of DialogueLine objects
|
||
- Optionally restrict to a time window [start_s, end_s] (for single beats)
|
||
- All model config (model name, device, compute_type) comes from AppConfig
|
||
|
||
The LLM is NOT used here. This is pure audio-to-text.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import tempfile
|
||
from pathlib import Path
|
||
from typing import Sequence
|
||
|
||
from src.core.config import AppConfig
|
||
from src.core.models import DialogueLine
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Audio extraction helper (video → wav via ffmpeg)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _extract_audio_segment(
|
||
video_path: Path,
|
||
start_s: float | None,
|
||
end_s: float | None,
|
||
out_wav: Path,
|
||
) -> None:
|
||
"""
|
||
Use ffmpeg (subprocess) to extract a mono 16kHz WAV from *video_path*.
|
||
|
||
Args:
|
||
video_path: Source video.
|
||
start_s: Start time in seconds (None = beginning of file).
|
||
end_s: End time in seconds (None = end of file).
|
||
out_wav: Destination WAV path.
|
||
|
||
Raises:
|
||
RuntimeError: If ffmpeg exits with a non-zero code.
|
||
"""
|
||
import subprocess
|
||
|
||
cmd = ["ffmpeg", "-y", "-loglevel", "error"]
|
||
|
||
if start_s is not None:
|
||
cmd += ["-ss", str(start_s)]
|
||
if end_s is not None and start_s is not None:
|
||
cmd += ["-t", str(end_s - start_s)]
|
||
elif end_s is not None:
|
||
cmd += ["-to", str(end_s)]
|
||
|
||
cmd += [
|
||
"-i", str(video_path),
|
||
"-vn", # no video
|
||
"-ac", "1", # mono
|
||
"-ar", "16000", # 16 kHz — Whisper native rate
|
||
"-f", "wav",
|
||
str(out_wav),
|
||
]
|
||
|
||
result = subprocess.run(cmd, capture_output=True)
|
||
if result.returncode != 0:
|
||
raise RuntimeError(
|
||
f"ffmpeg failed (code {result.returncode}):\n"
|
||
f"{result.stderr.decode(errors='replace')}"
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Core transcription
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def transcribe_video(
    video_path: Path,
    cfg: AppConfig,
    start_s: float | None = None,
    end_s: float | None = None,
    time_offset_s: float = 0.0,
) -> list[DialogueLine]:
    """
    Transcribe dialogue from *video_path* using faster-whisper.

    The audio window is first extracted to a temporary WAV file, then passed
    to a ``WhisperModel`` built from the ``cfg.whisper`` settings.

    Args:
        video_path:    Path to source or trailer video.
        cfg:           Application configuration (whisper section: model,
                       device, compute_type, language).
        start_s:       Clip start in video-file seconds (None = beginning).
        end_s:         Clip end in video-file seconds (None = end of file).
        time_offset_s: Added to every transcript timestamp so that beat-level
                       transcripts align with absolute movie time.

    Returns:
        List of DialogueLine in the order the model produced them.

    Raises:
        ImportError:  If faster-whisper is not installed.
        RuntimeError: Propagated from the ffmpeg audio-extraction step.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "faster-whisper not installed. Run: pip install faster-whisper"
        ) from err

    w = cfg.whisper

    logger.info(
        "Transcribing %s [%.1f–%s] with %s on %s …",
        video_path.name,
        0.0 if start_s is None else start_s,
        # Explicit None check: an end of 0.0 is a real bound, not "end of file".
        f"{end_s:.1f}s" if end_s is not None else "end",
        w.model,
        w.device,
    )

    with tempfile.TemporaryDirectory() as tmp:
        wav = Path(tmp) / "audio.wav"
        _extract_audio_segment(video_path, start_s, end_s, wav)

        # NOTE(review): the model is constructed on every call; callers that
        # transcribe many short beats may want to confirm this cost is acceptable.
        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
        segments, _ = model.transcribe(
            str(wav),
            language=w.language or None,  # empty/unset language -> None
            beam_size=5,
        )

        # Consume the segments while the temporary WAV still exists.
        lines: list[DialogueLine] = [
            DialogueLine(
                start_s=seg.start + time_offset_s,
                end_s=seg.end + time_offset_s,
                text=seg.text.strip(),
            )
            for seg in segments
        ]

    logger.info("Transcription done: %d segments.", len(lines))
    return lines
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Convenience: transcribe the whole movie, then distribute the lines into scenes
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def transcribe_full_movie(cfg: AppConfig) -> list[DialogueLine]:
    """
    Run a full-length transcription of the configured source movie.

    No time window or offset is applied: the file at
    ``cfg.paths.source_movie`` is handed to ``transcribe_video`` as-is.
    The result is intended for enriching Scenes via a dialogue_callback
    passed to build_scene_index().

    Args:
        cfg: Application configuration; supplies the movie path and the
             whisper settings.

    Returns:
        The transcript returned by ``transcribe_video``.
    """
    movie_path = cfg.paths.source_movie
    return transcribe_video(movie_path, cfg)
|
||
|
||
|
||
def assign_dialogue_to_scenes(
    all_dialogue: Sequence[DialogueLine],
    scenes: list["src.core.models.Scene"],  # type: ignore[name-defined]
) -> list["src.core.models.Scene"]:  # type: ignore[name-defined]
    """
    Distribute pre-transcribed DialogueLines into their respective Scenes.

    A line is assigned to every scene whose half-open window
    ``[scene.start_s, scene.end_s)`` contains the line's midpoint. Lines
    whose midpoint falls inside no scene appear in none of the results.

    Args:
        all_dialogue: Full-movie transcript as flat list.
        scenes:       Scene list; the input objects are not modified, copies
                      are produced with ``dataclasses.replace``.

    Returns:
        New list of Scene objects, in input order, with ``dialogue`` set to a
        tuple of the matching lines (in transcript order).
    """
    from dataclasses import replace

    # A midpoint depends only on the line, so compute each one once instead
    # of once per (scene, line) pair.
    midpoints = [
        ((line.start_s + line.end_s) / 2.0, line) for line in all_dialogue
    ]

    enriched = []
    for scene in scenes:
        matched = tuple(
            line for mid, line in midpoints
            if scene.start_s <= mid < scene.end_s
        )
        enriched.append(replace(scene, dialogue=matched))

    total_assigned = sum(len(s.dialogue) for s in enriched)
    logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched))
    return enriched
|