Initial project import

This commit is contained in:
Melbar
2026-05-02 09:07:41 +02:00
commit 8e1bcf142f
38 changed files with 7928 additions and 0 deletions
+182
View File
@@ -0,0 +1,182 @@
"""
src/audio/transcriber.py — Whisper transcription via faster-whisper
Responsibility:
- Transcribe audio from a video file into a list of DialogueLine objects
- Optionally restrict to a time window [start_s, end_s] (for single beats)
- All model config (model name, device, compute_type) comes from AppConfig
The LLM is NOT used here. This is pure audio-to-text.
"""
from __future__ import annotations
import logging
import tempfile
from pathlib import Path
from typing import Sequence
from src.core.config import AppConfig
from src.core.models import DialogueLine
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Audio extraction helper (video → wav via ffmpeg)
# ---------------------------------------------------------------------------
def _extract_audio_segment(
    video_path: Path,
    start_s: float | None,
    end_s: float | None,
    out_wav: Path,
) -> None:
    """
    Run ffmpeg to write the audio track of *video_path* to *out_wav*.

    The output is requested as a single-channel, 16 kHz WAV file and any
    existing file at *out_wav* is overwritten (``-y``).

    Args:
        video_path: Source video.
        start_s: Window start in seconds; None starts at the beginning.
        end_s: Window end in seconds; None runs to the end of the file.
        out_wav: Destination WAV path.
    Raises:
        RuntimeError: If ffmpeg exits with a non-zero code. The message
            carries the exit code and ffmpeg's decoded stderr.
    """
    import subprocess

    # Window options are emitted ahead of "-i" so they act on the input.
    window_args: list[str] = []
    if start_s is not None:
        window_args.extend(("-ss", str(start_s)))
    if end_s is not None:
        if start_s is None:
            # No explicit start: pass the end position directly.
            window_args.extend(("-to", str(end_s)))
        else:
            # Explicit start: pass the window length as a duration.
            window_args.extend(("-t", str(end_s - start_s)))

    argv = [
        "ffmpeg", "-y", "-loglevel", "error",
        *window_args,
        "-i", str(video_path),
        "-vn",            # drop the video stream
        "-ac", "1",       # one audio channel
        "-ar", "16000",   # 16 kHz sample rate
        "-f", "wav",
        str(out_wav),
    ]

    proc = subprocess.run(argv, capture_output=True)
    if proc.returncode == 0:
        return

    stderr_text = proc.stderr.decode(errors="replace")
    raise RuntimeError(
        f"ffmpeg failed (code {proc.returncode}):\n{stderr_text}"
    )
# ---------------------------------------------------------------------------
# Core transcription
# ---------------------------------------------------------------------------
def transcribe_video(
    video_path: Path,
    cfg: AppConfig,
    start_s: float | None = None,
    end_s: float | None = None,
    time_offset_s: float = 0.0,
) -> list[DialogueLine]:
    """
    Transcribe dialogue from *video_path* using faster-whisper.

    The requested window is first extracted to a temporary WAV file with
    ffmpeg, then transcribed with the model configured in ``cfg.whisper``.

    Args:
        video_path: Path to source or trailer video.
        cfg: Application configuration; ``cfg.whisper`` supplies
            ``model``, ``device``, ``compute_type`` and ``language``.
        start_s: Clip start in video-file seconds (None = beginning).
        end_s: Clip end in video-file seconds (None = end of file).
        time_offset_s: Added to every transcript timestamp so that beat-level
            transcripts align with absolute movie time.
    Returns:
        List of DialogueLine in the order faster-whisper emitted them,
        with whitespace-stripped text.
    Raises:
        ImportError: If faster-whisper is not installed.
        RuntimeError: Propagated from _extract_audio_segment when ffmpeg fails.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as exc:
        # Chain the original error so the underlying import failure stays visible.
        raise ImportError(
            "faster-whisper not installed. Run: pip install faster-whisper"
        ) from exc

    w = cfg.whisper
    logger.info(
        "Transcribing %s [%.1fs-%s] with %s on %s",
        video_path.name,
        start_s or 0.0,
        # Compare against None so an explicit end of 0.0 is not shown as "end".
        f"{end_s:.1f}s" if end_s is not None else "end",
        w.model,
        w.device,
    )

    with tempfile.TemporaryDirectory() as tmp:
        wav = Path(tmp) / "audio.wav"
        _extract_audio_segment(video_path, start_s, end_s, wav)

        model = WhisperModel(w.model, device=w.device, compute_type=w.compute_type)
        segments, _ = model.transcribe(
            str(wav),
            language=w.language or None,  # empty/unset language -> None
            beam_size=5,
        )

        # faster-whisper documents `segments` as a lazy generator, so it is
        # consumed here while the temporary WAV still exists.
        lines: list[DialogueLine] = [
            DialogueLine(
                start_s=seg.start + time_offset_s,
                end_s=seg.end + time_offset_s,
                text=seg.text.strip(),
            )
            for seg in segments
        ]

    logger.info("Transcription done: %d segments.", len(lines))
    return lines
# ---------------------------------------------------------------------------
# Convenience: transcribe the whole movie, then distribute the lines across scenes
# ---------------------------------------------------------------------------
def transcribe_full_movie(
    cfg: AppConfig,
) -> list[DialogueLine]:
    """
    Transcribe the whole of ``cfg.paths.source_movie``.

    No time window or timestamp offset is applied. The result is intended
    for enriching Scenes via a dialogue_callback passed to
    build_scene_index().

    Args:
        cfg: Application configuration; provides the movie path and is
            forwarded unchanged to transcribe_video().
    Returns:
        The list of DialogueLine returned by transcribe_video().
    """
    movie_path = cfg.paths.source_movie
    full_transcript = transcribe_video(movie_path, cfg)
    return full_transcript
def assign_dialogue_to_scenes(
    all_dialogue: Sequence[DialogueLine],
    scenes: list["src.core.models.Scene"],  # type: ignore[name-defined]
) -> list["src.core.models.Scene"]:  # type: ignore[name-defined]
    """
    Attach each pre-transcribed DialogueLine to the scene(s) containing it.

    A line belongs to a scene when the midpoint of the line,
    ``(start_s + end_s) / 2``, lies in the half-open window
    ``[scene.start_s, scene.end_s)``. Lines whose midpoint falls in no
    scene window are left out of the result.

    Args:
        all_dialogue: Full-movie transcript as a flat sequence.
        scenes: Scenes to enrich; the inputs are not modified, copies are
            made with ``dataclasses.replace``.
    Returns:
        New list of Scene objects, in input order, each with ``dialogue``
        set to a tuple of its matching lines.
    """
    from dataclasses import replace
    from src.core.models import Scene

    def _lines_within(scene: Scene) -> tuple[DialogueLine, ...]:
        # Collect, in transcript order, the lines whose midpoint is in the scene.
        hits: list[DialogueLine] = []
        for line in all_dialogue:
            if scene.start_s <= (line.start_s + line.end_s) / 2.0 < scene.end_s:
                hits.append(line)
        return tuple(hits)

    enriched: list[Scene] = [
        replace(scene, dialogue=_lines_within(scene)) for scene in scenes
    ]

    total_assigned = 0
    for populated in enriched:
        total_assigned += len(populated.dialogue)
    logger.info("Assigned %d dialogue lines across %d scenes.", total_assigned, len(enriched))
    return enriched