Improve vision matching for dissolve-heavy beats

This commit is contained in:
Melbar
2026-05-02 16:15:51 +02:00
parent 858a814db1
commit d9e470c877
4 changed files with 261 additions and 25 deletions
+17 -5
View File
@@ -154,6 +154,11 @@ Modell beschrieben. Die Beschreibungen liegen in
`.cache/vision_descriptions.json` und werden wiederverwendet. Vision erzeugt `.cache/vision_descriptions.json` und werden wiederverwendet. Vision erzeugt
nur zusätzliche Suchanker; der eigentliche Match muss weiterhin durch CV, nur zusätzliche Suchanker; der eigentliche Match muss weiterhin durch CV,
Content-Reranking, Timing und Duration-Coverage bestätigt werden. Content-Reranking, Timing und Duration-Coverage bestätigt werden.
Gecachte Szenenbeschreibungen zählen nur, wenn sie vom aktuell konfigurierten
Vision-Modell stammen. Bei langen semantisch passenden Source-Szenen beschreibt
der Vision-Layer zusätzlich wenige lokale Zeitfenster und cached auch diese
Fenster, damit eine grob ähnliche Szene nicht automatisch mit dem falschen
Bewegungs- oder Dialogmoment gleichgesetzt wird.
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
@@ -235,6 +240,9 @@ Bild-/Phasenvalidierung wie der normale Matcher. Nur nicht gelöste Beats fallen
danach auf den vollständigen Scan zurück. Die Qualitätsparameter für lokale danach auf den vollständigen Scan zurück. Die Qualitätsparameter für lokale
Vision-Szenenscans und Refine-Kandidaten bleiben dabei erhalten; der Prepass ist Vision-Szenenscans und Refine-Kandidaten bleiben dabei erhalten; der Prepass ist
eine Reihenfolge-Optimierung, kein Qualitätsdeckel. eine Reihenfolge-Optimierung, kein Qualitätsdeckel.
Provisional-Treffer aus diesem schnellen Prepass sind nicht endgültig: Wenn sie
unterhalb der Confirmed-Schwelle bleiben, läuft zusätzlich der vollständige
CV-Scan; übernommen wird dann der bestätigte bzw. der besser bewertete Treffer.
OpenRouter-/Vision-Rate-Limits werden mit progressiv längeren Pausen erneut OpenRouter-/Vision-Rate-Limits werden mit progressiv längeren Pausen erneut
versucht. Billing-, Credit- oder Token-Guthaben-Fehler werden dagegen sofort als versucht. Billing-, Credit- oder Token-Guthaben-Fehler werden dagegen sofort als
echter Blocker gemeldet, weil Warten dort nicht hilft. echter Blocker gemeldet, weil Warten dort nicht hilft.
@@ -278,6 +286,10 @@ auf-/abgeblendete Referenzframes aus Score, Inhalts-Reranking,
Phasen-Alignment und Motion-Templates herausgenommen. Blenden sollen bestimmen, Phasen-Alignment und Motion-Templates herausgenommen. Blenden sollen bestimmen,
wie der Clip später exportiert wird, aber nicht, ob der Bildinhalt als Match wie der Clip später exportiert wird, aber nicht, ob der Bildinhalt als Match
gilt. gilt.
Sichtbare Fade-Rampen werden nur in eine matchbare Insel hinein erweitert, wenn
sie strukturell stark zum ersten bzw. letzten scorebaren Frame derselben
Einstellung passen. Doppelbelichtungen aus Cross-Dissolves bleiben dadurch
Übergangsmaterial und werden nicht als einzelner Quellclip erzwungen.
Treffer unter `provisional_content_threshold` werden gar nicht mehr gespeichert Treffer unter `provisional_content_threshold` werden gar nicht mehr gespeichert
oder aus alten Cache-Ergebnissen übernommen. Das verhindert, dass offensichtlich oder aus alten Cache-Ergebnissen übernommen. Das verhindert, dass offensichtlich
falsche Szenen im Report als Match-Kandidat weiterleben. falsche Szenen im Report als Match-Kandidat weiterleben.
@@ -399,15 +411,15 @@ scoreable_contrast_min = 24.0 # Kontrastarme Blenden/Titelinseln ignorieren
[vision] [vision]
enabled = false # Kostenkontrolle: per CLI mit --vision aktivierbar enabled = false # Kostenkontrolle: per CLI mit --vision aktivierbar
model = "google/gemma-4-31b-it" # Muss ein visionfähiges OpenAI-kompatibles Modell sein model = "google/gemma-4-31b-it" # Muss ein visionfähiges OpenAI-kompatibles Modell sein
scene_candidate_top_k = 8 # Nur wenige Top-Szenen pro Beat beschreiben scene_candidate_top_k = 48 # Breiter Vision-Kandidatenpool für schwierige Beats
max_new_descriptions_per_run = 12 # API-Kosten pro Lauf begrenzen max_new_descriptions_per_run = 24 # Max. neu erzeugte (danach gecachte) Beschreibungen pro Lauf; Rate-Limits bekommen Backoff
max_seed_scenes = 3 # Nur beste Vision-Szenen als Suchanker weitergeben max_seed_scenes = 8 # Mehr Vision-Szenen als Suchanker, kein manueller Override
seed_points_per_scene = 12 # Inpoint-Samples pro Vision-Szene seed_points_per_scene = 12 # Inpoint-Samples pro Vision-Szene
seed_score = 0.88 # Vision-Seeds bekommen mehr Priorität als normale Scene-Seeds seed_score = 0.88 # Vision-Seeds bekommen mehr Priorität als normale Scene-Seeds
max_refine_candidates = 6 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene max_refine_candidates = 12 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene
local_scan_step_s = 0.12 # Dichte lokale Bildsuche in Vision-Szenen local_scan_step_s = 0.12 # Dichte lokale Bildsuche in Vision-Szenen
local_scan_max_points_per_scene = 180 # Laufzeitgrenze pro Source-Szene local_scan_max_points_per_scene = 180 # Laufzeitgrenze pro Source-Szene
local_scan_top_candidates = 18 # Beste lokale Kandidaten gehen ins Refinement local_scan_top_candidates = 36 # Beste lokale Kandidaten gehen ins Refinement
local_scan_tie_break_score_delta = 0.08 # Ähnliche Vision-Treffer: frühere Phase bevorzugen local_scan_tie_break_score_delta = 0.08 # Ähnliche Vision-Treffer: frühere Phase bevorzugen
multi_shot_cut_corr_threshold = 0.20 # Interne Trailer-Umschnitte erkennen multi_shot_cut_corr_threshold = 0.20 # Interne Trailer-Umschnitte erkennen
multi_shot_boundary_tolerance_s = 0.20 # Source-Grenze muss zum Trailer-Cut passen multi_shot_boundary_tolerance_s = 0.20 # Source-Grenze muss zum Trailer-Cut passen
+103 -9
View File
@@ -458,7 +458,23 @@ def _find_scene_for_in_point(cfg, in_point_s: float):
def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]: def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]:
"""Find visible source-matchable islands inside a trailer beat.""" """Find visible source-matchable islands inside a trailer beat."""
from src.cv.frame_extractor import grab_frame_at_path from src.cv.frame_extractor import grab_frame_at_path
from src.cv.global_scan import _is_scoreable_reference_frame from src.cv.global_scan import (
_corr_same_size,
_is_scoreable_reference_frame,
_prepare_haystack,
_reference_visibility_stats,
)
def is_visible(frame) -> bool:
if frame is None:
return False
mean_luma, p90_luma, contrast = _reference_visibility_stats(frame, cfg)
visible_luma = (
mean_luma >= cfg.cv.deep_scan.scoreable_luma_mean_min * 0.45
or p90_luma >= cfg.cv.deep_scan.scoreable_luma_p90_min * 0.50
)
visible_contrast = contrast >= max(8.0, cfg.cv.deep_scan.scoreable_contrast_min * 0.30)
return visible_luma and visible_contrast
step_s = max(0.08, cfg.cv.deep_scan.span_sample_step_s) step_s = max(0.08, cfg.cv.deep_scan.span_sample_step_s)
min_segment_s = max(0.32, step_s * 3.0) min_segment_s = max(0.32, step_s * 3.0)
@@ -487,7 +503,46 @@ def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]:
if end - start >= min_segment_s: if end - start >= min_segment_s:
raw.append((start, end)) raw.append((start, end))
return raw expanded: list[tuple[float, float]] = []
same_shot_corr_min = 0.72
for start_s, end_s in raw:
start_anchor = grab_frame_at_path(beat.trailer_path, beat.start_s + start_s)
end_anchor = grab_frame_at_path(beat.trailer_path, beat.start_s + max(start_s, end_s - step_s))
start_feature = _prepare_haystack(start_anchor, cfg) if start_anchor is not None else None
end_feature = _prepare_haystack(end_anchor, cfg) if end_anchor is not None else None
soft_start = start_s
t = round(start_s - step_s, 6)
while t >= 0.0:
frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
if not is_visible(frame):
break
if start_feature is not None and _corr_same_size(_prepare_haystack(frame, cfg), start_feature) < same_shot_corr_min:
break
soft_start = max(0.0, t)
t = round(t - step_s, 6)
soft_end = end_s
t = round(end_s, 6)
while t <= beat.duration_s + 1e-6:
frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
if not is_visible(frame):
break
if end_feature is not None and _corr_same_size(_prepare_haystack(frame, cfg), end_feature) < same_shot_corr_min:
break
soft_end = min(beat.duration_s, t + step_s)
t = round(t + step_s, 6)
if soft_end - soft_start >= min_segment_s:
expanded.append((soft_start, soft_end))
merged: list[tuple[float, float]] = []
for start_s, end_s in expanded:
if merged and start_s - merged[-1][1] <= bridge_gap_s:
merged[-1] = (merged[-1][0], max(merged[-1][1], end_s))
else:
merged.append((start_s, end_s))
return merged
def _trim_beats_to_single_visual_island(beats: list, cfg) -> tuple[list, dict[int, tuple[float, float]]]: def _trim_beats_to_single_visual_island(beats: list, cfg) -> tuple[list, dict[int, tuple[float, float]]]:
@@ -555,6 +610,28 @@ def _apply_single_island_segments(results: list, trims: dict[int, tuple[float, f
return expanded return expanded
def _merge_best_results(existing: list, candidates: list, cfg) -> list:
"""Merge matches by beat, preferring confirmed or higher-scoring results."""
by_id = {r.beat_id: r for r in existing}
for candidate in candidates:
old = by_id.get(candidate.beat_id)
if old is None:
by_id[candidate.beat_id] = candidate
continue
candidate_confirmed = candidate.match_score >= cfg.cv.deep_scan.match_threshold or candidate.is_confirmed
old_confirmed = old.match_score >= cfg.cv.deep_scan.match_threshold or old.is_confirmed
if (
candidate_confirmed and not old_confirmed
or candidate.match_score > old.match_score + cfg.cv.deep_scan.duration_tie_break_score_delta
or (
candidate.match_score >= old.match_score - cfg.cv.deep_scan.duration_tie_break_score_delta
and candidate.duration_s > old.duration_s
)
):
by_id[candidate.beat_id] = candidate
return sorted(by_id.values(), key=lambda r: r.beat_id)
def _attach_visual_segments(results: list, beats: list, cfg) -> list: def _attach_visual_segments(results: list, beats: list, cfg) -> list:
"""Attach automatic sub-shot matches for multi-island trailer beats.""" """Attach automatic sub-shot matches for multi-island trailer beats."""
from dataclasses import replace from dataclasses import replace
@@ -657,16 +734,21 @@ def _run_segment_match(segment_beat, continuity, cfg, allow_fullscan: bool = Tru
seed_in_points=continuity, seed_in_points=continuity,
) )
if fast_matches: if fast_matches:
return fast_matches if not allow_fullscan or all(
m.is_confirmed or m.match_score >= cfg.cv.deep_scan.match_threshold
for m in fast_matches
):
return fast_matches
if not allow_fullscan: if not allow_fullscan:
return [] return fast_matches if cfg.vision.enabled else []
return run_matching( full_matches = run_matching(
cfg, cfg,
[segment_beat], [segment_beat],
seed_in_points=continuity, seed_in_points=continuity,
) )
return _merge_best_results(fast_matches if cfg.vision.enabled else [], full_matches, cfg)
def _match_unmatched_visual_segments( def _match_unmatched_visual_segments(
@@ -862,9 +944,21 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
seed_in_points=seed_in_points, seed_in_points=seed_in_points,
) )
if len(results) < len(scan_beats): if len(results) < len(scan_beats) or any(
matched_ids = {r.beat_id for r in results} not r.is_confirmed and r.match_score < cfg.cv.deep_scan.match_threshold
remaining_beats = [b for b in scan_beats if b.beat_id not in matched_ids] for r in results
):
results_by_id = {r.beat_id: r for r in results}
remaining_beats = [
b for b in scan_beats
if (
b.beat_id not in results_by_id
or (
not results_by_id[b.beat_id].is_confirmed
and results_by_id[b.beat_id].match_score < cfg.cv.deep_scan.match_threshold
)
)
]
if remaining_beats: if remaining_beats:
full_results = run_matching( full_results = run_matching(
cfg, cfg,
@@ -872,7 +966,7 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
force_reindex=args.force_reindex, force_reindex=args.force_reindex,
seed_in_points=seed_in_points, seed_in_points=seed_in_points,
) )
results = sorted([*results, *full_results], key=lambda r: r.beat_id) results = _merge_best_results(results, full_results, cfg)
results = _apply_single_island_segments(results, single_island_trims) results = _apply_single_island_segments(results, single_island_trims)
results = _match_unmatched_visual_segments( results = _match_unmatched_visual_segments(
results, results,
+5 -5
View File
@@ -173,15 +173,15 @@ max_tokens = 350
# Cost controls: per beat, only the top scene-level candidates are described, # Cost controls: per beat, only the top scene-level candidates are described,
# and cached descriptions in .cache/vision_descriptions.json are reused. # and cached descriptions in .cache/vision_descriptions.json are reused.
scene_candidate_top_k = 8 scene_candidate_top_k = 48
max_new_descriptions_per_run = 12 max_new_descriptions_per_run = 24
max_seed_scenes = 3 max_seed_scenes = 8
seed_points_per_scene = 12 seed_points_per_scene = 12
seed_score = 0.88 seed_score = 0.88
max_refine_candidates = 6 max_refine_candidates = 12
local_scan_step_s = 0.12 local_scan_step_s = 0.12
local_scan_max_points_per_scene = 180 local_scan_max_points_per_scene = 180
local_scan_top_candidates = 18 local_scan_top_candidates = 36
local_scan_tie_break_score_delta = 0.08 local_scan_tie_break_score_delta = 0.08
multi_shot_cut_corr_threshold = 0.20 multi_shot_cut_corr_threshold = 0.20
multi_shot_boundary_tolerance_s = 0.20 multi_shot_boundary_tolerance_s = 0.20
+136 -6
View File
@@ -261,6 +261,107 @@ def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
return [scene.start_s + step * idx for idx in range(max_points)] return [scene.start_s + step * idx for idx in range(max_points)]
def _scene_window_ranges(scene: Scene, beat: TrailerBeat, max_windows: int) -> list[tuple[float, float]]:
if max_windows <= 0 or scene.duration_s <= 0:
return []
window_s = min(scene.duration_s, max(1.0, beat.duration_s))
if scene.duration_s <= window_s + 0.2:
return [(scene.start_s, min(scene.end_s, scene.start_s + window_s))]
usable_start = scene.start_s
usable_end = max(scene.start_s, scene.end_s - window_s)
if max_windows == 1:
starts = [usable_start + (usable_end - usable_start) * 0.5]
else:
step = (usable_end - usable_start) / max(1, max_windows - 1)
starts = [usable_start + step * idx for idx in range(max_windows)]
return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in starts]
def _cached_scene_descriptions(
cache: dict,
scenes_by_id: dict[int, Scene],
cfg: AppConfig,
) -> list[tuple[Scene, str]]:
descriptions: list[tuple[Scene, str]] = []
items = cache.get("items", {})
if not isinstance(items, dict):
return descriptions
current_model_marker = f":{cfg.vision.provider}:{cfg.vision.model}:"
for key, item in items.items():
if current_model_marker not in str(key):
continue
if not isinstance(item, dict) or item.get("kind") != "scene":
continue
scene_id = item.get("item_id")
if not isinstance(scene_id, int):
continue
scene = scenes_by_id.get(scene_id)
description = item.get("description", "")
if scene is not None and isinstance(description, str) and description.strip():
descriptions.append((scene, description))
return descriptions
def _add_window_seed_descriptions(
*,
beat: TrailerBeat,
beat_desc: str,
ranked: list[tuple[float, Scene, str]],
cfg: AppConfig,
cache: dict,
budget: list[int],
ranked_by_scene: dict[int, tuple[float, Scene, str]],
) -> list[tuple[float, float]]:
points: list[tuple[float, float]] = []
if budget[0] <= 0:
return points
scenes_to_probe = ranked[: max(1, cfg.vision.max_seed_scenes)]
windows_per_scene = max(1, min(6, cfg.vision.seed_points_per_scene // 2))
for _, scene, _ in scenes_to_probe:
if budget[0] <= 0:
break
if scene.duration_s <= max(beat.duration_s * 1.6, 6.0):
continue
for start_s, end_s in _scene_window_ranges(scene, beat, windows_per_scene):
if budget[0] <= 0:
break
desc = _describe_sample(
kind="scene_window",
item_id=scene.scene_id,
label=f"source scene {scene.scene_id} window {start_s:.2f}-{end_s:.2f}",
video_path=scene.source_path,
start_s=start_s,
end_s=end_s,
cfg=cfg,
cache=cache,
budget=budget,
)
if not desc:
continue
score = _text_similarity(beat_desc, desc)
if score < cfg.vision.similarity_threshold:
continue
semantic_score = min(0.99, score + 0.30)
existing = ranked_by_scene.get(scene.scene_id)
if existing is None or semantic_score > existing[0]:
ranked_by_scene[scene.scene_id] = (semantic_score, scene, "window")
logger.info(
"Beat %d: vision window seed scene=%d start=%.3fs score=%.3f",
beat.beat_id,
scene.scene_id,
start_s,
semantic_score,
)
weighted_score = max(
cfg.cv.deep_scan.coarse_candidate_threshold,
min(0.99, cfg.vision.seed_score * (0.78 + min(1.0, semantic_score) * 0.22)),
)
points.append((start_s, weighted_score))
return points
def build_vision_seed_in_points( def build_vision_seed_in_points(
beats: Sequence[TrailerBeat], beats: Sequence[TrailerBeat],
scenes: Sequence[Scene], scenes: Sequence[Scene],
@@ -308,7 +409,7 @@ def build_vision_seed_in_points(
phash_max_distance=64, phash_max_distance=64,
) )
ranked: list[tuple[float, Scene]] = [] ranked_by_scene: dict[int, tuple[float, Scene, str]] = {}
for hit in hits: for hit in hits:
scene = scenes_by_id.get(hit.scene_id) scene = scenes_by_id.get(hit.scene_id)
if scene is None: if scene is None:
@@ -328,16 +429,45 @@ def build_vision_seed_in_points(
continue continue
score = _text_similarity(beat_desc, scene_desc) score = _text_similarity(beat_desc, scene_desc)
if score >= cfg.vision.similarity_threshold: if score >= cfg.vision.similarity_threshold:
ranked.append((score, scene)) ranked_by_scene[scene.scene_id] = (min(0.99, score + 0.25), scene, "vision")
ranked.sort(key=lambda item: item[0], reverse=True) # Keep the strongest low-level visual candidates as seeds as well.
points: list[tuple[float, float]] = [] # Text descriptions can miss timing-specific repeats inside one
for score, scene in ranked[:cfg.vision.max_seed_scenes]: # scene; the deep scan still has to validate every seed frame.
vibe_score = max(0.0, min(1.0, float(hit.combined_score)))
existing = ranked_by_scene.get(scene.scene_id)
if existing is None or vibe_score > existing[0]:
ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe")
for scene, scene_desc in _cached_scene_descriptions(cache, scenes_by_id, cfg):
score = _text_similarity(beat_desc, scene_desc)
if score < cfg.vision.similarity_threshold:
continue
semantic_score = min(0.99, score + 0.25)
existing = ranked_by_scene.get(scene.scene_id)
if existing is None or semantic_score > existing[0]:
ranked_by_scene[scene.scene_id] = (semantic_score, scene, "cache")
ranked = sorted(ranked_by_scene.values(), key=lambda item: item[0], reverse=True)
window_points = _add_window_seed_descriptions(
beat=beat,
beat_desc=beat_desc,
ranked=ranked,
cfg=cfg,
cache=cache,
budget=budget,
ranked_by_scene=ranked_by_scene,
)
ranked = sorted(ranked_by_scene.values(), key=lambda item: item[0], reverse=True)
seed_limit = min(len(ranked), max(cfg.vision.max_seed_scenes, cfg.vision.max_seed_scenes * 2))
points: list[tuple[float, float]] = [*window_points]
for score, scene, source in ranked[:seed_limit]:
logger.info( logger.info(
"Beat %d: vision seed scene=%d score=%.3f", "Beat %d: vision seed scene=%d score=%.3f source=%s",
beat.beat_id, beat.beat_id,
scene.scene_id, scene.scene_id,
score, score,
source,
) )
weighted_score = max( weighted_score = max(
cfg.cv.deep_scan.coarse_candidate_threshold, cfg.cv.deep_scan.coarse_candidate_threshold,