Improve vision matching for dissolve-heavy beats
This commit is contained in:
@@ -154,6 +154,11 @@ Modell beschrieben. Die Beschreibungen liegen in
|
||||
`.cache/vision_descriptions.json` und werden wiederverwendet. Vision erzeugt
|
||||
nur zusätzliche Suchanker; der eigentliche Match muss weiterhin durch CV,
|
||||
Content-Reranking, Timing und Duration-Coverage bestätigt werden.
|
||||
Gecachte Szenenbeschreibungen zählen nur, wenn sie vom aktuell konfigurierten
|
||||
Vision-Modell stammen. Bei langen semantisch passenden Source-Szenen beschreibt
|
||||
der Vision-Layer zusätzlich wenige lokale Zeitfenster und cacht auch diese
|
||||
Fenster, damit eine grob ähnliche Szene nicht automatisch mit dem falschen
|
||||
Bewegungs- oder Dialogmoment gleichgesetzt wird.
|
||||
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
|
||||
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
|
||||
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
|
||||
@@ -235,6 +240,9 @@ Bild-/Phasenvalidierung wie der normale Matcher. Nur nicht gelöste Beats fallen
|
||||
danach auf den vollständigen Scan zurück. Die Qualitätsparameter für lokale
|
||||
Vision-Szenenscans und Refine-Kandidaten bleiben dabei erhalten; der Prepass ist
|
||||
eine Reihenfolge-Optimierung, kein Qualitätsdeckel.
|
||||
Provisional-Treffer aus diesem schnellen Prepass sind nicht endgültig: wenn sie
|
||||
unterhalb der Confirmed-Schwelle bleiben, läuft zusätzlich der vollständige
|
||||
CV-Scan und darf den besseren oder bestätigten Treffer übernehmen.
|
||||
OpenRouter-/Vision-Rate-Limits werden mit progressiv längeren Pausen erneut
|
||||
versucht. Billing-, Credit- oder Token-Guthaben-Fehler werden dagegen sofort als
|
||||
echter Blocker gemeldet, weil Warten dort nicht hilft.
|
||||
@@ -278,6 +286,10 @@ auf-/abgeblendete Referenzframes aus Score, Inhalts-Reranking,
|
||||
Phasen-Alignment und Motion-Templates herausgenommen. Blenden sollen bestimmen,
|
||||
wie der Clip später exportiert wird, aber nicht, ob der Bildinhalt als Match
|
||||
gilt.
|
||||
Sichtbare Fade-Rampen werden nur in eine matchbare Insel hinein erweitert, wenn
|
||||
sie strukturell stark zum ersten bzw. letzten scorebaren Frame derselben
|
||||
Einstellung passen. Doppelbelichtungen aus Cross-Dissolves bleiben dadurch
|
||||
Übergangsmaterial und werden nicht als einzelner Quellclip erzwungen.
|
||||
Treffer unter `provisional_content_threshold` werden gar nicht mehr gespeichert
|
||||
oder aus alten Cache-Ergebnissen übernommen. Das verhindert, dass offensichtlich
|
||||
falsche Szenen im Report als Match-Kandidat weiterleben.
|
||||
@@ -399,15 +411,15 @@ scoreable_contrast_min = 24.0 # Kontrastarme Blenden/Titelinseln ignorieren
|
||||
[vision]
|
||||
enabled = false # Kostenkontrolle: per CLI mit --vision aktivierbar
|
||||
model = "google/gemma-4-31b-it" # Muss ein visionfähiges OpenAI-kompatibles Modell sein
|
||||
scene_candidate_top_k = 8 # Nur wenige Top-Szenen pro Beat beschreiben
|
||||
max_new_descriptions_per_run = 12 # API-Kosten pro Lauf begrenzen
|
||||
max_seed_scenes = 3 # Nur beste Vision-Szenen als Suchanker weitergeben
|
||||
scene_candidate_top_k = 48 # Breiter Vision-Kandidatenpool für schwierige Beats
|
||||
max_new_descriptions_per_run = 24 # Neue (anschließend gecachte) Beschreibungen pro Lauf; Rate-Limits bekommen Backoff
|
||||
max_seed_scenes = 8 # Mehr Vision-Szenen als Suchanker, kein manueller Override
|
||||
seed_points_per_scene = 12 # Inpoint-Samples pro Vision-Szene
|
||||
seed_score = 0.88 # Vision-Seeds bekommen mehr Priorität als normale Scene-Seeds
|
||||
max_refine_candidates = 6 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene
|
||||
max_refine_candidates = 12 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene
|
||||
local_scan_step_s = 0.12 # Dichte lokale Bildsuche in Vision-Szenen
|
||||
local_scan_max_points_per_scene = 180 # Laufzeitgrenze pro Source-Szene
|
||||
local_scan_top_candidates = 18 # Beste lokale Kandidaten gehen ins Refinement
|
||||
local_scan_top_candidates = 36 # Beste lokale Kandidaten gehen ins Refinement
|
||||
local_scan_tie_break_score_delta = 0.08 # Ähnliche Vision-Treffer: frühere Phase bevorzugen
|
||||
multi_shot_cut_corr_threshold = 0.20 # Interne Trailer-Umschnitte erkennen
|
||||
multi_shot_boundary_tolerance_s = 0.20 # Source-Grenze muss zum Trailer-Cut passen
|
||||
|
||||
@@ -458,7 +458,23 @@ def _find_scene_for_in_point(cfg, in_point_s: float):
|
||||
def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]:
|
||||
"""Find visible source-matchable islands inside a trailer beat."""
|
||||
from src.cv.frame_extractor import grab_frame_at_path
|
||||
from src.cv.global_scan import _is_scoreable_reference_frame
|
||||
from src.cv.global_scan import (
|
||||
_corr_same_size,
|
||||
_is_scoreable_reference_frame,
|
||||
_prepare_haystack,
|
||||
_reference_visibility_stats,
|
||||
)
|
||||
|
||||
def is_visible(frame) -> bool:
|
||||
if frame is None:
|
||||
return False
|
||||
mean_luma, p90_luma, contrast = _reference_visibility_stats(frame, cfg)
|
||||
visible_luma = (
|
||||
mean_luma >= cfg.cv.deep_scan.scoreable_luma_mean_min * 0.45
|
||||
or p90_luma >= cfg.cv.deep_scan.scoreable_luma_p90_min * 0.50
|
||||
)
|
||||
visible_contrast = contrast >= max(8.0, cfg.cv.deep_scan.scoreable_contrast_min * 0.30)
|
||||
return visible_luma and visible_contrast
|
||||
|
||||
step_s = max(0.08, cfg.cv.deep_scan.span_sample_step_s)
|
||||
min_segment_s = max(0.32, step_s * 3.0)
|
||||
@@ -487,7 +503,46 @@ def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]:
|
||||
if end - start >= min_segment_s:
|
||||
raw.append((start, end))
|
||||
|
||||
return raw
|
||||
expanded: list[tuple[float, float]] = []
|
||||
same_shot_corr_min = 0.72
|
||||
for start_s, end_s in raw:
|
||||
start_anchor = grab_frame_at_path(beat.trailer_path, beat.start_s + start_s)
|
||||
end_anchor = grab_frame_at_path(beat.trailer_path, beat.start_s + max(start_s, end_s - step_s))
|
||||
start_feature = _prepare_haystack(start_anchor, cfg) if start_anchor is not None else None
|
||||
end_feature = _prepare_haystack(end_anchor, cfg) if end_anchor is not None else None
|
||||
|
||||
soft_start = start_s
|
||||
t = round(start_s - step_s, 6)
|
||||
while t >= 0.0:
|
||||
frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
|
||||
if not is_visible(frame):
|
||||
break
|
||||
if start_feature is not None and _corr_same_size(_prepare_haystack(frame, cfg), start_feature) < same_shot_corr_min:
|
||||
break
|
||||
soft_start = max(0.0, t)
|
||||
t = round(t - step_s, 6)
|
||||
|
||||
soft_end = end_s
|
||||
t = round(end_s, 6)
|
||||
while t <= beat.duration_s + 1e-6:
|
||||
frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
|
||||
if not is_visible(frame):
|
||||
break
|
||||
if end_feature is not None and _corr_same_size(_prepare_haystack(frame, cfg), end_feature) < same_shot_corr_min:
|
||||
break
|
||||
soft_end = min(beat.duration_s, t + step_s)
|
||||
t = round(t + step_s, 6)
|
||||
|
||||
if soft_end - soft_start >= min_segment_s:
|
||||
expanded.append((soft_start, soft_end))
|
||||
|
||||
merged: list[tuple[float, float]] = []
|
||||
for start_s, end_s in expanded:
|
||||
if merged and start_s - merged[-1][1] <= bridge_gap_s:
|
||||
merged[-1] = (merged[-1][0], max(merged[-1][1], end_s))
|
||||
else:
|
||||
merged.append((start_s, end_s))
|
||||
return merged
|
||||
|
||||
|
||||
def _trim_beats_to_single_visual_island(beats: list, cfg) -> tuple[list, dict[int, tuple[float, float]]]:
|
||||
@@ -555,6 +610,28 @@ def _apply_single_island_segments(results: list, trims: dict[int, tuple[float, f
|
||||
return expanded
|
||||
|
||||
|
||||
def _merge_best_results(existing: list, candidates: list, cfg) -> list:
    """Merge matches by beat, preferring confirmed or higher-scoring results.

    A result counts as confirmed when its ``match_score`` reaches
    ``cfg.cv.deep_scan.match_threshold`` or its ``is_confirmed`` flag is set.

    Args:
        existing: Results collected so far; one entry per ``beat_id`` is kept.
        candidates: Additional results that may replace entries of ``existing``.
        cfg: Configuration exposing ``cv.deep_scan.match_threshold`` and
            ``cv.deep_scan.duration_tie_break_score_delta``.

    Returns:
        One result per beat, sorted by ``beat_id``.
    """
    deep_scan = cfg.cv.deep_scan
    threshold = deep_scan.match_threshold
    tie_delta = deep_scan.duration_tie_break_score_delta

    def _confirmed(result) -> bool:
        return result.match_score >= threshold or result.is_confirmed

    by_id = {r.beat_id: r for r in existing}
    for candidate in candidates:
        old = by_id.get(candidate.beat_id)
        if old is None:
            by_id[candidate.beat_id] = candidate
            continue

        candidate_confirmed = _confirmed(candidate)
        old_confirmed = _confirmed(old)
        if old_confirmed and not candidate_confirmed:
            # Never downgrade: a longer or nominally higher-scoring but
            # unconfirmed candidate must not displace a confirmed match.
            continue

        promotes = candidate_confirmed and not old_confirmed
        clearly_better = candidate.match_score > old.match_score + tie_delta
        # Within the tie-break band the longer match wins.
        longer_on_tie = (
            candidate.match_score >= old.match_score - tie_delta
            and candidate.duration_s > old.duration_s
        )
        if promotes or clearly_better or longer_on_tie:
            by_id[candidate.beat_id] = candidate
    return sorted(by_id.values(), key=lambda r: r.beat_id)
|
||||
|
||||
|
||||
def _attach_visual_segments(results: list, beats: list, cfg) -> list:
|
||||
"""Attach automatic sub-shot matches for multi-island trailer beats."""
|
||||
from dataclasses import replace
|
||||
@@ -657,16 +734,21 @@ def _run_segment_match(segment_beat, continuity, cfg, allow_fullscan: bool = Tru
|
||||
seed_in_points=continuity,
|
||||
)
|
||||
if fast_matches:
|
||||
if not allow_fullscan or all(
|
||||
m.is_confirmed or m.match_score >= cfg.cv.deep_scan.match_threshold
|
||||
for m in fast_matches
|
||||
):
|
||||
return fast_matches
|
||||
|
||||
if not allow_fullscan:
|
||||
return []
|
||||
return fast_matches if cfg.vision.enabled else []
|
||||
|
||||
return run_matching(
|
||||
full_matches = run_matching(
|
||||
cfg,
|
||||
[segment_beat],
|
||||
seed_in_points=continuity,
|
||||
)
|
||||
return _merge_best_results(fast_matches if cfg.vision.enabled else [], full_matches, cfg)
|
||||
|
||||
|
||||
def _match_unmatched_visual_segments(
|
||||
@@ -862,9 +944,21 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
|
||||
seed_in_points=seed_in_points,
|
||||
)
|
||||
|
||||
if len(results) < len(scan_beats):
|
||||
matched_ids = {r.beat_id for r in results}
|
||||
remaining_beats = [b for b in scan_beats if b.beat_id not in matched_ids]
|
||||
if len(results) < len(scan_beats) or any(
|
||||
not r.is_confirmed and r.match_score < cfg.cv.deep_scan.match_threshold
|
||||
for r in results
|
||||
):
|
||||
results_by_id = {r.beat_id: r for r in results}
|
||||
remaining_beats = [
|
||||
b for b in scan_beats
|
||||
if (
|
||||
b.beat_id not in results_by_id
|
||||
or (
|
||||
not results_by_id[b.beat_id].is_confirmed
|
||||
and results_by_id[b.beat_id].match_score < cfg.cv.deep_scan.match_threshold
|
||||
)
|
||||
)
|
||||
]
|
||||
if remaining_beats:
|
||||
full_results = run_matching(
|
||||
cfg,
|
||||
@@ -872,7 +966,7 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
|
||||
force_reindex=args.force_reindex,
|
||||
seed_in_points=seed_in_points,
|
||||
)
|
||||
results = sorted([*results, *full_results], key=lambda r: r.beat_id)
|
||||
results = _merge_best_results(results, full_results, cfg)
|
||||
results = _apply_single_island_segments(results, single_island_trims)
|
||||
results = _match_unmatched_visual_segments(
|
||||
results,
|
||||
|
||||
+5
-5
@@ -173,15 +173,15 @@ max_tokens = 350
|
||||
|
||||
# Cost controls: per beat, only the top scene-level candidates are described,
|
||||
# and cached descriptions in .cache/vision_descriptions.json are reused.
|
||||
scene_candidate_top_k = 8
|
||||
max_new_descriptions_per_run = 12
|
||||
max_seed_scenes = 3
|
||||
scene_candidate_top_k = 48
|
||||
max_new_descriptions_per_run = 24
|
||||
max_seed_scenes = 8
|
||||
seed_points_per_scene = 12
|
||||
seed_score = 0.88
|
||||
max_refine_candidates = 6
|
||||
max_refine_candidates = 12
|
||||
local_scan_step_s = 0.12
|
||||
local_scan_max_points_per_scene = 180
|
||||
local_scan_top_candidates = 18
|
||||
local_scan_top_candidates = 36
|
||||
local_scan_tie_break_score_delta = 0.08
|
||||
multi_shot_cut_corr_threshold = 0.20
|
||||
multi_shot_boundary_tolerance_s = 0.20
|
||||
|
||||
+136
-6
@@ -261,6 +261,107 @@ def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
|
||||
return [scene.start_s + step * idx for idx in range(max_points)]
|
||||
|
||||
|
||||
def _scene_window_ranges(scene: Scene, beat: TrailerBeat, max_windows: int) -> list[tuple[float, float]]:
    """Return up to ``max_windows`` ``(start_s, end_s)`` windows inside a scene.

    Each window is as long as the beat (at least 1.0 s) but never longer than
    the scene. A scene that is at most 0.2 s longer than one window yields a
    single window anchored at the scene start. Otherwise window starts are
    spread evenly between the scene start and the last start that still fits;
    a single requested window is placed in the middle of that range.
    """
    if max_windows <= 0 or scene.duration_s <= 0:
        return []

    span_s = min(scene.duration_s, max(1.0, beat.duration_s))

    def window_from(begin_s: float) -> tuple[float, float]:
        # Clamp the window end to the scene end.
        return (begin_s, min(scene.end_s, begin_s + span_s))

    if scene.duration_s <= span_s + 0.2:
        return [window_from(scene.start_s)]

    first_start = scene.start_s
    last_start = max(scene.start_s, scene.end_s - span_s)
    if max_windows == 1:
        return [window_from(first_start + (last_start - first_start) * 0.5)]

    stride = (last_start - first_start) / max(1, max_windows - 1)
    return [window_from(first_start + stride * idx) for idx in range(max_windows)]
|
||||
|
||||
|
||||
def _cached_scene_descriptions(
    cache: dict,
    scenes_by_id: dict[int, Scene],
    cfg: AppConfig,
) -> list[tuple[Scene, str]]:
    """Collect cached scene descriptions written by the configured vision model.

    Only entries under ``cache["items"]`` are considered, and only when the
    cache key contains ``:<provider>:<model>:`` for the current configuration,
    the entry has ``kind == "scene"``, an integer ``item_id`` that resolves in
    ``scenes_by_id``, and a non-blank string ``description``.

    Returns:
        ``(scene, description)`` pairs in cache iteration order.
    """
    entries = cache.get("items", {})
    if not isinstance(entries, dict):
        return []

    model_tag = f":{cfg.vision.provider}:{cfg.vision.model}:"
    found: list[tuple[Scene, str]] = []
    for cache_key, entry in entries.items():
        # Skip descriptions produced by a different provider/model.
        if model_tag not in str(cache_key):
            continue
        if not isinstance(entry, dict) or entry.get("kind") != "scene":
            continue
        scene_id = entry.get("item_id")
        if not isinstance(scene_id, int):
            continue
        scene = scenes_by_id.get(scene_id)
        text = entry.get("description", "")
        if scene is None or not isinstance(text, str) or not text.strip():
            continue
        found.append((scene, text))
    return found
|
||||
|
||||
|
||||
def _add_window_seed_descriptions(
    *,
    beat: TrailerBeat,
    beat_desc: str,
    ranked: list[tuple[float, Scene, str]],
    cfg: AppConfig,
    cache: dict,
    budget: list[int],
    ranked_by_scene: dict[int, tuple[float, Scene, str]],
) -> list[tuple[float, float]]:
    """Describe local time windows of long top-ranked scenes and seed matches.

    For the first ``max(1, cfg.vision.max_seed_scenes)`` entries of ``ranked``
    whose scene is longer than ``max(beat.duration_s * 1.6, 6.0)`` seconds,
    each window from ``_scene_window_ranges`` is described through
    ``_describe_sample`` and compared with ``beat_desc``. Windows reaching
    ``cfg.vision.similarity_threshold`` produce a seed point and may raise the
    scene's entry in ``ranked_by_scene`` (mutated in place, tagged "window").

    Processing stops as soon as ``budget[0]`` is no longer positive.
    NOTE(review): this check runs before every ``_describe_sample`` call, so
    windows are skipped once the budget is used up; whether cached window
    descriptions should still be consulted then depends on that helper.

    Returns:
        ``(window_start_s, weighted_seed_score)`` pairs.
    """
    seeds: list[tuple[float, float]] = []
    if budget[0] <= 0:
        return seeds

    vision = cfg.vision
    probe_count = max(1, vision.max_seed_scenes)
    per_scene = max(1, min(6, vision.seed_points_per_scene // 2))

    for _score, scene, _origin in ranked[:probe_count]:
        if budget[0] <= 0:
            break
        # Short scenes are skipped; only clearly longer scenes get windows.
        min_long_scene_s = max(beat.duration_s * 1.6, 6.0)
        if scene.duration_s <= min_long_scene_s:
            continue

        for win_start, win_end in _scene_window_ranges(scene, beat, per_scene):
            if budget[0] <= 0:
                break
            window_desc = _describe_sample(
                kind="scene_window",
                item_id=scene.scene_id,
                label=f"source scene {scene.scene_id} window {win_start:.2f}-{win_end:.2f}",
                video_path=scene.source_path,
                start_s=win_start,
                end_s=win_end,
                cfg=cfg,
                cache=cache,
                budget=budget,
            )
            if not window_desc:
                continue

            similarity = _text_similarity(beat_desc, window_desc)
            if similarity < vision.similarity_threshold:
                continue

            boosted = min(0.99, similarity + 0.30)
            previous = ranked_by_scene.get(scene.scene_id)
            if previous is None or boosted > previous[0]:
                ranked_by_scene[scene.scene_id] = (boosted, scene, "window")
            logger.info(
                "Beat %d: vision window seed scene=%d start=%.3fs score=%.3f",
                beat.beat_id,
                scene.scene_id,
                win_start,
                boosted,
            )

            # Scale the configured seed score by the semantic score, but never
            # drop below the coarse candidate threshold.
            blend = 0.78 + min(1.0, boosted) * 0.22
            seed_weight = max(
                cfg.cv.deep_scan.coarse_candidate_threshold,
                min(0.99, vision.seed_score * blend),
            )
            seeds.append((win_start, seed_weight))
    return seeds
|
||||
|
||||
|
||||
def build_vision_seed_in_points(
|
||||
beats: Sequence[TrailerBeat],
|
||||
scenes: Sequence[Scene],
|
||||
@@ -308,7 +409,7 @@ def build_vision_seed_in_points(
|
||||
phash_max_distance=64,
|
||||
)
|
||||
|
||||
ranked: list[tuple[float, Scene]] = []
|
||||
ranked_by_scene: dict[int, tuple[float, Scene, str]] = {}
|
||||
for hit in hits:
|
||||
scene = scenes_by_id.get(hit.scene_id)
|
||||
if scene is None:
|
||||
@@ -328,16 +429,45 @@ def build_vision_seed_in_points(
|
||||
continue
|
||||
score = _text_similarity(beat_desc, scene_desc)
|
||||
if score >= cfg.vision.similarity_threshold:
|
||||
ranked.append((score, scene))
|
||||
ranked_by_scene[scene.scene_id] = (min(0.99, score + 0.25), scene, "vision")
|
||||
|
||||
ranked.sort(key=lambda item: item[0], reverse=True)
|
||||
points: list[tuple[float, float]] = []
|
||||
for score, scene in ranked[:cfg.vision.max_seed_scenes]:
|
||||
# Keep the strongest low-level visual candidates as seeds as well.
|
||||
# Text descriptions can miss timing-specific repeats inside one
|
||||
# scene; the deep scan still has to validate every seed frame.
|
||||
vibe_score = max(0.0, min(1.0, float(hit.combined_score)))
|
||||
existing = ranked_by_scene.get(scene.scene_id)
|
||||
if existing is None or vibe_score > existing[0]:
|
||||
ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe")
|
||||
|
||||
for scene, scene_desc in _cached_scene_descriptions(cache, scenes_by_id, cfg):
|
||||
score = _text_similarity(beat_desc, scene_desc)
|
||||
if score < cfg.vision.similarity_threshold:
|
||||
continue
|
||||
semantic_score = min(0.99, score + 0.25)
|
||||
existing = ranked_by_scene.get(scene.scene_id)
|
||||
if existing is None or semantic_score > existing[0]:
|
||||
ranked_by_scene[scene.scene_id] = (semantic_score, scene, "cache")
|
||||
|
||||
ranked = sorted(ranked_by_scene.values(), key=lambda item: item[0], reverse=True)
|
||||
window_points = _add_window_seed_descriptions(
|
||||
beat=beat,
|
||||
beat_desc=beat_desc,
|
||||
ranked=ranked,
|
||||
cfg=cfg,
|
||||
cache=cache,
|
||||
budget=budget,
|
||||
ranked_by_scene=ranked_by_scene,
|
||||
)
|
||||
ranked = sorted(ranked_by_scene.values(), key=lambda item: item[0], reverse=True)
|
||||
seed_limit = min(len(ranked), max(cfg.vision.max_seed_scenes, cfg.vision.max_seed_scenes * 2))
|
||||
points: list[tuple[float, float]] = [*window_points]
|
||||
for score, scene, source in ranked[:seed_limit]:
|
||||
logger.info(
|
||||
"Beat %d: vision seed scene=%d score=%.3f",
|
||||
"Beat %d: vision seed scene=%d score=%.3f source=%s",
|
||||
beat.beat_id,
|
||||
scene.scene_id,
|
||||
score,
|
||||
source,
|
||||
)
|
||||
weighted_score = max(
|
||||
cfg.cv.deep_scan.coarse_candidate_threshold,
|
||||
|
||||
Reference in New Issue
Block a user