From d9e470c8776e2885401bf7314e62b4dec7c4fc88 Mon Sep 17 00:00:00 2001 From: Melbar Date: Sat, 2 May 2026 16:15:51 +0200 Subject: [PATCH] Improve vision matching for dissolve-heavy beats --- README.md | 22 +++++-- cli.py | 112 ++++++++++++++++++++++++++++--- config.toml | 10 +-- src/llm/vision_cache.py | 142 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 261 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 2dc5f39..6e46c5a 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,11 @@ Modell beschrieben. Die Beschreibungen liegen in `.cache/vision_descriptions.json` und werden wiederverwendet. Vision erzeugt nur zusätzliche Suchanker; der eigentliche Match muss weiterhin durch CV, Content-Reranking, Timing und Duration-Coverage bestätigt werden. +Gecachte Szenenbeschreibungen zählen nur, wenn sie vom aktuell konfigurierten +Vision-Modell stammen. Bei langen semantisch passenden Source-Szenen beschreibt +der Vision-Layer zusätzlich wenige lokale Zeitfenster und cached auch diese +Fenster, damit eine grob ähnliche Szene nicht automatisch mit dem falschen +Bewegungs- oder Dialogmoment gleichgesetzt wird. Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete @@ -235,6 +240,9 @@ Bild-/Phasenvalidierung wie der normale Matcher. Nur nicht gelöste Beats fallen danach auf den vollständigen Scan zurück. Die Qualitätsparameter für lokale Vision-Szenenscans und Refine-Kandidaten bleiben dabei erhalten; der Prepass ist eine Reihenfolge-Optimierung, kein Qualitätsdeckel. +Provisional Treffer aus diesem schnellen Prepass sind nicht endgültig: wenn sie +unterhalb der Confirmed-Schwelle bleiben, läuft zusätzlich der vollständige +CV-Scan und darf den besseren oder bestätigten Treffer übernehmen. OpenRouter-/Vision-Rate-Limits werden mit progressiv längeren Pausen erneut versucht. Billing-, Credit- oder Token-Guthaben-Fehler werden dagegen sofort als echter Blocker gemeldet, weil Warten dort nicht hilft. @@ -278,6 +286,10 @@ auf-/abgeblendete Referenzframes aus Score, Inhalts-Reranking, Phasen-Alignment und Motion-Templates herausgenommen. Blenden sollen bestimmen, wie der Clip später exportiert wird, aber nicht, ob der Bildinhalt als Match gilt. +Sichtbare Fade-Rampen werden nur in eine matchbare Insel hinein erweitert, wenn +sie strukturell stark zum ersten bzw. letzten scorebaren Frame derselben +Einstellung passen. Doppelbelichtungen aus Cross-Dissolves bleiben dadurch +Übergangsmaterial und werden nicht als einzelner Quellclip erzwungen. Treffer unter `provisional_content_threshold` werden gar nicht mehr gespeichert oder aus alten Cache-Ergebnissen übernommen. Das verhindert, dass offensichtlich falsche Szenen im Report als Match-Kandidat weiterleben. @@ -399,15 +411,15 @@ scoreable_contrast_min = 24.0 # Kontrastarme Blenden/Titelinseln ignorieren [vision] enabled = false # Kostenkontrolle: per CLI mit --vision aktivierbar model = "google/gemma-4-31b-it" # Muss ein visionfähiges OpenAI-kompatibles Modell sein -scene_candidate_top_k = 8 # Nur wenige Top-Szenen pro Beat beschreiben -max_new_descriptions_per_run = 12 # API-Kosten pro Lauf begrenzen -max_seed_scenes = 3 # Nur beste Vision-Szenen als Suchanker weitergeben +scene_candidate_top_k = 48 # Breiter Vision-Kandidatenpool für schwierige Beats +max_new_descriptions_per_run = 24 # Gecachte Beschreibungen pro Lauf; Rate-Limits bekommen Backoff +max_seed_scenes = 8 # Mehr Vision-Szenen als Suchanker, kein manueller Override seed_points_per_scene = 12 # Inpoint-Samples pro Vision-Szene seed_score = 0.88 # Vision-Seeds bekommen mehr Priorität als normale Scene-Seeds -max_refine_candidates = 6 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene +max_refine_candidates = 12 # Vision-Pfad prüft mehrere Bewegungsphasen derselben Szene local_scan_step_s = 0.12 # Dichte lokale Bildsuche in Vision-Szenen local_scan_max_points_per_scene = 180 # Laufzeitgrenze pro Source-Szene -local_scan_top_candidates = 18 # Beste lokale Kandidaten gehen ins Refinement +local_scan_top_candidates = 36 # Beste lokale Kandidaten gehen ins Refinement local_scan_tie_break_score_delta = 0.08 # Ähnliche Vision-Treffer: frühere Phase bevorzugen multi_shot_cut_corr_threshold = 0.20 # Interne Trailer-Umschnitte erkennen multi_shot_boundary_tolerance_s = 0.20 # Source-Grenze muss zum Trailer-Cut passen diff --git a/cli.py b/cli.py index 3c299f5..1897b9b 100644 --- a/cli.py +++ b/cli.py @@ -458,7 +458,23 @@ def _find_scene_for_in_point(cfg, in_point_s: float): def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]: """Find visible source-matchable islands inside a trailer beat.""" from src.cv.frame_extractor import grab_frame_at_path - from src.cv.global_scan import _is_scoreable_reference_frame + from src.cv.global_scan import ( + _corr_same_size, + _is_scoreable_reference_frame, + _prepare_haystack, + _reference_visibility_stats, + ) + + def is_visible(frame) -> bool: + if frame is None: + return False + mean_luma, p90_luma, contrast = _reference_visibility_stats(frame, cfg) + visible_luma = ( + mean_luma >= cfg.cv.deep_scan.scoreable_luma_mean_min * 0.45 + or p90_luma >= cfg.cv.deep_scan.scoreable_luma_p90_min * 0.50 + ) + visible_contrast = contrast >= max(8.0, cfg.cv.deep_scan.scoreable_contrast_min * 0.30) + return visible_luma and visible_contrast step_s = max(0.08, cfg.cv.deep_scan.span_sample_step_s) min_segment_s = max(0.32, step_s * 3.0) @@ -487,7 +503,46 @@ def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]: if end - start >= min_segment_s: raw.append((start, end)) - return raw + expanded: list[tuple[float, float]] = [] + same_shot_corr_min = 0.72 + for start_s, end_s in raw: + start_anchor = grab_frame_at_path(beat.trailer_path, beat.start_s + start_s) + end_anchor = grab_frame_at_path(beat.trailer_path, beat.start_s + max(start_s, end_s - step_s)) + start_feature = _prepare_haystack(start_anchor, cfg) if start_anchor is not None else None + end_feature = _prepare_haystack(end_anchor, cfg) if end_anchor is not None else None + + soft_start = start_s + t = round(start_s - step_s, 6) + while t >= 0.0: + frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t) + if not is_visible(frame): + break + if start_feature is not None and _corr_same_size(_prepare_haystack(frame, cfg), start_feature) < same_shot_corr_min: + break + soft_start = max(0.0, t) + t = round(t - step_s, 6) + + soft_end = end_s + t = round(end_s, 6) + while t <= beat.duration_s + 1e-6: + frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t) + if not is_visible(frame): + break + if end_feature is not None and _corr_same_size(_prepare_haystack(frame, cfg), end_feature) < same_shot_corr_min: + break + soft_end = min(beat.duration_s, t + step_s) + t = round(t + step_s, 6) + + if soft_end - soft_start >= min_segment_s: + expanded.append((soft_start, soft_end)) + + merged: list[tuple[float, float]] = [] + for start_s, end_s in expanded: + if merged and start_s - merged[-1][1] <= bridge_gap_s: + merged[-1] = (merged[-1][0], max(merged[-1][1], end_s)) + else: + merged.append((start_s, end_s)) + return merged def _trim_beats_to_single_visual_island(beats: list, cfg) -> tuple[list, dict[int, tuple[float, float]]]: @@ -555,6 +610,28 @@ def _apply_single_island_segments(results: list, trims: dict[int, tuple[float, f return expanded +def _merge_best_results(existing: list, candidates: list, cfg) -> list: + """Merge matches by beat, preferring confirmed or higher-scoring results.""" + by_id = {r.beat_id: r for r in existing} + for candidate in candidates: + old = by_id.get(candidate.beat_id) + if old is None: + by_id[candidate.beat_id] = candidate + continue + candidate_confirmed = candidate.match_score >= cfg.cv.deep_scan.match_threshold or candidate.is_confirmed + old_confirmed = old.match_score >= cfg.cv.deep_scan.match_threshold or old.is_confirmed + if ( + candidate_confirmed and not old_confirmed + or candidate.match_score > old.match_score + cfg.cv.deep_scan.duration_tie_break_score_delta + or ( + candidate.match_score >= old.match_score - cfg.cv.deep_scan.duration_tie_break_score_delta + and candidate.duration_s > old.duration_s + ) + ): + by_id[candidate.beat_id] = candidate + return sorted(by_id.values(), key=lambda r: r.beat_id) + + def _attach_visual_segments(results: list, beats: list, cfg) -> list: """Attach automatic sub-shot matches for multi-island trailer beats.""" from dataclasses import replace @@ -657,16 +734,21 @@ def _run_segment_match(segment_beat, continuity, cfg, allow_fullscan: bool = Tru seed_in_points=continuity, ) if fast_matches: - return fast_matches + if not allow_fullscan or all( + m.is_confirmed or m.match_score >= cfg.cv.deep_scan.match_threshold + for m in fast_matches + ): + return fast_matches if not allow_fullscan: - return [] + return fast_matches if cfg.vision.enabled else [] - return run_matching( + full_matches = run_matching( cfg, [segment_beat], seed_in_points=continuity, ) + return _merge_best_results(fast_matches if cfg.vision.enabled else [], full_matches, cfg) def _match_unmatched_visual_segments( @@ -862,9 +944,21 @@ def cmd_match(args: argparse.Namespace, cfg) -> list: seed_in_points=seed_in_points, ) - if len(results) < len(scan_beats): - matched_ids = {r.beat_id for r in results} - remaining_beats = [b for b in scan_beats if b.beat_id not in matched_ids] + if len(results) < len(scan_beats) or any( + not r.is_confirmed and r.match_score < cfg.cv.deep_scan.match_threshold + for r in results + ): + results_by_id = {r.beat_id: r for r in results} + remaining_beats = [ + b for b in scan_beats + if ( + b.beat_id not in results_by_id + or ( + not results_by_id[b.beat_id].is_confirmed + and results_by_id[b.beat_id].match_score < cfg.cv.deep_scan.match_threshold + ) + ) + ] if remaining_beats: full_results = run_matching( cfg, @@ -872,7 +966,7 @@ def cmd_match(args: argparse.Namespace, cfg) -> list: force_reindex=args.force_reindex, seed_in_points=seed_in_points, ) - results = sorted([*results, *full_results], key=lambda r: r.beat_id) + results = _merge_best_results(results, full_results, cfg) results = _apply_single_island_segments(results, single_island_trims) results = _match_unmatched_visual_segments( results, diff --git a/config.toml b/config.toml index 56df40e..7b64b9a 100644 --- a/config.toml +++ b/config.toml @@ -173,15 +173,15 @@ max_tokens = 350 # Cost controls: per beat, only the top scene-level candidates are described, # and cached descriptions in .cache/vision_descriptions.json are reused. -scene_candidate_top_k = 8 -max_new_descriptions_per_run = 12 -max_seed_scenes = 3 +scene_candidate_top_k = 48 +max_new_descriptions_per_run = 24 +max_seed_scenes = 8 seed_points_per_scene = 12 seed_score = 0.88 -max_refine_candidates = 6 +max_refine_candidates = 12 local_scan_step_s = 0.12 local_scan_max_points_per_scene = 180 -local_scan_top_candidates = 18 +local_scan_top_candidates = 36 local_scan_tie_break_score_delta = 0.08 multi_shot_cut_corr_threshold = 0.20 multi_shot_boundary_tolerance_s = 0.20 diff --git a/src/llm/vision_cache.py b/src/llm/vision_cache.py index 2ce175e..fde8fe3 100644 --- a/src/llm/vision_cache.py +++ b/src/llm/vision_cache.py @@ -261,6 +261,107 @@ def _scene_seed_points(scene: Scene, max_points: int) -> list[float]: return [scene.start_s + step * idx for idx in range(max_points)] +def _scene_window_ranges(scene: Scene, beat: TrailerBeat, max_windows: int) -> list[tuple[float, float]]: + if max_windows <= 0 or scene.duration_s <= 0: + return [] + window_s = min(scene.duration_s, max(1.0, beat.duration_s)) + if scene.duration_s <= window_s + 0.2: + return [(scene.start_s, min(scene.end_s, scene.start_s + window_s))] + + usable_start = scene.start_s + usable_end = max(scene.start_s, scene.end_s - window_s) + if max_windows == 1: + starts = [usable_start + (usable_end - usable_start) * 0.5] + else: + step = (usable_end - usable_start) / max(1, max_windows - 1) + starts = [usable_start + step * idx for idx in range(max_windows)] + return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in starts] + + +def _cached_scene_descriptions( + cache: dict, + scenes_by_id: dict[int, Scene], + cfg: AppConfig, +) -> list[tuple[Scene, str]]: + descriptions: list[tuple[Scene, str]] = [] + items = cache.get("items", {}) + if not isinstance(items, dict): + return descriptions + current_model_marker = f":{cfg.vision.provider}:{cfg.vision.model}:" + for key, item in items.items(): + if current_model_marker not in str(key): + continue + if not isinstance(item, dict) or item.get("kind") != "scene": + continue + scene_id = item.get("item_id") + if not isinstance(scene_id, int): + continue + scene = scenes_by_id.get(scene_id) + description = item.get("description", "") + if scene is not None and isinstance(description, str) and description.strip(): + descriptions.append((scene, description)) + return descriptions + + +def _add_window_seed_descriptions( + *, + beat: TrailerBeat, + beat_desc: str, + ranked: list[tuple[float, Scene, str]], + cfg: AppConfig, + cache: dict, + budget: list[int], + ranked_by_scene: dict[int, tuple[float, Scene, str]], +) -> list[tuple[float, float]]: + points: list[tuple[float, float]] = [] + if budget[0] <= 0: + return points + + scenes_to_probe = ranked[: max(1, cfg.vision.max_seed_scenes)] + windows_per_scene = max(1, min(6, cfg.vision.seed_points_per_scene // 2)) + for _, scene, _ in scenes_to_probe: + if budget[0] <= 0: + break + if scene.duration_s <= max(beat.duration_s * 1.6, 6.0): + continue + for start_s, end_s in _scene_window_ranges(scene, beat, windows_per_scene): + if budget[0] <= 0: + break + desc = _describe_sample( + kind="scene_window", + item_id=scene.scene_id, + label=f"source scene {scene.scene_id} window {start_s:.2f}-{end_s:.2f}", + video_path=scene.source_path, + start_s=start_s, + end_s=end_s, + cfg=cfg, + cache=cache, + budget=budget, + ) + if not desc: + continue + score = _text_similarity(beat_desc, desc) + if score < cfg.vision.similarity_threshold: + continue + semantic_score = min(0.99, score + 0.30) + existing = ranked_by_scene.get(scene.scene_id) + if existing is None or semantic_score > existing[0]: + ranked_by_scene[scene.scene_id] = (semantic_score, scene, "window") + logger.info( + "Beat %d: vision window seed scene=%d start=%.3fs score=%.3f", + beat.beat_id, + scene.scene_id, + start_s, + semantic_score, + ) + weighted_score = max( + cfg.cv.deep_scan.coarse_candidate_threshold, + min(0.99, cfg.vision.seed_score * (0.78 + min(1.0, semantic_score) * 0.22)), + ) + points.append((start_s, weighted_score)) + return points + + def build_vision_seed_in_points( beats: Sequence[TrailerBeat], scenes: Sequence[Scene], @@ -308,7 +409,7 @@ def build_vision_seed_in_points( phash_max_distance=64, ) - ranked: list[tuple[float, Scene]] = [] + ranked_by_scene: dict[int, tuple[float, Scene, str]] = {} for hit in hits: scene = scenes_by_id.get(hit.scene_id) if scene is None: @@ -328,16 +429,45 @@ def build_vision_seed_in_points( continue score = _text_similarity(beat_desc, scene_desc) if score >= cfg.vision.similarity_threshold: - ranked.append((score, scene)) + ranked_by_scene[scene.scene_id] = (min(0.99, score + 0.25), scene, "vision") - ranked.sort(key=lambda item: item[0], reverse=True) - points: list[tuple[float, float]] = [] - for score, scene in ranked[:cfg.vision.max_seed_scenes]: + # Keep the strongest low-level visual candidates as seeds as well. + # Text descriptions can miss timing-specific repeats inside one + # scene; the deep scan still has to validate every seed frame. + vibe_score = max(0.0, min(1.0, float(hit.combined_score))) + existing = ranked_by_scene.get(scene.scene_id) + if existing is None or vibe_score > existing[0]: + ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe") + + for scene, scene_desc in _cached_scene_descriptions(cache, scenes_by_id, cfg): + score = _text_similarity(beat_desc, scene_desc) + if score < cfg.vision.similarity_threshold: + continue + semantic_score = min(0.99, score + 0.25) + existing = ranked_by_scene.get(scene.scene_id) + if existing is None or semantic_score > existing[0]: + ranked_by_scene[scene.scene_id] = (semantic_score, scene, "cache") + + ranked = sorted(ranked_by_scene.values(), key=lambda item: item[0], reverse=True) + window_points = _add_window_seed_descriptions( + beat=beat, + beat_desc=beat_desc, + ranked=ranked, + cfg=cfg, + cache=cache, + budget=budget, + ranked_by_scene=ranked_by_scene, + ) + ranked = sorted(ranked_by_scene.values(), key=lambda item: item[0], reverse=True) + seed_limit = min(len(ranked), max(cfg.vision.max_seed_scenes, cfg.vision.max_seed_scenes * 2)) + points: list[tuple[float, float]] = [*window_points] + for score, scene, source in ranked[:seed_limit]: logger.info( - "Beat %d: vision seed scene=%d score=%.3f", + "Beat %d: vision seed scene=%d score=%.3f source=%s", beat.beat_id, scene.scene_id, score, + source, ) weighted_score = max( cfg.cv.deep_scan.coarse_candidate_threshold,