From 252f710396686ba8f796f685be37e323175e7325 Mon Sep 17 00:00:00 2001 From: Melbar Date: Sat, 2 May 2026 18:53:27 +0200 Subject: [PATCH] Rank long-scene probes without action penalty --- README.md | 3 +++ src/llm/vision_cache.py | 22 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 20f25f2..a40ba78 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,9 @@ Dieser lokale Fenster-Probe ist bewusst breiter als die finale Seed-Auswahl: Eine lange Dialogszene kann in der Gesamtbeschreibung nur als Gespräch erscheinen, aber an einer späteren Stelle trotzdem genau die gesuchte Aktionsphase enthalten. +Für diese Probe wird deshalb die grobe Szenenähnlichkeit ohne harte +Aktionsstrafe gerankt; die harte Aktionsprüfung greift erst auf den lokalen +Fenstern und dem finalen Source-Zeitbereich. Nach dem CV-Match kann derselbe Vision-Layer den konkreten finalen Source- Zeitbereich nochmals gegen den Trailer-Beat prüfen. Starke Aktionsphasen wie Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch diff --git a/src/llm/vision_cache.py b/src/llm/vision_cache.py index ee80232..c03c992 100644 --- a/src/llm/vision_cache.py +++ b/src/llm/vision_cache.py @@ -365,7 +365,7 @@ def _add_window_seed_descriptions( probe_limit = max( cfg.vision.max_seed_scenes * 4, - cfg.vision.scene_candidate_top_k // 2, + cfg.vision.scene_candidate_top_k, ) scenes_to_probe = ranked[: max(1, min(len(ranked), probe_limit))] windows_per_scene = max(1, min(6, cfg.vision.seed_points_per_scene // 2)) @@ -460,6 +460,7 @@ def build_vision_seed_in_points( ) ranked_by_scene: dict[int, tuple[float, Scene, str]] = {} + probe_ranked_by_scene: dict[int, tuple[float, Scene, str]] = {} for hit in hits: scene = scenes_by_id.get(hit.scene_id) if scene is None: @@ -477,6 +478,13 @@ def build_vision_seed_in_points( ) if not scene_desc: continue + probe_score = _text_similarity(beat_desc, scene_desc) + if probe_score >= cfg.vision.similarity_threshold: + probe_ranked_by_scene[scene.scene_id] = ( + min(0.99, probe_score + 0.25), + scene, + "probe", + ) score, _reason = _semantic_match_score(beat_desc, scene_desc) if score >= cfg.vision.similarity_threshold: ranked_by_scene[scene.scene_id] = (min(0.99, score + 0.25), scene, "vision") @@ -488,8 +496,17 @@ def build_vision_seed_in_points( existing = ranked_by_scene.get(scene.scene_id) if existing is None or vibe_score > existing[0]: ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe") + probe_existing = probe_ranked_by_scene.get(scene.scene_id) + if probe_existing is None or vibe_score > probe_existing[0]: + probe_ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe") for scene, scene_desc in _cached_scene_descriptions(cache, scenes_by_id, cfg): + probe_score = _text_similarity(beat_desc, scene_desc) + if probe_score >= cfg.vision.similarity_threshold: + probe_semantic_score = min(0.99, probe_score + 0.25) + probe_existing = probe_ranked_by_scene.get(scene.scene_id) + if probe_existing is None or probe_semantic_score > probe_existing[0]: + probe_ranked_by_scene[scene.scene_id] = (probe_semantic_score, scene, "probe-cache") score, _reason = _semantic_match_score(beat_desc, scene_desc) if score < cfg.vision.similarity_threshold: continue @@ -499,10 +516,11 @@ def build_vision_seed_in_points( ranked_by_scene[scene.scene_id] = (semantic_score, scene, "cache") ranked = sorted(ranked_by_scene.values(), key=lambda item: item[0], reverse=True) + probe_ranked = sorted(probe_ranked_by_scene.values(), key=lambda item: item[0], reverse=True) window_points = _add_window_seed_descriptions( beat=beat, beat_desc=beat_desc, - ranked=ranked, + ranked=probe_ranked, cfg=cfg, cache=cache, budget=budget,