def realign_window(check_beat, scene_id: int):
    """
    Try to move a rejected match onto the beat's action phase within one scene.

    This helper is defined inside ``_filter_semantically_invalid_vision_matches``
    and reads ``scenes_by_id``, ``cfg`` and ``logger`` from that enclosing scope.

    Steps:
      1. Look up the scene by id.
      2. Ask ``find_action_window_in_scene`` for the best matching vision window.
      3. Refine the in-point around that window's start with
         ``align_in_point_by_content``.
      4. Clamp the in-point to the scene bounds.
      5. Re-check the resulting window with ``validate_match_window_with_vision``.

    Args:
        check_beat: Trailer beat (or a per-segment copy of it) to re-anchor.
        scene_id: Id of the source scene the original match pointed at.

    Returns:
        ``(scene, aligned_in_s, score, reason)`` when the realigned window
        passes vision validation, otherwise ``None``.

    NOTE(review): the segment caller passes beats rebuilt with
    ``dataclasses.replace(beat, start_s=..., end_s=...)``. This helper relies on
    ``check_beat.duration_s`` reflecting those bounds -- confirm that it is
    derived from ``start_s``/``end_s`` rather than stored as its own field.
    """
    scene = scenes_by_id.get(scene_id)
    if scene is None:
        return None

    action_window = find_action_window_in_scene(check_beat, scene, cfg)
    if action_window is None:
        return None
    window_start_s, window_end_s, semantic_score, reason = action_window

    # Search radius: 1.5x the action window length, kept within [1.0, 4.0] s.
    search_radius_s = (window_end_s - window_start_s) * 1.5
    search_radius_s = max(1.0, min(4.0, search_radius_s))

    aligned_in_s, content_score = align_in_point_by_content(
        check_beat,
        window_start_s,
        cfg,
        search_window_s=search_radius_s,
    )

    # Keep the clip inside the scene: never before its start and, where the
    # scene is long enough, never so late that the clip runs past its end.
    latest_in_s = max(scene.start_s, scene.end_s - check_beat.duration_s)
    aligned_in_s = max(scene.start_s, min(aligned_in_s, latest_in_s))

    ok, verify_reason = validate_match_window_with_vision(
        check_beat,
        source_path=scene.source_path,
        scene_id=scene.scene_id,
        in_point_s=aligned_in_s,
        out_point_s=aligned_in_s + check_beat.duration_s,
        cfg=cfg,
    )
    if ok:
        # The semantic score is capped at 0.99 before it competes with the
        # content score.
        capped_semantic = min(0.99, semantic_score)
        score = max(content_score, capped_semantic)
        return scene, aligned_in_s, score, f"{reason}; {verify_reason}"

    logger.info(
        "Beat %d: action-window realign rejected scene=%d in=%.3fs (%s)",
        check_beat.beat_id,
        scene.scene_id,
        aligned_in_s,
        verify_reason,
    )
    return None
def find_action_window_in_scene(
    beat: TrailerBeat,
    scene: Scene,
    cfg: AppConfig,
) -> tuple[float, float, float, str] | None:
    """
    Search one already selected source scene for the beat's action phase.

    This is used after CV picked the right broad scene but the wrong time
    inside that scene. The trailer beat is described first; then candidate
    windows from ``_scene_window_ranges`` are described and compared against
    it. A window qualifies only if it contains every strong action group of
    the beat and its semantic score reaches the threshold.

    The description budget is created fresh on every call from
    ``cfg.vision.max_new_descriptions_per_run``, so it limits one call, not
    the whole run.

    Args:
        beat: Trailer beat whose action phase is searched for.
        scene: Source scene to search.
        cfg: Application config; reads ``cfg.vision.enabled``,
            ``max_new_descriptions_per_run``, ``seed_points_per_scene`` and
            ``similarity_threshold``.

    Returns:
        ``(start_s, end_s, score, reason)`` of the highest-scoring qualifying
        window (the earliest one wins ties), or ``None`` when vision is
        disabled, the scene has no positive duration, the beat cannot be
        described, the beat has no strong action group, or no window
        qualifies.
    """
    if not cfg.vision.enabled or scene.duration_s <= 0:
        return None

    cache = _load_cache(cfg)
    budget = [max(0, cfg.vision.max_new_descriptions_per_run)]
    try:
        beat_desc = _describe_sample(
            kind="beat",
            item_id=beat.beat_id,
            label=f"trailer beat {beat.beat_id} action search",
            video_path=beat.trailer_path,
            start_s=beat.start_s,
            end_s=beat.end_s,
            cfg=cfg,
            cache=cache,
            budget=budget,
        )
        if not beat_desc:
            return None

        beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
        if not beat_actions:
            return None

        # Loop-invariant: stricter than the plain similarity threshold.
        threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
        max_windows = max(
            cfg.vision.seed_points_per_scene,
            cfg.vision.max_new_descriptions_per_run,
        )

        best: tuple[float, float, float, str] | None = None
        for start_s, end_s in _scene_window_ranges(scene, beat, max_windows):
            desc = _describe_sample(
                kind="action_window",
                item_id=scene.scene_id,
                label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
                video_path=scene.source_path,
                start_s=start_s,
                end_s=end_s,
                cfg=cfg,
                cache=cache,
                budget=budget,
            )
            if not desc:
                continue
            score, reason = _semantic_match_score(beat_desc, desc)
            # Every strong action of the beat must appear in the window.
            if beat_actions - _semantic_action_groups(desc):
                continue
            if score < threshold:
                continue
            # Strict '>' keeps the earliest window on equal scores.
            if best is None or score > best[2]:
                best = (start_s, end_s, score, reason)
        return best
    finally:
        # Save on every exit path, including the early returns and errors, so
        # descriptions added to ``cache`` during this call are not dropped.
        _save_cache(cfg, cache)