diff --git a/README.md b/README.md index 0c10105..6f79a94 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,12 @@ Diese In-Scene-Reparatur läuft auch für semantisch gültige Treffer aus langen Source-Szenen. Dadurch kann ein grob passender Dialogmoment nicht bestehen bleiben, wenn ein anderes lokales Fenster derselben Szene die gesuchte Aktionsphase und Bewegung klarer trifft. +Bei blendigen oder segmentierten Beats nutzt die semantische Action-Suche den +ganzen Trailerbeat als Kontext. Die eigentliche Frame-Ausrichtung bleibt auf das +sichtbare Segment begrenzt; der gefundene Source-Inpoint wird dabei um den +Trailer-Offset des Segments verschoben. So geht die globale Aktionsbeschreibung +eines Beats nicht verloren, nur weil der scorebare Teil erst nach einer Blende +beginnt. Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete diff --git a/cli.py b/cli.py index 69d3637..c0be792 100644 --- a/cli.py +++ b/cli.py @@ -646,14 +646,18 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) beats_by_id = {beat.beat_id: beat for beat in beats} scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)} - def realign_window(check_beat, scene_id: int): + def realign_window(check_beat, scene_id: int, action_beat=None): scene = scenes_by_id.get(scene_id) if scene is None: return None - found = find_action_window_in_scene(check_beat, scene, cfg) + found = find_action_window_in_scene(action_beat or check_beat, scene, cfg) if found is None: return None start_s, end_s, semantic_score, reason = found + if action_beat is not None: + offset_delta_s = max(0.0, check_beat.start_s - action_beat.start_s) + start_s += offset_delta_s + end_s += offset_delta_s window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0)) aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion( check_beat, @@ -737,7 +741,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) start_s=beat.start_s + segment.trailer_offset_s, end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s, ) - repair = realign_window(segment_beat, segment.scene_id) + repair = realign_window(segment_beat, segment.scene_id, action_beat=beat) if repair is None: new_segments.append(segment) continue @@ -809,7 +813,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) start_s=beat.start_s + segment.trailer_offset_s, end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s, ) - repair = realign_window(segment_beat, segment.scene_id) + repair = realign_window(segment_beat, segment.scene_id, action_beat=beat) if repair is None: all_repaired = False break diff --git a/src/llm/vision_cache.py b/src/llm/vision_cache.py index c03c992..a621ba8 100644 --- a/src/llm/vision_cache.py +++ b/src/llm/vision_cache.py @@ -283,7 +283,7 @@ def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, s beat_actions = _semantic_action_groups(beat_desc) candidate_actions = _semantic_action_groups(candidate_desc) required = beat_actions & _STRONG_ACTION_GROUPS - missing = required - candidate_actions + missing = _missing_action_groups(required, candidate_actions) if missing: penalty = min(0.45, 0.18 * len(missing)) text_score = max(0.0, text_score - penalty) @@ -297,6 +297,80 @@ def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, s return text_score, reason +def _missing_action_groups(required: set[str], candidate_actions: set[str]) -> set[str]: + missing = set(required) - set(candidate_actions) + if "approach" in missing and ({"kiss", "forehead_touch"} & set(candidate_actions)): + missing.remove("approach") + return missing + + +def _action_phase_text(desc: str) -> str: + match = re.search(r'"action_phase"\s*:\s*"([^"]+)"', desc, flags=re.IGNORECASE | re.DOTALL) + if match: + return match.group(1).lower() + return desc.lower() + + +def _action_phase_adjustment(beat_desc: str, candidate_desc: str) -> tuple[float, str]: + """ + Score whether a candidate is in the same action phase, not just the same scene. + + Vision text can make two moments in one long dialogue sound equally similar: + "moving closer until foreheads touch" and "static conversation after contact" + share subjects/composition, but their temporal phase is different. This + adjustment is intentionally generic and only looks for broad phase words. + """ + beat_phase = _action_phase_text(beat_desc) + candidate_phase = _action_phase_text(candidate_desc) + adjustment = 0.0 + notes: list[str] = [] + + beat_dynamic = any(term in beat_phase for term in ( + "moving", "move", "approach", "approaching", "lean", "leaning", + "raise", "raising", "lift", "lifting", "turn", "turning", + "pull", "pulling", "transition", + )) + candidate_static = any(term in candidate_phase for term in ( + "static", "conversation", "talking", "speaking", "listening", + "subtle facial", "slight facial", + )) + if beat_dynamic and candidate_static: + adjustment -= 0.18 + notes.append("dynamic_vs_static") + + beat_contact_target = any(term in beat_phase for term in ( + "forehead", "foreheads", "touch", "kiss", + )) + candidate_contact = any(term in candidate_phase for term in ( + "forehead", "foreheads", "touch", "touching", "kiss", "kissing", + )) + candidate_after_contact = any(term in candidate_phase for term in ( + "gap", "pull back", "pulling back", "look at each other", + "looks at each other", "conversation", + )) + if beat_contact_target and candidate_contact: + adjustment += 0.08 + notes.append("contact") + if beat_contact_target and candidate_after_contact and "until" in beat_phase: + adjustment -= 0.28 + notes.append("after_contact") + + beat_kiss = "kiss" in beat_phase or "kissing" in beat_phase + candidate_kiss = "kiss" in candidate_phase or "kissing" in candidate_phase + if beat_kiss and candidate_kiss: + adjustment += 0.12 + notes.append("kiss") + elif beat_contact_target and candidate_kiss: + adjustment += 0.06 + notes.append("kiss_contact") + + if "talk" not in beat_phase and "conversation" in candidate_phase: + adjustment -= 0.10 + notes.append("unexpected_conversation") + + return adjustment, ",".join(notes) if notes else "phase_neutral" + + def _scene_seed_points(scene: Scene, max_points: int) -> list[float]: if max_points <= 1 or scene.duration_s <= 0: return [scene.start_s] @@ -612,7 +686,7 @@ def validate_match_window_with_vision( score, reason = _semantic_match_score(beat_desc, source_desc) beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS source_actions = _semantic_action_groups(source_desc) - missing_actions = beat_actions - source_actions + missing_actions = _missing_action_groups(beat_actions, source_actions) threshold = max(0.32, cfg.vision.similarity_threshold + 0.12) if missing_actions and score < threshold: return False, f"{reason} missing_actions={sorted(missing_actions)}" @@ -675,14 +749,25 @@ def find_action_window_in_scene( continue score, reason = _semantic_match_score(beat_desc, desc) source_actions = _semantic_action_groups(desc) - missing_actions = beat_actions - source_actions + missing_actions = _missing_action_groups(beat_actions, source_actions) if missing_actions: continue threshold = max(0.38, cfg.vision.similarity_threshold + 0.18) if score < threshold: continue - candidate = (start_s, end_s, score, reason) - if best is None or candidate[2] > best[2]: + phase_adjustment, phase_reason = _action_phase_adjustment(beat_desc, desc) + adjusted_score = max(0.0, min(1.0, score + phase_adjustment)) + if adjusted_score < threshold: + continue + candidate = ( + start_s, + end_s, + adjusted_score, + f"{reason} phase={phase_reason} raw={score:.3f}", + ) + if best is None or candidate[2] > best[2] + 0.03 or ( + candidate[2] >= best[2] - 0.03 and candidate[0] < best[0] + ): best = candidate _save_cache(cfg, cache)