Use beat context for segmented action retiming

This commit is contained in:
Melbar
2026-05-02 21:58:35 +02:00
parent 8415516f89
commit e293835a86
3 changed files with 104 additions and 9 deletions
+6
View File
@@ -185,6 +185,12 @@ Diese In-Scene-Reparatur läuft auch für semantisch gültige Treffer aus langen
Source-Szenen. Dadurch kann ein grob passender Dialogmoment nicht bestehen
bleiben, wenn ein anderes lokales Fenster derselben Szene die gesuchte
Aktionsphase und Bewegung klarer trifft.
Bei Beats mit Blenden oder bei segmentierten Beats nutzt die semantische Action-Suche den
ganzen Trailerbeat als Kontext. Die eigentliche Frame-Ausrichtung bleibt auf das
sichtbare Segment begrenzt; der gefundene Source-Inpoint wird dabei um den
Trailer-Offset des Segments verschoben. So geht die globale Aktionsbeschreibung
eines Beats nicht verloren, nur weil der scorebare Teil erst nach einer Blende
beginnt.
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
+8 -4
View File
@@ -646,14 +646,18 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
beats_by_id = {beat.beat_id: beat for beat in beats}
scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
def realign_window(check_beat, scene_id: int):
def realign_window(check_beat, scene_id: int, action_beat=None):
scene = scenes_by_id.get(scene_id)
if scene is None:
return None
found = find_action_window_in_scene(check_beat, scene, cfg)
found = find_action_window_in_scene(action_beat or check_beat, scene, cfg)
if found is None:
return None
start_s, end_s, semantic_score, reason = found
if action_beat is not None:
offset_delta_s = max(0.0, check_beat.start_s - action_beat.start_s)
start_s += offset_delta_s
end_s += offset_delta_s
window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0))
aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion(
check_beat,
@@ -737,7 +741,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
start_s=beat.start_s + segment.trailer_offset_s,
end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
)
repair = realign_window(segment_beat, segment.scene_id)
repair = realign_window(segment_beat, segment.scene_id, action_beat=beat)
if repair is None:
new_segments.append(segment)
continue
@@ -809,7 +813,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
start_s=beat.start_s + segment.trailer_offset_s,
end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
)
repair = realign_window(segment_beat, segment.scene_id)
repair = realign_window(segment_beat, segment.scene_id, action_beat=beat)
if repair is None:
all_repaired = False
break
+90 -5
View File
@@ -283,7 +283,7 @@ def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, s
beat_actions = _semantic_action_groups(beat_desc)
candidate_actions = _semantic_action_groups(candidate_desc)
required = beat_actions & _STRONG_ACTION_GROUPS
missing = required - candidate_actions
missing = _missing_action_groups(required, candidate_actions)
if missing:
penalty = min(0.45, 0.18 * len(missing))
text_score = max(0.0, text_score - penalty)
@@ -297,6 +297,80 @@ def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, s
return text_score, reason
def _missing_action_groups(required: set[str], candidate_actions: set[str]) -> set[str]:
missing = set(required) - set(candidate_actions)
if "approach" in missing and ({"kiss", "forehead_touch"} & set(candidate_actions)):
missing.remove("approach")
return missing
def _action_phase_text(desc: str) -> str:
match = re.search(r'"action_phase"\s*:\s*"([^"]+)"', desc, flags=re.IGNORECASE | re.DOTALL)
if match:
return match.group(1).lower()
return desc.lower()
def _action_phase_adjustment(beat_desc: str, candidate_desc: str) -> tuple[float, str]:
    """
    Compute a score correction for how well the candidate's action phase fits the beat.

    Two moments of the same long scene can read almost alike in vision text
    (same people, same framing) while sitting at different points in time,
    e.g. the approach towards a touch versus the talk that follows it. This
    function compares only the action-phase text of both descriptions using
    broad keyword (substring) checks.

    Returns a tuple ``(adjustment, notes)``:
    - ``adjustment`` is the signed sum of every rule that fired.
    - ``notes`` is a comma-separated list of the fired rule names in
      evaluation order, or ``"phase_neutral"`` when no rule fired.
    """
    wanted = _action_phase_text(beat_desc)
    offered = _action_phase_text(candidate_desc)

    def mentions(text: str, *terms: str) -> bool:
        # Plain substring test; no word-boundary handling.
        return any(term in text for term in terms)

    wants_motion = mentions(
        wanted,
        "moving", "move", "approach", "approaching", "lean", "leaning",
        "raise", "raising", "lift", "lifting", "turn", "turning",
        "pull", "pulling", "transition",
    )
    offers_stillness = mentions(
        offered,
        "static", "conversation", "talking", "speaking", "listening",
        "subtle facial", "slight facial",
    )
    wants_contact = mentions(wanted, "forehead", "foreheads", "touch", "kiss")
    offers_contact = mentions(
        offered,
        "forehead", "foreheads", "touch", "touching", "kiss", "kissing",
    )
    offers_aftermath = mentions(
        offered,
        "gap", "pull back", "pulling back", "look at each other",
        "looks at each other", "conversation",
    )
    wants_kiss = mentions(wanted, "kiss", "kissing")
    offers_kiss = mentions(offered, "kiss", "kissing")

    # Collect (delta, label) pairs in the order the rules are evaluated.
    fired: list[tuple[float, str]] = []

    if wants_motion and offers_stillness:
        fired.append((-0.18, "dynamic_vs_static"))

    if wants_contact:
        if offers_contact:
            fired.append((0.08, "contact"))
        # The beat leads up to contact ("until"), but the candidate looks
        # like the moment after it.
        if offers_aftermath and "until" in wanted:
            fired.append((-0.28, "after_contact"))

    if offers_kiss:
        if wants_kiss:
            fired.append((0.12, "kiss"))
        elif wants_contact:
            fired.append((0.06, "kiss_contact"))

    if "conversation" in offered and "talk" not in wanted:
        fired.append((-0.10, "unexpected_conversation"))

    # Accumulate step by step (not via sum()) so the float result follows
    # the same left-to-right additions as the individual rules.
    total = 0.0
    labels: list[str] = []
    for delta, label in fired:
        total += delta
        labels.append(label)

    if not labels:
        return total, "phase_neutral"
    return total, ",".join(labels)
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
if max_points <= 1 or scene.duration_s <= 0:
return [scene.start_s]
@@ -612,7 +686,7 @@ def validate_match_window_with_vision(
score, reason = _semantic_match_score(beat_desc, source_desc)
beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
source_actions = _semantic_action_groups(source_desc)
missing_actions = beat_actions - source_actions
missing_actions = _missing_action_groups(beat_actions, source_actions)
threshold = max(0.32, cfg.vision.similarity_threshold + 0.12)
if missing_actions and score < threshold:
return False, f"{reason} missing_actions={sorted(missing_actions)}"
@@ -675,14 +749,25 @@ def find_action_window_in_scene(
continue
score, reason = _semantic_match_score(beat_desc, desc)
source_actions = _semantic_action_groups(desc)
missing_actions = beat_actions - source_actions
missing_actions = _missing_action_groups(beat_actions, source_actions)
if missing_actions:
continue
threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
if score < threshold:
continue
candidate = (start_s, end_s, score, reason)
if best is None or candidate[2] > best[2]:
phase_adjustment, phase_reason = _action_phase_adjustment(beat_desc, desc)
adjusted_score = max(0.0, min(1.0, score + phase_adjustment))
if adjusted_score < threshold:
continue
candidate = (
start_s,
end_s,
adjusted_score,
f"{reason} phase={phase_reason} raw={score:.3f}",
)
if best is None or candidate[2] > best[2] + 0.03 or (
candidate[2] >= best[2] - 0.03 and candidate[0] < best[0]
):
best = candidate
_save_cache(cfg, cache)