Use beat context for segmented action retiming

This commit is contained in:
Melbar
2026-05-02 21:58:35 +02:00
parent 8415516f89
commit e293835a86
3 changed files with 104 additions and 9 deletions
+90 -5
View File
@@ -283,7 +283,7 @@ def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, s
beat_actions = _semantic_action_groups(beat_desc)
candidate_actions = _semantic_action_groups(candidate_desc)
required = beat_actions & _STRONG_ACTION_GROUPS
missing = required - candidate_actions
missing = _missing_action_groups(required, candidate_actions)
if missing:
penalty = min(0.45, 0.18 * len(missing))
text_score = max(0.0, text_score - penalty)
@@ -297,6 +297,80 @@ def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, s
return text_score, reason
def _missing_action_groups(required: set[str], candidate_actions: set[str]) -> set[str]:
missing = set(required) - set(candidate_actions)
if "approach" in missing and ({"kiss", "forehead_touch"} & set(candidate_actions)):
missing.remove("approach")
return missing
def _action_phase_text(desc: str) -> str:
match = re.search(r'"action_phase"\s*:\s*"([^"]+)"', desc, flags=re.IGNORECASE | re.DOTALL)
if match:
return match.group(1).lower()
return desc.lower()
def _action_phase_adjustment(beat_desc: str, candidate_desc: str) -> tuple[float, str]:
    """
    Score whether a candidate is in the same action phase, not just the same scene.

    Vision text can make two moments in one long dialogue sound equally similar:
    "moving closer until foreheads touch" and "static conversation after contact"
    share subjects/composition, but their temporal phase is different. This
    adjustment is intentionally generic and only looks for broad phase words.

    Both descriptions are reduced to their phase text with
    ``_action_phase_text`` and then checked for phase words by plain substring
    containment.

    Returns:
        ``(adjustment, notes)``: ``adjustment`` is the signed sum of every
        bonus/penalty that fired, and ``notes`` is the comma-joined labels of
        those rules in evaluation order, or ``"phase_neutral"`` if none fired.
    """

    def mentions(text: str, terms: tuple[str, ...]) -> bool:
        # Substring containment, so "move" also matches "moves"/"moved".
        return any(term in text for term in terms)

    beat_phase = _action_phase_text(beat_desc)
    candidate_phase = _action_phase_text(candidate_desc)
    adjustment = 0.0
    notes: list[str] = []

    # Beat describes motion while the candidate describes a settled moment.
    beat_dynamic = mentions(beat_phase, (
        "moving", "move", "approach", "approaching", "lean", "leaning",
        "raise", "raising", "lift", "lifting", "turn", "turning",
        "pull", "pulling", "transition",
    ))
    candidate_static = mentions(candidate_phase, (
        "static", "conversation", "talking", "speaking", "listening",
        "subtle facial", "slight facial",
    ))
    if beat_dynamic and candidate_static:
        adjustment -= 0.18
        notes.append("dynamic_vs_static")

    beat_contact_target = mentions(beat_phase, (
        "forehead", "foreheads", "touch", "kiss",
    ))
    candidate_contact = mentions(candidate_phase, (
        "forehead", "foreheads", "touch", "touching", "kiss", "kissing",
    ))
    candidate_after_contact = mentions(candidate_phase, (
        "gap", "pull back", "pulling back", "look at each other",
        "looks at each other", "conversation",
    ))
    if beat_contact_target and candidate_contact:
        adjustment += 0.08
        notes.append("contact")
    # "until" in the beat means the contact is where the beat ends, so a
    # candidate that reads like the moment after contact is the wrong phase.
    if beat_contact_target and candidate_after_contact and "until" in beat_phase:
        adjustment -= 0.28
        notes.append("after_contact")

    # "kissing" contains "kiss", so one substring test covers both forms.
    beat_kiss = "kiss" in beat_phase
    candidate_kiss = "kiss" in candidate_phase
    if beat_kiss and candidate_kiss:
        adjustment += 0.12
        notes.append("kiss")
    elif beat_contact_target and candidate_kiss:
        adjustment += 0.06
        notes.append("kiss_contact")

    # A conversation in the candidate is only unexpected when the beat itself
    # does not describe one; checking "talk" alone would penalise a beat whose
    # phase says "conversation" for matching a conversation candidate.
    beat_expects_conversation = mentions(beat_phase, ("talk", "conversation"))
    if not beat_expects_conversation and "conversation" in candidate_phase:
        adjustment -= 0.10
        notes.append("unexpected_conversation")

    return adjustment, ",".join(notes) if notes else "phase_neutral"
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
if max_points <= 1 or scene.duration_s <= 0:
return [scene.start_s]
@@ -612,7 +686,7 @@ def validate_match_window_with_vision(
score, reason = _semantic_match_score(beat_desc, source_desc)
beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
source_actions = _semantic_action_groups(source_desc)
missing_actions = beat_actions - source_actions
missing_actions = _missing_action_groups(beat_actions, source_actions)
threshold = max(0.32, cfg.vision.similarity_threshold + 0.12)
if missing_actions and score < threshold:
return False, f"{reason} missing_actions={sorted(missing_actions)}"
@@ -675,14 +749,25 @@ def find_action_window_in_scene(
continue
score, reason = _semantic_match_score(beat_desc, desc)
source_actions = _semantic_action_groups(desc)
missing_actions = beat_actions - source_actions
missing_actions = _missing_action_groups(beat_actions, source_actions)
if missing_actions:
continue
threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
if score < threshold:
continue
candidate = (start_s, end_s, score, reason)
if best is None or candidate[2] > best[2]:
phase_adjustment, phase_reason = _action_phase_adjustment(beat_desc, desc)
adjusted_score = max(0.0, min(1.0, score + phase_adjustment))
if adjusted_score < threshold:
continue
candidate = (
start_s,
end_s,
adjusted_score,
f"{reason} phase={phase_reason} raw={score:.3f}",
)
if best is None or candidate[2] > best[2] + 0.03 or (
candidate[2] >= best[2] - 0.03 and candidate[0] < best[0]
):
best = candidate
_save_cache(cfg, cache)