Use beat context for segmented action retiming
This commit is contained in:
@@ -185,6 +185,12 @@ Diese In-Scene-Reparatur läuft auch für semantisch gültige Treffer aus langen
|
||||
Source-Szenen. Dadurch kann ein grob passender Dialogmoment nicht bestehen
|
||||
bleiben, wenn ein anderes lokales Fenster derselben Szene die gesuchte
|
||||
Aktionsphase und Bewegung klarer trifft.
|
||||
Bei blendigen oder segmentierten Beats nutzt die semantische Action-Suche den
|
||||
ganzen Trailerbeat als Kontext. Die eigentliche Frame-Ausrichtung bleibt auf das
|
||||
sichtbare Segment begrenzt; der gefundene Source-Inpoint wird dabei um den
|
||||
Trailer-Offset des Segments verschoben. So geht die globale Aktionsbeschreibung
|
||||
eines Beats nicht verloren, nur weil der scorebare Teil erst nach einer Blende
|
||||
beginnt.
|
||||
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
|
||||
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
|
||||
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
|
||||
|
||||
@@ -646,14 +646,18 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
|
||||
beats_by_id = {beat.beat_id: beat for beat in beats}
|
||||
scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
|
||||
|
||||
def realign_window(check_beat, scene_id: int):
|
||||
def realign_window(check_beat, scene_id: int, action_beat=None):
|
||||
scene = scenes_by_id.get(scene_id)
|
||||
if scene is None:
|
||||
return None
|
||||
found = find_action_window_in_scene(check_beat, scene, cfg)
|
||||
found = find_action_window_in_scene(action_beat or check_beat, scene, cfg)
|
||||
if found is None:
|
||||
return None
|
||||
start_s, end_s, semantic_score, reason = found
|
||||
if action_beat is not None:
|
||||
offset_delta_s = max(0.0, check_beat.start_s - action_beat.start_s)
|
||||
start_s += offset_delta_s
|
||||
end_s += offset_delta_s
|
||||
window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0))
|
||||
aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion(
|
||||
check_beat,
|
||||
@@ -737,7 +741,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
|
||||
start_s=beat.start_s + segment.trailer_offset_s,
|
||||
end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
|
||||
)
|
||||
repair = realign_window(segment_beat, segment.scene_id)
|
||||
repair = realign_window(segment_beat, segment.scene_id, action_beat=beat)
|
||||
if repair is None:
|
||||
new_segments.append(segment)
|
||||
continue
|
||||
@@ -809,7 +813,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
|
||||
start_s=beat.start_s + segment.trailer_offset_s,
|
||||
end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
|
||||
)
|
||||
repair = realign_window(segment_beat, segment.scene_id)
|
||||
repair = realign_window(segment_beat, segment.scene_id, action_beat=beat)
|
||||
if repair is None:
|
||||
all_repaired = False
|
||||
break
|
||||
|
||||
+90
-5
@@ -283,7 +283,7 @@ def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, s
|
||||
beat_actions = _semantic_action_groups(beat_desc)
|
||||
candidate_actions = _semantic_action_groups(candidate_desc)
|
||||
required = beat_actions & _STRONG_ACTION_GROUPS
|
||||
missing = required - candidate_actions
|
||||
missing = _missing_action_groups(required, candidate_actions)
|
||||
if missing:
|
||||
penalty = min(0.45, 0.18 * len(missing))
|
||||
text_score = max(0.0, text_score - penalty)
|
||||
@@ -297,6 +297,80 @@ def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, s
|
||||
return text_score, reason
|
||||
|
||||
|
||||
def _missing_action_groups(required: set[str], candidate_actions: set[str]) -> set[str]:
|
||||
missing = set(required) - set(candidate_actions)
|
||||
if "approach" in missing and ({"kiss", "forehead_touch"} & set(candidate_actions)):
|
||||
missing.remove("approach")
|
||||
return missing
|
||||
|
||||
|
||||
def _action_phase_text(desc: str) -> str:
|
||||
match = re.search(r'"action_phase"\s*:\s*"([^"]+)"', desc, flags=re.IGNORECASE | re.DOTALL)
|
||||
if match:
|
||||
return match.group(1).lower()
|
||||
return desc.lower()
|
||||
|
||||
|
||||
def _action_phase_adjustment(beat_desc: str, candidate_desc: str) -> tuple[float, str]:
    """
    Score whether a candidate is in the same action phase, not just the same scene.

    Vision text can make two moments in one long dialogue sound equally similar:
    "moving closer until foreheads touch" and "static conversation after contact"
    share subjects/composition, but their temporal phase is different. This
    adjustment is intentionally generic and only looks for broad phase words.

    Matching is plain substring containment on the lowercased phase text, so
    inflected forms such as "moves" or "touches" also hit.

    Args:
        beat_desc: Vision description of the beat being matched.
        candidate_desc: Vision description of the candidate window.

    Returns:
        ``(adjustment, notes)``: the signed sum of every bonus and penalty that
        fired, and a comma-separated list of the rule names that fired, or
        ``"phase_neutral"`` when none did.
    """

    def mentions(text: str, terms: tuple[str, ...]) -> bool:
        # True when at least one term occurs as a substring of text.
        return any(term in text for term in terms)

    beat_phase = _action_phase_text(beat_desc)
    candidate_phase = _action_phase_text(candidate_desc)
    adjustment = 0.0
    notes: list[str] = []

    # Rule 1: the beat describes movement but the candidate reads as static.
    beat_dynamic = mentions(beat_phase, (
        "moving", "move", "approach", "approaching", "lean", "leaning",
        "raise", "raising", "lift", "lifting", "turn", "turning",
        "pull", "pulling", "transition",
    ))
    candidate_static = mentions(candidate_phase, (
        "static", "conversation", "talking", "speaking", "listening",
        "subtle facial", "slight facial",
    ))
    if beat_dynamic and candidate_static:
        adjustment -= 0.18
        notes.append("dynamic_vs_static")

    # Rule 2: contact phases (forehead touch / kiss).
    beat_contact_target = mentions(beat_phase, (
        "forehead", "foreheads", "touch", "kiss",
    ))
    candidate_contact = mentions(candidate_phase, (
        "forehead", "foreheads", "touch", "touching", "kiss", "kissing",
    ))
    candidate_after_contact = mentions(candidate_phase, (
        "gap", "pull back", "pulling back", "look at each other",
        "looks at each other", "conversation",
    ))
    if beat_contact_target and candidate_contact:
        adjustment += 0.08
        notes.append("contact")
    # Only a beat phrased as "... until <contact>" is penalised for a candidate
    # that reads like the moment after the contact.
    if beat_contact_target and candidate_after_contact and "until" in beat_phase:
        adjustment -= 0.28
        notes.append("after_contact")

    # Rule 3: kiss bonus. "kissing" contains "kiss", so one substring test
    # covers both forms.
    beat_kiss = "kiss" in beat_phase
    candidate_kiss = "kiss" in candidate_phase
    if beat_kiss and candidate_kiss:
        adjustment += 0.12
        notes.append("kiss")
    elif beat_contact_target and candidate_kiss:
        adjustment += 0.06
        notes.append("kiss_contact")

    # Rule 4: a conversational candidate is only "unexpected" when the beat
    # itself is not conversational. Checking "talk" alone would penalise a
    # beat whose phase is literally "conversation" or "speaking" for matching
    # a conversational candidate.
    beat_conversational = mentions(beat_phase, ("talk", "conversation", "speak"))
    if not beat_conversational and "conversation" in candidate_phase:
        adjustment -= 0.10
        notes.append("unexpected_conversation")

    return adjustment, (",".join(notes) if notes else "phase_neutral")
|
||||
|
||||
|
||||
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
|
||||
if max_points <= 1 or scene.duration_s <= 0:
|
||||
return [scene.start_s]
|
||||
@@ -612,7 +686,7 @@ def validate_match_window_with_vision(
|
||||
score, reason = _semantic_match_score(beat_desc, source_desc)
|
||||
beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
|
||||
source_actions = _semantic_action_groups(source_desc)
|
||||
missing_actions = beat_actions - source_actions
|
||||
missing_actions = _missing_action_groups(beat_actions, source_actions)
|
||||
threshold = max(0.32, cfg.vision.similarity_threshold + 0.12)
|
||||
if missing_actions and score < threshold:
|
||||
return False, f"{reason} missing_actions={sorted(missing_actions)}"
|
||||
@@ -675,14 +749,25 @@ def find_action_window_in_scene(
|
||||
continue
|
||||
score, reason = _semantic_match_score(beat_desc, desc)
|
||||
source_actions = _semantic_action_groups(desc)
|
||||
missing_actions = beat_actions - source_actions
|
||||
missing_actions = _missing_action_groups(beat_actions, source_actions)
|
||||
if missing_actions:
|
||||
continue
|
||||
threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
|
||||
if score < threshold:
|
||||
continue
|
||||
candidate = (start_s, end_s, score, reason)
|
||||
if best is None or candidate[2] > best[2]:
|
||||
phase_adjustment, phase_reason = _action_phase_adjustment(beat_desc, desc)
|
||||
adjusted_score = max(0.0, min(1.0, score + phase_adjustment))
|
||||
if adjusted_score < threshold:
|
||||
continue
|
||||
candidate = (
|
||||
start_s,
|
||||
end_s,
|
||||
adjusted_score,
|
||||
f"{reason} phase={phase_reason} raw={score:.3f}",
|
||||
)
|
||||
if best is None or candidate[2] > best[2] + 0.03 or (
|
||||
candidate[2] >= best[2] - 0.03 and candidate[0] < best[0]
|
||||
):
|
||||
best = candidate
|
||||
|
||||
_save_cache(cfg, cache)
|
||||
|
||||
Reference in New Issue
Block a user