Reject vision matches with action phase mismatches

This commit is contained in:
Melbar
2026-05-02 16:49:47 +02:00
parent d9e470c877
commit 1a177d6b89
3 changed files with 177 additions and 3 deletions
+5
View File
@@ -159,6 +159,11 @@ Vision-Modell stammen. Bei langen semantisch passenden Source-Szenen beschreibt
der Vision-Layer zusätzlich wenige lokale Zeitfenster und cached auch diese
Fenster, damit eine grob ähnliche Szene nicht automatisch mit dem falschen
Bewegungs- oder Dialogmoment gleichgesetzt wird.
Nach dem CV-Match kann derselbe Vision-Layer den konkreten finalen Source-
Zeitbereich nochmals gegen den Trailer-Beat prüfen. Starke Aktionsphasen wie
Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch
im Source-Fenster beschrieben sein; fehlt eine dieser Aktionsphasen und bleibt
der semantische Score unter der Prüfschwelle, wird der Treffer nicht
gespeichert, selbst wenn der Low-Level-CV-Score hoch ist. Liegt für Beat oder
Source-Fenster keine Vision-Beschreibung vor, bleibt der Treffer erhalten.
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
+61
View File
@@ -632,6 +632,66 @@ def _merge_best_results(existing: list, candidates: list, cfg) -> list:
return sorted(by_id.values(), key=lambda r: r.beat_id) return sorted(by_id.values(), key=lambda r: r.beat_id)
def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) -> list:
    """Return only the matches whose source windows pass vision verification.

    Each result is checked through ``validate_match_window_with_vision``.
    A result that carries ``segments`` is checked once per segment, against a
    copy of its beat narrowed to that segment's trailer time range; any other
    result is checked once for its overall in/out window.  Checking stops at
    the first failing window, the result is dropped and a warning is logged
    with the reasons collected so far.

    Results whose ``beat_id`` is not found in ``beats`` are kept unchecked.
    When vision is disabled or ``results`` is empty the input list is returned
    unchanged.
    """
    if not cfg.vision.enabled or not results:
        return results

    from dataclasses import replace

    from src.llm.vision_cache import validate_match_window_with_vision

    log = logging.getLogger(__name__)
    beat_lookup = {b.beat_id: b for b in beats}

    def _windows_for(match, beat):
        """Build the (beat, scene_id, in_point_s, out_point_s) checks for one match."""
        if not getattr(match, "segments", ()):
            return [(beat, match.scene_id, match.in_point_s, match.out_point_s)]
        checks = []
        for seg in match.segments:
            # Narrow the beat to the trailer range this segment covers.
            narrowed = replace(
                beat,
                start_s=beat.start_s + seg.trailer_offset_s,
                end_s=beat.start_s + seg.trailer_offset_s + seg.duration_s,
            )
            checks.append((narrowed, seg.scene_id, seg.in_point_s, seg.out_point_s))
        return checks

    accepted = []
    for match in results:
        beat = beat_lookup.get(match.beat_id)
        if beat is None:
            # No beat to compare against: nothing to verify, keep the match.
            accepted.append(match)
            continue

        notes: list[str] = []
        for window_beat, window_scene, window_in, window_out in _windows_for(match, beat):
            passed, note = validate_match_window_with_vision(
                window_beat,
                source_path=match.source_path,
                scene_id=window_scene,
                in_point_s=window_in,
                out_point_s=window_out,
                cfg=cfg,
            )
            notes.append(note)
            if not passed:
                log.warning(
                    "Beat %d: rejected by vision action-phase verification (%s)",
                    match.beat_id,
                    "; ".join(notes),
                )
                break
        else:
            # Every window passed verification.
            accepted.append(match)
    return accepted
def _attach_visual_segments(results: list, beats: list, cfg) -> list: def _attach_visual_segments(results: list, beats: list, cfg) -> list:
"""Attach automatic sub-shot matches for multi-island trailer beats.""" """Attach automatic sub-shot matches for multi-island trailer beats."""
from dataclasses import replace from dataclasses import replace
@@ -976,6 +1036,7 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
skip_global_segment_scan_for=set(single_island_trims), skip_global_segment_scan_for=set(single_island_trims),
) )
results = _attach_visual_segments(results, beats, cfg) results = _attach_visual_segments(results, beats, cfg)
results = _filter_semantically_invalid_vision_matches(results, beats, cfg)
# A targeted one-beat match should improve the cache without deleting # A targeted one-beat match should improve the cache without deleting
# automatic matches for other beats. # automatic matches for other beats.
+111 -3
View File
@@ -50,6 +50,18 @@ _CREDIT_ERROR_PATTERNS = (
"payment required", "payment required",
) )
# Maps an action-group name to the description terms that signal it.  A group
# counts as present in a description when at least one of its terms occurs.
_ACTION_GROUPS = {
    "kiss": {"kiss", "kisses", "kissing", "kissed"},
    # NOTE(review): generic "touch" wording alone also triggers this group.
    "forehead_touch": {"forehead", "foreheads", "touch", "touches", "touching", "touched"},
    "approach": {"approach", "approaches", "approaching", "closer", "lean", "leans", "leaning"},
    "talk": {"talk", "talking", "speak", "speaking", "conversation", "conversing"},
    "hand": {"hand", "hands", "holding", "holds", "raise", "raises", "raising", "lift", "lifting"},
    "cutting": {"cut", "cuts", "cutting", "knife", "blade", "scissors"},
    # NOTE(review): a bare "down" is enough to trigger this group.
    "look_down": {"down", "lowering", "lowers"},
    "turn": {"turn", "turns", "turning"},
}
# Groups treated as mandatory: when a beat description shows one of these, a
# candidate description that lacks it is penalised in the semantic score.
_STRONG_ACTION_GROUPS = {"kiss", "forehead_touch", "approach", "hand", "cutting"}
def _cache_path(cfg: AppConfig) -> Path:
    """Return the path of ``vision_descriptions.json`` in the configured cache directory."""
    cache_dir = cfg.paths.cache_dir
    return cache_dir / "vision_descriptions.json"
@@ -251,6 +263,40 @@ def _text_similarity(a: str, b: str) -> float:
return float(overlap / max(8, min(len(ta), len(tb)))) return float(overlap / max(8, min(len(ta), len(tb))))
def _semantic_action_groups(text: str) -> set[str]:
    """Return the names of the action groups that ``text`` mentions.

    A group is detected when at least one of its trigger words from
    ``_ACTION_GROUPS`` is among ``_terms(text)``.  In addition, a few
    multi-word phrases are looked up in the lowercased raw text and count as
    the "approach" group.
    """
    tokens = _terms(text)
    haystack = text.lower()

    detected: set[str] = set()
    for group_name, trigger_words in _ACTION_GROUPS.items():
        if tokens & trigger_words:
            detected.add(group_name)

    # Phrases that single-word matching cannot catch.
    approach_phrases = ("moving closer", "move closer", "face-to-face", "faces facing")
    if any(phrase in haystack for phrase in approach_phrases):
        detected.add("approach")
    return detected
def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, str]:
    """Score how well ``candidate_desc`` matches ``beat_desc``, action-aware.

    Starts from ``_text_similarity`` and adjusts it by the strong action
    groups found in the beat description:

    * each strong group missing from the candidate costs 0.18, capped at a
      total penalty of 0.45, and the score never drops below 0.0;
    * if the beat has strong groups and the candidate shows all of them, the
      score gains 0.12, capped at 1.0.

    Returns the adjusted score and a one-line summary listing the score and
    the action groups detected on both sides.
    """
    score = _text_similarity(beat_desc, candidate_desc)
    beat_actions = _semantic_action_groups(beat_desc)
    candidate_actions = _semantic_action_groups(candidate_desc)

    must_have = beat_actions & _STRONG_ACTION_GROUPS
    absent = must_have - candidate_actions
    if absent:
        score = max(0.0, score - min(0.45, 0.18 * len(absent)))
    elif must_have:
        score = min(1.0, score + 0.12)

    summary = " ".join(
        (
            f"semantic={score:.3f}",
            f"beat_actions={sorted(beat_actions)}",
            f"candidate_actions={sorted(candidate_actions)}",
        )
    )
    return score, summary
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]: def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
if max_points <= 1 or scene.duration_s <= 0: if max_points <= 1 or scene.duration_s <= 0:
return [scene.start_s] return [scene.start_s]
@@ -340,7 +386,7 @@ def _add_window_seed_descriptions(
) )
if not desc: if not desc:
continue continue
score = _text_similarity(beat_desc, desc) score, _reason = _semantic_match_score(beat_desc, desc)
if score < cfg.vision.similarity_threshold: if score < cfg.vision.similarity_threshold:
continue continue
semantic_score = min(0.99, score + 0.30) semantic_score = min(0.99, score + 0.30)
@@ -427,7 +473,7 @@ def build_vision_seed_in_points(
) )
if not scene_desc: if not scene_desc:
continue continue
score = _text_similarity(beat_desc, scene_desc) score, _reason = _semantic_match_score(beat_desc, scene_desc)
if score >= cfg.vision.similarity_threshold: if score >= cfg.vision.similarity_threshold:
ranked_by_scene[scene.scene_id] = (min(0.99, score + 0.25), scene, "vision") ranked_by_scene[scene.scene_id] = (min(0.99, score + 0.25), scene, "vision")
@@ -440,7 +486,7 @@ def build_vision_seed_in_points(
ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe") ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe")
for scene, scene_desc in _cached_scene_descriptions(cache, scenes_by_id, cfg): for scene, scene_desc in _cached_scene_descriptions(cache, scenes_by_id, cfg):
score = _text_similarity(beat_desc, scene_desc) score, _reason = _semantic_match_score(beat_desc, scene_desc)
if score < cfg.vision.similarity_threshold: if score < cfg.vision.similarity_threshold:
continue continue
semantic_score = min(0.99, score + 0.25) semantic_score = min(0.99, score + 0.25)
@@ -487,3 +533,65 @@ def build_vision_seed_in_points(
_save_cache(cfg, cache) _save_cache(cfg, cache)
return seeds return seeds
def validate_match_window_with_vision(
    beat: TrailerBeat,
    *,
    source_path: Path,
    scene_id: int,
    in_point_s: float,
    out_point_s: float,
    cfg: AppConfig,
) -> tuple[bool, str]:
    """Check a final source window against the trailer beat via vision text.

    Intended as a conservative safety net: even when CV confirms a
    similar-looking shot, the action phase described for the source window
    still has to agree with the one described for the beat.

    Both the beat and the exact source window are described through
    ``_describe_sample`` using the shared cache; at most two new descriptions
    are requested per call (fewer if ``cfg.vision.max_new_descriptions_per_run``
    is lower).

    Returns ``(ok, reason)``.  The window is rejected when it is empty, or
    when a strong action group of the beat is missing from the source
    description and the semantic score is below
    ``max(0.32, cfg.vision.similarity_threshold + 0.12)``.  If vision is
    disabled or either description is unavailable, the window is accepted.
    """
    if not cfg.vision.enabled:
        return True, "vision disabled"
    if in_point_s >= out_point_s:
        return False, "empty source window"

    cache = _load_cache(cfg)
    remaining = [min(2, max(0, cfg.vision.max_new_descriptions_per_run))]
    # Arguments common to both description requests.
    shared = {"cfg": cfg, "cache": cache, "budget": remaining}

    trailer_text = _describe_sample(
        kind="beat",
        item_id=beat.beat_id,
        label=f"trailer beat {beat.beat_id} verification",
        video_path=beat.trailer_path,
        start_s=beat.start_s,
        end_s=beat.end_s,
        **shared,
    )
    if not trailer_text:
        return True, "no beat vision description"

    window_text = _describe_sample(
        kind="match_window",
        item_id=scene_id,
        label=f"matched source scene {scene_id} window {in_point_s:.2f}-{out_point_s:.2f}",
        video_path=source_path,
        start_s=in_point_s,
        end_s=out_point_s,
        **shared,
    )
    _save_cache(cfg, cache)
    if not window_text:
        return True, "no source vision description"

    score, reason = _semantic_match_score(trailer_text, window_text)
    required = _semantic_action_groups(trailer_text) & _STRONG_ACTION_GROUPS
    absent = required - _semantic_action_groups(window_text)
    floor = max(0.32, cfg.vision.similarity_threshold + 0.12)

    rejected = bool(absent) and score < floor
    if not rejected:
        return True, reason
    return False, f"{reason} missing_actions={sorted(absent)}"