Reject vision matches with action phase mismatches
This commit is contained in:
+111
-3
@@ -50,6 +50,18 @@ _CREDIT_ERROR_PATTERNS = (
|
||||
"payment required",
|
||||
)
|
||||
|
||||
# Action-group name -> word forms that signal it. A group counts as present
# in a description when any of its word forms intersects the description's
# terms (see _semantic_action_groups).
_ACTION_GROUPS = {
    "kiss": {
        "kiss",
        "kisses",
        "kissing",
        "kissed",
    },
    "forehead_touch": {
        "forehead",
        "foreheads",
        "touch",
        "touches",
        "touching",
        "touched",
    },
    "approach": {
        "approach",
        "approaches",
        "approaching",
        "closer",
        "lean",
        "leans",
        "leaning",
    },
    "talk": {
        "talk",
        "talking",
        "speak",
        "speaking",
        "conversation",
        "conversing",
    },
    "hand": {
        "hand",
        "hands",
        "holding",
        "holds",
        "raise",
        "raises",
        "raising",
        "lift",
        "lifting",
    },
    "cutting": {
        "cut",
        "cuts",
        "cutting",
        "knife",
        "blade",
        "scissors",
    },
    "look_down": {
        "down",
        "lowering",
        "lowers",
    },
    "turn": {
        "turn",
        "turns",
        "turning",
    },
}

# Subset of group names that a candidate is expected to share with the beat;
# only these groups feed the penalty/bonus in _semantic_match_score.
_STRONG_ACTION_GROUPS = {
    "kiss",
    "forehead_touch",
    "approach",
    "hand",
    "cutting",
}
|
||||
|
||||
|
||||
def _cache_path(cfg: AppConfig) -> Path:
    """Return the path of ``vision_descriptions.json`` inside ``cfg.paths.cache_dir``."""
    cache_root = cfg.paths.cache_dir
    return cache_root / "vision_descriptions.json"
|
||||
@@ -251,6 +263,40 @@ def _text_similarity(a: str, b: str) -> float:
|
||||
return float(overlap / max(8, min(len(ta), len(tb))))
|
||||
|
||||
|
||||
def _semantic_action_groups(text: str) -> set[str]:
    """Return the names of all action groups that *text* mentions.

    A group from ``_ACTION_GROUPS`` is included when at least one of its word
    forms appears in ``_terms(text)``. In addition, a handful of phrases are
    searched for as plain substrings of the lower-cased text; any hit adds
    the ``"approach"`` group.
    """
    tokens = _terms(text)
    haystack = text.lower()

    matched: set[str] = set()
    for group_name, word_forms in _ACTION_GROUPS.items():
        if tokens & word_forms:
            matched.add(group_name)

    # These cues are multi-word or hyphenated, so they are matched against
    # the raw text rather than the term set (presumably _terms yields single
    # tokens -- confirm against its definition).
    approach_phrases = ("moving closer", "move closer", "face-to-face", "faces facing")
    if any(phrase in haystack for phrase in approach_phrases):
        matched.add("approach")
    return matched
|
||||
|
||||
|
||||
def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, str]:
    """Score *candidate_desc* against *beat_desc*, adjusted for action agreement.

    The base value is ``_text_similarity(beat_desc, candidate_desc)``. The
    beat's action groups that are also in ``_STRONG_ACTION_GROUPS`` are
    treated as required:

    * if any required group is absent from the candidate, 0.18 per absent
      group (at most 0.45 in total) is subtracted, never going below 0.0;
    * if the beat has required groups and the candidate has all of them,
      0.12 is added, never going above 1.0;
    * if the beat has no required groups, the base value is left unchanged.

    Returns:
        ``(score, reason)`` where *reason* is a one-line summary holding the
        adjusted score and the sorted action groups of both descriptions.
    """
    score = _text_similarity(beat_desc, candidate_desc)
    beat_groups = _semantic_action_groups(beat_desc)
    candidate_groups = _semantic_action_groups(candidate_desc)

    must_have = beat_groups & _STRONG_ACTION_GROUPS
    absent = must_have - candidate_groups

    if absent:
        # Penalise each strong action the candidate lacks, with a cap.
        deduction = min(0.45, 0.18 * len(absent))
        score = max(0.0, score - deduction)
    elif must_have:
        # Every strong action of the beat is present in the candidate.
        score = min(1.0, score + 0.12)

    summary = (
        f"semantic={score:.3f} "
        f"beat_actions={sorted(beat_groups)} "
        f"candidate_actions={sorted(candidate_groups)}"
    )
    return score, summary
|
||||
|
||||
|
||||
def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
|
||||
if max_points <= 1 or scene.duration_s <= 0:
|
||||
return [scene.start_s]
|
||||
@@ -340,7 +386,7 @@ def _add_window_seed_descriptions(
|
||||
)
|
||||
if not desc:
|
||||
continue
|
||||
score = _text_similarity(beat_desc, desc)
|
||||
score, _reason = _semantic_match_score(beat_desc, desc)
|
||||
if score < cfg.vision.similarity_threshold:
|
||||
continue
|
||||
semantic_score = min(0.99, score + 0.30)
|
||||
@@ -427,7 +473,7 @@ def build_vision_seed_in_points(
|
||||
)
|
||||
if not scene_desc:
|
||||
continue
|
||||
score = _text_similarity(beat_desc, scene_desc)
|
||||
score, _reason = _semantic_match_score(beat_desc, scene_desc)
|
||||
if score >= cfg.vision.similarity_threshold:
|
||||
ranked_by_scene[scene.scene_id] = (min(0.99, score + 0.25), scene, "vision")
|
||||
|
||||
@@ -440,7 +486,7 @@ def build_vision_seed_in_points(
|
||||
ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe")
|
||||
|
||||
for scene, scene_desc in _cached_scene_descriptions(cache, scenes_by_id, cfg):
|
||||
score = _text_similarity(beat_desc, scene_desc)
|
||||
score, _reason = _semantic_match_score(beat_desc, scene_desc)
|
||||
if score < cfg.vision.similarity_threshold:
|
||||
continue
|
||||
semantic_score = min(0.99, score + 0.25)
|
||||
@@ -487,3 +533,65 @@ def build_vision_seed_in_points(
|
||||
|
||||
_save_cache(cfg, cache)
|
||||
return seeds
|
||||
|
||||
|
||||
def validate_match_window_with_vision(
    beat: TrailerBeat,
    *,
    source_path: Path,
    scene_id: int,
    in_point_s: float,
    out_point_s: float,
    cfg: AppConfig,
) -> tuple[bool, str]:
    """
    Check a final source window against the trailer beat via vision text.

    Intended as a conservative safety net for visually ambiguous low-light
    shots: CV may confirm a similar-looking two-shot, but the action phase
    still has to agree with the beat description.

    The beat and the source window ``[in_point_s, out_point_s]`` are each
    described through ``_describe_sample`` (sharing one loaded cache and one
    budget), and the two descriptions are compared with
    ``_semantic_match_score``.

    Returns:
        ``(accepted, reason)``.

        * ``(True, ...)`` when vision is disabled in ``cfg``, when either
          description is unavailable, or when the comparison does not reject.
        * ``(False, "empty source window")`` when
          ``out_point_s <= in_point_s``.
        * ``(False, ...)`` when the beat has a strong action group that the
          source description lacks *and* the score is below
          ``max(0.32, cfg.vision.similarity_threshold + 0.12)``.
    """
    if not cfg.vision.enabled:
        return True, "vision disabled"
    if out_point_s <= in_point_s:
        return False, "empty source window"

    cache = _load_cache(cfg)
    # Allow at most two new descriptions here (one beat, one source window),
    # or fewer if the configured per-run limit is lower. Held in a list,
    # presumably so _describe_sample can decrement it in place -- confirm.
    budget = [min(2, max(0, cfg.vision.max_new_descriptions_per_run))]
    shared_kwargs = {"cfg": cfg, "cache": cache, "budget": budget}

    beat_desc = _describe_sample(
        kind="beat",
        item_id=beat.beat_id,
        label=f"trailer beat {beat.beat_id} verification",
        video_path=beat.trailer_path,
        start_s=beat.start_s,
        end_s=beat.end_s,
        **shared_kwargs,
    )
    if not beat_desc:
        # Without a beat description there is nothing to compare against.
        return True, "no beat vision description"

    source_desc = _describe_sample(
        kind="match_window",
        item_id=scene_id,
        label=f"matched source scene {scene_id} window {in_point_s:.2f}-{out_point_s:.2f}",
        video_path=source_path,
        start_s=in_point_s,
        end_s=out_point_s,
        **shared_kwargs,
    )
    _save_cache(cfg, cache)
    if not source_desc:
        return True, "no source vision description"

    score, reason = _semantic_match_score(beat_desc, source_desc)
    required_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
    missing_actions = required_actions - _semantic_action_groups(source_desc)
    threshold = max(0.32, cfg.vision.similarity_threshold + 0.12)

    # Reject only when a strong action is missing AND the (already
    # penalised) score is below the stricter verification threshold.
    rejected = bool(missing_actions) and score < threshold
    if rejected:
        return False, f"{reason} missing_actions={sorted(missing_actions)}"
    return True, reason
|
||||
|
||||
Reference in New Issue
Block a user