Realign wrong in-scene action matches

This commit is contained in:
Melbar
2026-05-02 17:13:22 +02:00
parent 1a177d6b89
commit 3ea5582b49
3 changed files with 178 additions and 1 deletions
+5
View File
@@ -164,6 +164,11 @@ Zeitbereich nochmals gegen den Trailer-Beat prüfen. Starke Aktionsphasen wie
Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch
im Source-Fenster beschrieben sein; fehlt diese Aktionsphase, wird der Treffer
nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist.
Wenn die Szene selbst plausibel ist, aber der konkrete Source-Zeitpunkt diese
Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben
Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet
den Inpoint darauf neu aus. Erst wenn auch diese In-Scene-Reparatur scheitert,
wird der Treffer verworfen.
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
+103 -1
View File
@@ -638,10 +638,50 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
return results
from dataclasses import replace
from src.llm.vision_cache import validate_match_window_with_vision
from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
from src.cv.scene_indexer import build_scene_index
from src.cv.global_scan import align_in_point_by_content
logger = logging.getLogger(__name__)
beats_by_id = {beat.beat_id: beat for beat in beats}
scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
def realign_window(check_beat, scene_id: int):
    """Try to re-anchor ``check_beat`` on an action window inside one scene.

    Looks the scene up in ``scenes_by_id``, asks
    ``find_action_window_in_scene`` for a matching vision window, refines the
    in-point with ``align_in_point_by_content`` and finally re-validates the
    resulting window with ``validate_match_window_with_vision``.

    Returns ``(scene, aligned_in_s, score, reason)`` on success, or ``None``
    when the scene is unknown, no action window was found, or the re-validated
    window is rejected.
    """
    scene = scenes_by_id.get(scene_id)
    if scene is None:
        return None

    action_window = find_action_window_in_scene(check_beat, scene, cfg)
    if action_window is None:
        return None
    window_start_s, window_end_s, semantic_score, reason = action_window

    # Search radius: 1.5x the window length, kept within [1.0, 4.0] seconds.
    search_span_s = min(4.0, (window_end_s - window_start_s) * 1.5)
    search_span_s = max(1.0, search_span_s)
    aligned_in_s, content_score = align_in_point_by_content(
        check_beat,
        window_start_s,
        cfg,
        search_window_s=search_span_s,
    )

    # Clamp the in-point into the scene; the upper bound is the scene end
    # minus the beat duration, but never earlier than the scene start.
    latest_in_s = max(scene.start_s, scene.end_s - check_beat.duration_s)
    aligned_in_s = max(scene.start_s, min(aligned_in_s, latest_in_s))

    ok, verify_reason = validate_match_window_with_vision(
        check_beat,
        source_path=scene.source_path,
        scene_id=scene.scene_id,
        in_point_s=aligned_in_s,
        out_point_s=aligned_in_s + check_beat.duration_s,
        cfg=cfg,
    )
    if ok:
        # The semantic score is capped at 0.99 before being compared with the
        # content score; the larger of the two is reported.
        score = max(content_score, min(0.99, semantic_score))
        return scene, aligned_in_s, score, f"{reason}; {verify_reason}"

    logger.info(
        "Beat %d: action-window realign rejected scene=%d in=%.3fs (%s)",
        check_beat.beat_id,
        scene.scene_id,
        aligned_in_s,
        verify_reason,
    )
    return None
kept = []
for result in results:
beat = beats_by_id.get(result.beat_id)
@@ -684,6 +724,68 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
if valid:
kept.append(result)
else:
if getattr(result, "segments", ()):
new_segments = []
all_repaired = True
repair_reasons = []
for segment in result.segments:
segment_beat = replace(
beat,
start_s=beat.start_s + segment.trailer_offset_s,
end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
)
repair = realign_window(segment_beat, segment.scene_id)
if repair is None:
all_repaired = False
break
scene, aligned_in_s, score, repair_reason = repair
repair_reasons.append(repair_reason)
new_segments.append(replace(
segment,
scene_id=scene.scene_id,
in_point_s=aligned_in_s,
out_point_s=aligned_in_s + segment.duration_s,
match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
))
if all_repaired and new_segments:
first = new_segments[0]
repaired_score = min(seg.match_score for seg in new_segments)
logger.info(
"Beat %d: realigned inside matched scene by vision action windows (%s)",
result.beat_id,
"; ".join(repair_reasons),
)
kept.append(replace(
result,
scene_id=first.scene_id,
in_point_s=first.in_point_s,
out_point_s=first.out_point_s,
in_point_frame=int(first.in_point_s * cfg.export.edl_frame_rate),
match_score=repaired_score,
is_confirmed=repaired_score >= cfg.cv.deep_scan.match_threshold,
segments=tuple(new_segments),
))
continue
else:
repair = realign_window(beat, result.scene_id)
if repair is not None:
scene, aligned_in_s, score, repair_reason = repair
logger.info(
"Beat %d: realigned inside matched scene by vision action window (%s)",
result.beat_id,
repair_reason,
)
kept.append(replace(
result,
scene_id=scene.scene_id,
in_point_s=aligned_in_s,
out_point_s=aligned_in_s + result.duration_s,
in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
))
continue
logger.warning(
"Beat %d: rejected by vision action-phase verification (%s)",
result.beat_id,
+70
View File
@@ -595,3 +595,73 @@ def validate_match_window_with_vision(
if missing_actions and score < threshold:
return False, f"{reason} missing_actions={sorted(missing_actions)}"
return True, reason
def find_action_window_in_scene(
    beat: TrailerBeat,
    scene: Scene,
    cfg: AppConfig,
) -> tuple[float, float, float, str] | None:
    """
    Search one already selected source scene for the beat's action phase.

    This is used after CV picked the right broad scene but the wrong time
    inside that scene. It stays automatic and cached: windows are described
    evenly across the scene until the per-run vision budget is consumed.

    Args:
        beat: Trailer beat whose strong action groups must be present.
        scene: Source scene to search; a non-positive duration yields ``None``.
        cfg: Application config; ``cfg.vision`` supplies the enable flag,
            description budget, seed-point count and similarity threshold.

    Returns:
        ``(start_s, end_s, score, reason)`` of the best-scoring window that
        contains every strong action group of the beat and reaches the
        threshold, or ``None`` if vision is disabled, the beat cannot be
        described, the beat has no strong action groups, or no window
        qualifies. On equal scores the earliest yielded window wins.
    """
    if not cfg.vision.enabled or scene.duration_s <= 0:
        return None

    cache = _load_cache(cfg)
    # One-element list shared by all _describe_sample calls below; presumably
    # decremented in place as new descriptions are produced -- confirm.
    budget = [max(0, cfg.vision.max_new_descriptions_per_run)]

    beat_desc = _describe_sample(
        kind="beat",
        item_id=beat.beat_id,
        label=f"trailer beat {beat.beat_id} action search",
        video_path=beat.trailer_path,
        start_s=beat.start_s,
        end_s=beat.end_s,
        cfg=cfg,
        cache=cache,
        budget=budget,
    )
    if not beat_desc:
        return None

    beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
    if not beat_actions:
        # Persist the cache before leaving: the beat description above may
        # have been newly generated, and skipping the save on this early exit
        # would discard it.
        _save_cache(cfg, cache)
        return None

    max_windows = max(
        cfg.vision.seed_points_per_scene,
        cfg.vision.max_new_descriptions_per_run,
    )
    # Loop-invariant acceptance threshold: the configured similarity threshold
    # plus 0.18, but never below 0.38.
    threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)

    best: tuple[float, float, float, str] | None = None
    for start_s, end_s in _scene_window_ranges(scene, beat, max_windows):
        desc = _describe_sample(
            kind="action_window",
            item_id=scene.scene_id,
            label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
            video_path=scene.source_path,
            start_s=start_s,
            end_s=end_s,
            cfg=cfg,
            cache=cache,
            budget=budget,
        )
        if not desc:
            continue
        score, reason = _semantic_match_score(beat_desc, desc)
        # Every strong action group of the beat must appear in the window.
        if beat_actions - _semantic_action_groups(desc):
            continue
        if score < threshold:
            continue
        # Strict '>' keeps the earliest window on equal scores.
        if best is None or score > best[2]:
            best = (start_s, end_s, score, reason)

    _save_cache(cfg, cache)
    return best