Compare commits

..

2 Commits

Author SHA1 Message Date
Melbar a5a84a9145 Use motion phase for in-scene timing 2026-05-02 17:59:18 +02:00
Melbar 3ea5582b49 Realign wrong in-scene action matches 2026-05-02 17:13:22 +02:00
4 changed files with 253 additions and 4 deletions
+10
View File
@@ -137,6 +137,11 @@ Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen
groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets
verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller
als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls.
Zusätzlich wird die Bewegungsphase über Frame-zu-Frame-Differenzen verglichen.
Dadurch kann der Matcher innerhalb derselben Source-Szene unterscheiden, ob
zwei Figuren noch sprechen, sich annähern, bereits im Kontakt sind oder sich
wieder voneinander lösen. Ein optisch ähnlicher Standbild-Treffer reicht damit
nicht mehr aus, wenn der Bewegungsverlauf nicht zur Referenz passt.
Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese
Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst
den Inpoint bestimmt.
@@ -164,6 +169,11 @@ Zeitbereich nochmals gegen den Trailer-Beat prüfen. Starke Aktionsphasen wie
Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch
im Source-Fenster beschrieben sein; fehlt diese Aktionsphase, wird der Treffer
nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist.
Wenn die Szene selbst plausibel ist, aber der konkrete Source-Zeitpunkt diese
Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben
Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet
den Inpoint mit der Motion-Phase-Prüfung darauf neu aus. Erst wenn auch diese
In-Scene-Reparatur scheitert, wird der Treffer verworfen.
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
+109 -1
View File
@@ -638,10 +638,56 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
return results
from dataclasses import replace
from src.llm.vision_cache import validate_match_window_with_vision
from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
from src.cv.scene_indexer import build_scene_index
from src.cv.global_scan import align_in_point_by_content, align_in_point_by_motion
logger = logging.getLogger(__name__)
beats_by_id = {beat.beat_id: beat for beat in beats}
scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
def realign_window(check_beat, scene_id: int):
    """
    Try to move a rejected match onto the beat's action phase inside one scene.

    The scene is looked up in ``scenes_by_id``; a vision action window is
    requested from ``find_action_window_in_scene``; the in-point is then
    aligned first by motion and afterwards by content, clamped to the scene,
    and finally re-validated with ``validate_match_window_with_vision``.

    Returns ``(scene, aligned_in_s, score, reason)`` on success, or ``None``
    when the scene is unknown, no action window is found, or the re-validation
    fails.
    """
    target_scene = scenes_by_id.get(scene_id)
    action_window = (
        find_action_window_in_scene(check_beat, target_scene, cfg)
        if target_scene is not None
        else None
    )
    if action_window is None:
        return None

    win_start_s, win_end_s, semantic_score, window_reason = action_window

    # Search radius: 1.5x the action-window length, kept within 1..4 seconds.
    radius_s = (win_end_s - win_start_s) * 1.5
    radius_s = max(1.0, min(4.0, radius_s))

    motion_in_s, motion_score = align_in_point_by_motion(
        check_beat,
        win_start_s,
        cfg,
        search_window_s=radius_s,
    )
    # Content alignment only fine-tunes around the motion result (<= 0.8 s).
    refined_in_s, content_score = align_in_point_by_content(
        check_beat,
        motion_in_s,
        cfg,
        search_window_s=min(radius_s, 0.8),
    )

    # Keep the in-point at or after the scene start and, when the scene is
    # long enough, early enough for the beat duration to end by scene.end_s.
    latest_in_s = max(target_scene.start_s, target_scene.end_s - check_beat.duration_s)
    refined_in_s = max(target_scene.start_s, min(refined_in_s, latest_in_s))

    accepted, verify_reason = validate_match_window_with_vision(
        check_beat,
        source_path=target_scene.source_path,
        scene_id=target_scene.scene_id,
        in_point_s=refined_in_s,
        out_point_s=refined_in_s + check_beat.duration_s,
        cfg=cfg,
    )
    if accepted:
        # Semantic/motion blend is capped at 0.99; content score may exceed it.
        blended = semantic_score * 0.75 + motion_score * 0.25
        final_score = max(content_score, min(0.99, blended))
        return target_scene, refined_in_s, final_score, f"{window_reason}; {verify_reason}"

    logger.info(
        "Beat %d: action-window realign rejected scene=%d in=%.3fs (%s)",
        check_beat.beat_id,
        target_scene.scene_id,
        refined_in_s,
        verify_reason,
    )
    return None
kept = []
for result in results:
beat = beats_by_id.get(result.beat_id)
@@ -684,6 +730,68 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
if valid:
kept.append(result)
else:
if getattr(result, "segments", ()):
new_segments = []
all_repaired = True
repair_reasons = []
for segment in result.segments:
segment_beat = replace(
beat,
start_s=beat.start_s + segment.trailer_offset_s,
end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
)
repair = realign_window(segment_beat, segment.scene_id)
if repair is None:
all_repaired = False
break
scene, aligned_in_s, score, repair_reason = repair
repair_reasons.append(repair_reason)
new_segments.append(replace(
segment,
scene_id=scene.scene_id,
in_point_s=aligned_in_s,
out_point_s=aligned_in_s + segment.duration_s,
match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
))
if all_repaired and new_segments:
first = new_segments[0]
repaired_score = min(seg.match_score for seg in new_segments)
logger.info(
"Beat %d: realigned inside matched scene by vision action windows (%s)",
result.beat_id,
"; ".join(repair_reasons),
)
kept.append(replace(
result,
scene_id=first.scene_id,
in_point_s=first.in_point_s,
out_point_s=first.out_point_s,
in_point_frame=int(first.in_point_s * cfg.export.edl_frame_rate),
match_score=repaired_score,
is_confirmed=repaired_score >= cfg.cv.deep_scan.match_threshold,
segments=tuple(new_segments),
))
continue
else:
repair = realign_window(beat, result.scene_id)
if repair is not None:
scene, aligned_in_s, score, repair_reason = repair
logger.info(
"Beat %d: realigned inside matched scene by vision action window (%s)",
result.beat_id,
repair_reason,
)
kept.append(replace(
result,
scene_id=scene.scene_id,
in_point_s=aligned_in_s,
out_point_s=aligned_in_s + result.duration_s,
in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
))
continue
logger.warning(
"Beat %d: rejected by vision action-phase verification (%s)",
result.beat_id,
+64 -3
View File
@@ -827,6 +827,50 @@ def _motion_phase_score(
return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35)
def align_in_point_by_motion(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
) -> tuple[float, float]:
    """
    Refine a candidate in-point by comparing frame-to-frame motion patterns.

    Targets the case where the correct source scene was found but the
    in-point sits a few seconds early or late inside a repeated
    conversation/action beat.

    Args:
        beat: Trailer beat whose motion templates are matched.
        estimated_in_point_s: Coarse in-point (seconds) to search around.
        cfg: Application config; supplies the source movie path, the fallback
            frame rate and the deep-scan tuning values.
        search_window_s: Half-width of the search range in seconds. When
            ``None``, ``cfg.cv.deep_scan.content_align_window_seconds`` is used.

    Returns:
        ``(in_point_s, score)``. If fewer than two motion templates are
        available the estimate is returned unchanged with score ``0.0``.
        The score is never negative.
    """
    templates = _prepare_motion_templates(beat, cfg)
    if len(templates) < 2:
        return estimated_in_point_s, 0.0

    with open_video(cfg.paths.source_movie) as cap:
        fps = float(cap.get(cv2.CAP_PROP_FPS))
        if not fps:
            # A reported rate of 0 falls back to the export frame rate.
            fps = cfg.export.edl_frame_rate
        step_s = 1.0 / fps

        if search_window_s is None:
            radius_s = cfg.cv.deep_scan.content_align_window_seconds
        else:
            radius_s = search_window_s

        scan_from_s = max(0.0, estimated_in_point_s - radius_s)
        scan_to_s = estimated_in_point_s + radius_s
        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta

        chosen_in_s = estimated_in_point_s
        top_score = -1.0
        cursor_s = scan_from_s
        while cursor_s <= scan_to_s:
            candidate_score = _motion_phase_score(cap, cursor_s, templates, cfg)
            if candidate_score > top_score + tie_delta:
                # Clearly better: take both the score and the position.
                top_score, chosen_in_s = candidate_score, cursor_s
            elif candidate_score >= top_score - tie_delta:
                # Near tie: prefer the position closer to the estimate, but
                # keep the recorded top score untouched.
                if abs(cursor_s - estimated_in_point_s) < abs(chosen_in_s - estimated_in_point_s):
                    chosen_in_s = cursor_s
            cursor_s = round(cursor_s + step_s, 6)

    return chosen_in_s, max(0.0, top_score)
def estimate_usable_source_duration(
beat: TrailerBeat,
in_point_s: float,
@@ -1190,6 +1234,7 @@ def run_global_scan(
for _, coarse_score, in_point_s in reranked_candidates[:refine_limit]
]
validation_templates = _prepare_validation_templates(b, cfg)
motion_templates = _prepare_motion_templates(b, cfg)
logger.info(
'Beat %d: content-reranked top %d / %d candidates.',
b.beat_id,
@@ -1270,6 +1315,16 @@ def run_global_scan(
if matchable_duration_s > 0 else 0.0
)
motion_score = 0.0
if len(motion_templates) >= 2:
with open_video(cfg.paths.source_movie) as motion_cap:
motion_score = _motion_phase_score(
motion_cap,
adjusted_in_s,
motion_templates,
cfg,
)
if is_weighted_seed_candidate and scene is not None and content_score >= content_gate:
contiguous_usable_s = _contiguous_scene_coverage_duration(
b,
@@ -1299,11 +1354,15 @@ def run_global_scan(
final_score * (1.0 - scan_cfg.content_validation_weight)
+ content_score * scan_cfg.content_validation_weight
)
if len(motion_templates) >= 2:
motion_score_clamped = max(0.0, min(1.0, motion_score))
final_score = final_score * 0.82 + motion_score_clamped * 0.18
if is_weighted_seed_candidate:
vision_provisional_score = (
content_score * 0.55
content_score * 0.45
+ duration_coverage * 0.33
+ coarse_score * 0.12
+ max(0.0, min(1.0, motion_score)) * 0.10
)
final_score = max(final_score, vision_provisional_score)
if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate:
@@ -1332,7 +1391,7 @@ def run_global_scan(
if duration_coverage < scan_cfg.min_duration_coverage:
rejected_short_candidates += 1
logger.debug(
'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
b.beat_id,
adjusted_in_s,
scene.scene_id if scene is not None else 'none',
@@ -1340,6 +1399,7 @@ def run_global_scan(
span_score,
coarse_score,
content_score,
motion_score,
duration_coverage,
final_score,
)
@@ -1364,7 +1424,7 @@ def run_global_scan(
continue
logger.debug(
'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
b.beat_id,
adjusted_in_s,
scene.scene_id if scene is not None else 'none',
@@ -1372,6 +1432,7 @@ def run_global_scan(
span_score,
coarse_score,
content_score,
motion_score,
duration_coverage,
final_score,
)
+70
View File
@@ -595,3 +595,73 @@ def validate_match_window_with_vision(
if missing_actions and score < threshold:
return False, f"{reason} missing_actions={sorted(missing_actions)}"
return True, reason
def find_action_window_in_scene(
    beat: TrailerBeat,
    scene: Scene,
    cfg: AppConfig,
) -> tuple[float, float, float, str] | None:
    """
    Search one already selected source scene for the beat's action phase.

    This is used after CV picked the right broad scene but the wrong time
    inside that scene. The trailer beat is described once, then candidate
    windows produced by ``_scene_window_ranges`` are described and compared
    against it. Descriptions go through the shared cache and the per-run
    budget of ``cfg.vision.max_new_descriptions_per_run``.

    Args:
        beat: Trailer beat whose strong action groups must be found.
        scene: Source scene to search inside.
        cfg: Application config (``cfg.vision`` settings are read here).

    Returns:
        ``(start_s, end_s, score, reason)`` of the best-scoring window that
        contains every strong action group of the beat and meets the score
        threshold, or ``None`` when vision is disabled, the scene has no
        duration, the beat cannot be described, the beat has no strong
        action group, or no window qualifies.
    """
    if not cfg.vision.enabled or scene.duration_s <= 0:
        return None

    cache = _load_cache(cfg)
    # Single-element list; presumably lets _describe_sample decrement the
    # remaining budget in place -- confirm against its implementation.
    budget = [max(0, cfg.vision.max_new_descriptions_per_run)]

    beat_desc = _describe_sample(
        kind="beat",
        item_id=beat.beat_id,
        label=f"trailer beat {beat.beat_id} action search",
        video_path=beat.trailer_path,
        start_s=beat.start_s,
        end_s=beat.end_s,
        cfg=cfg,
        cache=cache,
        budget=budget,
    )
    if not beat_desc:
        return None

    beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
    if not beat_actions:
        # The beat description may have just been generated; persist it so
        # the work is not repeated on the next run.
        _save_cache(cfg, cache)
        return None

    max_windows = max(
        cfg.vision.seed_points_per_scene,
        cfg.vision.max_new_descriptions_per_run,
    )
    # Loop-invariant acceptance bar: stricter than the plain similarity
    # threshold, and never below 0.38.
    threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)

    best: tuple[float, float, float, str] | None = None
    for start_s, end_s in _scene_window_ranges(scene, beat, max_windows):
        desc = _describe_sample(
            kind="action_window",
            item_id=scene.scene_id,
            label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
            video_path=scene.source_path,
            start_s=start_s,
            end_s=end_s,
            cfg=cfg,
            cache=cache,
            budget=budget,
        )
        if not desc:
            continue

        score, reason = _semantic_match_score(beat_desc, desc)
        # Every strong action of the beat must appear in the source window.
        if beat_actions - _semantic_action_groups(desc):
            continue
        if score < threshold:
            continue
        # Strict '>' keeps the earliest window among equal scores.
        if best is None or score > best[2]:
            best = (start_s, end_s, score, reason)

    _save_cache(cfg, cache)
    return best