Use motion phase for in-scene timing

Realign wrong in-scene action matches
2026-05-02 17:59:18 +02:00 · 2026-05-02 17:13:22 +02:00
4 changed files with 253 additions and 4 deletions
@@ -137,6 +137,11 @@ Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen
 groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets
 verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller
 als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls.
 Zusätzlich wird die Bewegungsphase über Frame-zu-Frame-Differenzen verglichen.
 Dadurch kann der Matcher innerhalb derselben Source-Szene unterscheiden, ob
 zwei Figuren noch sprechen, sich annähern, bereits im Kontakt sind oder sich
 wieder voneinander lösen. Ein optisch ähnlicher Standbild-Treffer reicht damit
 nicht mehr aus, wenn der Bewegungsverlauf nicht zur Referenz passt.
 Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese
 Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst
 den Inpoint bestimmt.
@@ -164,6 +169,11 @@ Zeitbereich nochmals gegen den Trailer-Beat prüfen. Starke Aktionsphasen wie
 Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch
 im Source-Fenster beschrieben sein; fehlt diese Aktionsphase, wird der Treffer
 nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist.
 Wenn die Szene selbst plausibel ist, aber der konkrete Source-Zeitpunkt diese
 Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben
 Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet
 den Inpoint mit der Motion-Phase-Prüfung darauf neu aus. Erst wenn auch diese
 In-Scene-Reparatur scheitert, wird der Treffer verworfen.
 Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
 FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
 Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
@@ -638,10 +638,56 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
        return results
    from dataclasses import replace
-    from src.llm.vision_cache import validate_match_window_with_vision
+    from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
    from src.cv.scene_indexer import build_scene_index
    from src.cv.global_scan import align_in_point_by_content, align_in_point_by_motion
    logger = logging.getLogger(__name__)
    beats_by_id = {beat.beat_id: beat for beat in beats}
    scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
    def realign_window(check_beat, scene_id: int):
        """
        Try to move a rejected match onto the beat's action phase inside one scene.

        The scene is looked up in ``scenes_by_id``; ``find_action_window_in_scene``
        proposes a window, whose start is refined first by
        ``align_in_point_by_motion`` and then by ``align_in_point_by_content``.
        The refined in-point is clamped into the scene and re-checked with
        ``validate_match_window_with_vision``.

        Returns:
            ``(scene, aligned_in_s, score, reason)`` on success, or ``None``
            when the scene id is unknown, no action window is found, or the
            vision re-check rejects the aligned window.
        """
        scene = scenes_by_id.get(scene_id)
        action_window = (
            find_action_window_in_scene(check_beat, scene, cfg)
            if scene is not None
            else None
        )
        if action_window is None:
            return None
        window_start_s, window_end_s, semantic_score, reason = action_window

        # Search radius: 1.5x the action-window length, kept within 1.0-4.0 s.
        search_radius_s = max(1.0, min(4.0, (window_end_s - window_start_s) * 1.5))
        motion_in_s, motion_score = align_in_point_by_motion(
            check_beat,
            window_start_s,
            cfg,
            search_window_s=search_radius_s,
        )
        content_in_s, content_score = align_in_point_by_content(
            check_beat,
            motion_in_s,
            cfg,
            search_window_s=min(search_radius_s, 0.8),
        )

        # Keep the in-point inside the scene: not before its start and, where
        # the scene is long enough, early enough for the beat duration to fit.
        latest_in_s = max(scene.start_s, scene.end_s - check_beat.duration_s)
        aligned_in_s = max(scene.start_s, min(content_in_s, latest_in_s))

        ok, verify_reason = validate_match_window_with_vision(
            check_beat,
            source_path=scene.source_path,
            scene_id=scene.scene_id,
            in_point_s=aligned_in_s,
            out_point_s=aligned_in_s + check_beat.duration_s,
            cfg=cfg,
        )
        if ok:
            # Semantic/motion blend is capped at 0.99; the content score wins
            # whenever it is higher than that blend.
            blended_score = semantic_score * 0.75 + motion_score * 0.25
            score = max(content_score, min(0.99, blended_score))
            return scene, aligned_in_s, score, f"{reason}; {verify_reason}"

        logger.info(
            "Beat %d: action-window realign rejected scene=%d in=%.3fs (%s)",
            check_beat.beat_id,
            scene.scene_id,
            aligned_in_s,
            verify_reason,
        )
        return None
    kept = []
    for result in results:
        beat = beats_by_id.get(result.beat_id)
@@ -684,6 +730,68 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
        if valid:
            kept.append(result)
        else:
            if getattr(result, "segments", ()):
                new_segments = []
                all_repaired = True
                repair_reasons = []
                for segment in result.segments:
                    segment_beat = replace(
                        beat,
                        start_s=beat.start_s + segment.trailer_offset_s,
                        end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
                    )
                    repair = realign_window(segment_beat, segment.scene_id)
                    if repair is None:
                        all_repaired = False
                        break
                    scene, aligned_in_s, score, repair_reason = repair
                    repair_reasons.append(repair_reason)
                    new_segments.append(replace(
                        segment,
                        scene_id=scene.scene_id,
                        in_point_s=aligned_in_s,
                        out_point_s=aligned_in_s + segment.duration_s,
                        match_score=score,
                        is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
                    ))
                if all_repaired and new_segments:
                    first = new_segments[0]
                    repaired_score = min(seg.match_score for seg in new_segments)
                    logger.info(
                        "Beat %d: realigned inside matched scene by vision action windows (%s)",
                        result.beat_id,
                        "; ".join(repair_reasons),
                    )
                    kept.append(replace(
                        result,
                        scene_id=first.scene_id,
                        in_point_s=first.in_point_s,
                        out_point_s=first.out_point_s,
                        in_point_frame=int(first.in_point_s * cfg.export.edl_frame_rate),
                        match_score=repaired_score,
                        is_confirmed=repaired_score >= cfg.cv.deep_scan.match_threshold,
                        segments=tuple(new_segments),
                    ))
                    continue
            else:
                repair = realign_window(beat, result.scene_id)
                if repair is not None:
                    scene, aligned_in_s, score, repair_reason = repair
                    logger.info(
                        "Beat %d: realigned inside matched scene by vision action window (%s)",
                        result.beat_id,
                        repair_reason,
                    )
                    kept.append(replace(
                        result,
                        scene_id=scene.scene_id,
                        in_point_s=aligned_in_s,
                        out_point_s=aligned_in_s + result.duration_s,
                        in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
                        match_score=score,
                        is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
                    ))
                    continue
            logger.warning(
                "Beat %d: rejected by vision action-phase verification (%s)",
                result.beat_id,
@@ -827,6 +827,50 @@ def _motion_phase_score(
    return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35)
 def align_in_point_by_motion(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
 ) -> tuple[float, float]:
    """
    Refine a source in-point by comparing frame-to-frame motion patterns.

    Candidate in-points are visited ``1 / fps`` seconds apart between
    ``estimated_in_point_s - window`` (never below 0.0) and
    ``estimated_in_point_s + window``; each one is scored with
    ``_motion_phase_score`` against the beat's motion templates.

    A candidate becomes the new pick when its score exceeds the recorded best
    score by more than the configured tie-break delta. A candidate scoring no
    lower than the recorded best minus that delta only replaces the pick when
    it lies closer to the original estimate; the recorded best score is left
    untouched in that case.

    Args:
        beat: Trailer beat whose motion templates are matched.
        estimated_in_point_s: Coarse source in-point to search around.
        cfg: Application configuration (source path, frame rate fallback,
            window and tie-break settings are read from it).
        search_window_s: Half-width of the search range in seconds; when
            ``None`` the configured ``content_align_window_seconds`` is used.

    Returns:
        ``(in_point_s, score)``. With fewer than two motion templates the
        estimate is returned unchanged with score 0.0; otherwise the score is
        the recorded best score, floored at 0.0.
    """
    templates = _prepare_motion_templates(beat, cfg)
    if len(templates) < 2:
        return estimated_in_point_s, 0.0

    with open_video(cfg.paths.source_movie) as cap:
        # A reported frame rate of 0 falls back to the export frame rate.
        fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate
        step_s = 1.0 / fps
        if search_window_s is None:
            half_window_s = cfg.cv.deep_scan.content_align_window_seconds
        else:
            half_window_s = search_window_s
        scan_from_s = max(0.0, estimated_in_point_s - half_window_s)
        scan_to_s = estimated_in_point_s + half_window_s
        tie_margin = cfg.cv.deep_scan.start_tie_break_score_delta

        chosen_in_s = estimated_in_point_s
        top_score = -1.0
        candidate_s = scan_from_s
        while candidate_s <= scan_to_s:
            candidate_score = _motion_phase_score(cap, candidate_s, templates, cfg)
            if candidate_score > top_score + tie_margin:
                top_score, chosen_in_s = candidate_score, candidate_s
            elif candidate_score >= top_score - tie_margin:
                # Near-tie: prefer the candidate nearer to the coarse estimate.
                candidate_gap = abs(candidate_s - estimated_in_point_s)
                chosen_gap = abs(chosen_in_s - estimated_in_point_s)
                if candidate_gap < chosen_gap:
                    chosen_in_s = candidate_s
            candidate_s = round(candidate_s + step_s, 6)

    return chosen_in_s, max(0.0, top_score)
 def estimate_usable_source_duration(
    beat: TrailerBeat,
    in_point_s: float,
@@ -1190,6 +1234,7 @@ def run_global_scan(
                for _, coarse_score, in_point_s in reranked_candidates[:refine_limit]
            ]
            validation_templates = _prepare_validation_templates(b, cfg)
            motion_templates = _prepare_motion_templates(b, cfg)
            logger.info(
                'Beat %d: content-reranked top %d / %d candidates.',
                b.beat_id,
@@ -1270,6 +1315,16 @@ def run_global_scan(
                            if matchable_duration_s > 0 else 0.0
                        )
                motion_score = 0.0
                if len(motion_templates) >= 2:
                    with open_video(cfg.paths.source_movie) as motion_cap:
                        motion_score = _motion_phase_score(
                            motion_cap,
                            adjusted_in_s,
                            motion_templates,
                            cfg,
                        )
                if is_weighted_seed_candidate and scene is not None and content_score >= content_gate:
                    contiguous_usable_s = _contiguous_scene_coverage_duration(
                        b,
@@ -1299,11 +1354,15 @@ def run_global_scan(
                    final_score * (1.0 - scan_cfg.content_validation_weight)
                    + content_score * scan_cfg.content_validation_weight
                )
                if len(motion_templates) >= 2:
                    motion_score_clamped = max(0.0, min(1.0, motion_score))
                    final_score = final_score * 0.82 + motion_score_clamped * 0.18
                if is_weighted_seed_candidate:
                    vision_provisional_score = (
-                        content_score * 0.55
+                        content_score * 0.45
                        + duration_coverage * 0.33
                        + coarse_score * 0.12
                        + max(0.0, min(1.0, motion_score)) * 0.10
                    )
                    final_score = max(final_score, vision_provisional_score)
                if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate:
@@ -1332,7 +1391,7 @@ def run_global_scan(
                if duration_coverage < scan_cfg.min_duration_coverage:
                    rejected_short_candidates += 1
                    logger.debug(
-                        'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
+                        'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
                        b.beat_id,
                        adjusted_in_s,
                        scene.scene_id if scene is not None else 'none',
@@ -1340,6 +1399,7 @@ def run_global_scan(
                        span_score,
                        coarse_score,
                        content_score,
                        motion_score,
                        duration_coverage,
                        final_score,
                    )
@@ -1364,7 +1424,7 @@ def run_global_scan(
                    continue
                logger.debug(
-                    'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
+                    'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
                    b.beat_id,
                    adjusted_in_s,
                    scene.scene_id if scene is not None else 'none',
@@ -1372,6 +1432,7 @@ def run_global_scan(
                    span_score,
                    coarse_score,
                    content_score,
                    motion_score,
                    duration_coverage,
                    final_score,
                )
@@ -595,3 +595,73 @@ def validate_match_window_with_vision(
    if missing_actions and score < threshold:
        return False, f"{reason} missing_actions={sorted(missing_actions)}"
    return True, reason
 def find_action_window_in_scene(
    beat: TrailerBeat,
    scene: Scene,
    cfg: AppConfig,
 ) -> tuple[float, float, float, str] | None:
    """
    Search one already selected source scene for the beat's action phase.

    This is used after CV picked the right broad scene but the wrong time
    inside that scene. The trailer beat is described once, then windows
    produced by ``_scene_window_ranges`` are described and compared against
    it. A window qualifies only when it contains every strong action group of
    the beat and its semantic score reaches
    ``max(0.38, cfg.vision.similarity_threshold + 0.18)``.

    The description budget is created fresh on every call from
    ``cfg.vision.max_new_descriptions_per_run`` and handed to
    ``_describe_sample``; it is not shared between calls of this function.

    Args:
        beat: Trailer beat whose action phase is searched for.
        scene: Source scene to search inside.
        cfg: Application configuration (``cfg.vision`` settings are read).

    Returns:
        ``(start_s, end_s, score, reason)`` of the highest scoring qualifying
        window (the first one wins on equal scores), or ``None`` when vision
        is disabled, the scene has no positive duration, the beat cannot be
        described, the beat has no strong action group, or no window
        qualifies.
    """
    if not cfg.vision.enabled or scene.duration_s <= 0:
        return None

    cache = _load_cache(cfg)
    budget = [max(0, cfg.vision.max_new_descriptions_per_run)]
    beat_desc = _describe_sample(
        kind="beat",
        item_id=beat.beat_id,
        label=f"trailer beat {beat.beat_id} action search",
        video_path=beat.trailer_path,
        start_s=beat.start_s,
        end_s=beat.end_s,
        cfg=cfg,
        cache=cache,
        budget=budget,
    )
    if not beat_desc:
        return None
    beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
    if not beat_actions:
        # The beat description may have been newly generated above; persist it
        # so it is not requested again on the next call.
        # NOTE(review): assumes _describe_sample only updates the in-memory
        # cache and relies on the caller to save it - confirm.
        _save_cache(cfg, cache)
        return None

    max_windows = max(
        cfg.vision.seed_points_per_scene,
        cfg.vision.max_new_descriptions_per_run,
    )
    # Loop-invariant acceptance threshold, computed once.
    threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
    best: tuple[float, float, float, str] | None = None
    for start_s, end_s in _scene_window_ranges(scene, beat, max_windows):
        desc = _describe_sample(
            kind="action_window",
            item_id=scene.scene_id,
            label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
            video_path=scene.source_path,
            start_s=start_s,
            end_s=end_s,
            cfg=cfg,
            cache=cache,
            budget=budget,
        )
        if not desc:
            continue
        score, reason = _semantic_match_score(beat_desc, desc)
        # Every strong action of the beat must appear in the source window.
        if beat_actions - _semantic_action_groups(desc):
            continue
        if score < threshold:
            continue
        # Strict comparison keeps the earliest window on equal scores.
        if best is None or score > best[2]:
            best = (start_s, end_s, score, reason)
    _save_cache(cfg, cache)
    return best
Author	SHA1	Message	Date
Melbar	a5a84a9145	Use motion phase for in-scene timing	2026-05-02 17:59:18 +02:00
Melbar	3ea5582b49	Realign wrong in-scene action matches	2026-05-02 17:13:22 +02:00