Use motion phase for in-scene timing

This commit is contained in:
Melbar
2026-05-02 17:59:18 +02:00
parent 3ea5582b49
commit a5a84a9145
3 changed files with 80 additions and 8 deletions
+7 -2
View File
@@ -137,6 +137,11 @@ Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen
groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets
verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller
als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls.
Zusätzlich wird die Bewegungsphase über Frame-zu-Frame-Differenzen verglichen.
Dadurch kann der Matcher innerhalb derselben Source-Szene unterscheiden, ob
zwei Figuren noch sprechen, sich annähern, bereits im Kontakt sind oder sich
wieder voneinander lösen. Ein optisch ähnlicher Standbild-Treffer reicht damit
nicht mehr aus, wenn der Bewegungsverlauf nicht zur Referenz passt.
Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese
Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst
den Inpoint bestimmt.
@@ -167,8 +172,8 @@ nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist.
Wenn die Szene selbst plausibel ist, aber der konkrete Source-Zeitpunkt diese Wenn die Szene selbst plausibel ist, aber der konkrete Source-Zeitpunkt diese
Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben
Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet
den Inpoint darauf neu aus. Erst wenn auch diese In-Scene-Reparatur scheitert, den Inpoint mit der Motion-Phase-Prüfung darauf neu aus. Erst wenn auch diese
wird der Treffer verworfen. In-Scene-Reparatur scheitert, wird der Treffer verworfen.
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
+9 -3
View File
@@ -640,7 +640,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
from dataclasses import replace from dataclasses import replace
from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
from src.cv.scene_indexer import build_scene_index from src.cv.scene_indexer import build_scene_index
from src.cv.global_scan import align_in_point_by_content from src.cv.global_scan import align_in_point_by_content, align_in_point_by_motion
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
beats_by_id = {beat.beat_id: beat for beat in beats} beats_by_id = {beat.beat_id: beat for beat in beats}
@@ -655,12 +655,18 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
return None return None
start_s, end_s, semantic_score, reason = found start_s, end_s, semantic_score, reason = found
window_s = max(1.0, min(4.0, (end_s - start_s) * 1.5)) window_s = max(1.0, min(4.0, (end_s - start_s) * 1.5))
aligned_in_s, content_score = align_in_point_by_content( motion_in_s, motion_score = align_in_point_by_motion(
check_beat, check_beat,
start_s, start_s,
cfg, cfg,
search_window_s=window_s, search_window_s=window_s,
) )
aligned_in_s, content_score = align_in_point_by_content(
check_beat,
motion_in_s,
cfg,
search_window_s=min(window_s, 0.8),
)
aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s))) aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s)))
ok, verify_reason = validate_match_window_with_vision( ok, verify_reason = validate_match_window_with_vision(
check_beat, check_beat,
@@ -679,7 +685,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
verify_reason, verify_reason,
) )
return None return None
score = max(content_score, min(0.99, semantic_score)) score = max(content_score, min(0.99, semantic_score * 0.75 + motion_score * 0.25))
return scene, aligned_in_s, score, f"{reason}; {verify_reason}" return scene, aligned_in_s, score, f"{reason}; {verify_reason}"
kept = [] kept = []
+64 -3
View File
@@ -827,6 +827,50 @@ def _motion_phase_score(
return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35) return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35)
def align_in_point_by_motion(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
) -> tuple[float, float]:
    """
    Align a candidate in-point by matching the frame-to-frame motion pattern.

    This catches the common failure mode where the right source scene is found,
    but the in-point is a few seconds too early or late inside a repeated
    conversation/action beat.

    Args:
        beat: Trailer beat whose motion templates serve as the reference.
        estimated_in_point_s: Coarse source in-point (seconds) to search around.
        cfg: Application config; supplies the source movie path, the fallback
            frame rate, the default search window and the tie-break delta.
        search_window_s: Half-width of the search window in seconds. ``None``
            uses ``cfg.cv.deep_scan.content_align_window_seconds``.

    Returns:
        ``(in_point_s, motion_score)``. The score is never negative. When fewer
        than two motion templates are available, or no usable frame rate can be
        determined, the estimate is returned unchanged with a score of 0.0.
    """
    motion_templates = _prepare_motion_templates(beat, cfg)
    if len(motion_templates) < 2:
        # A motion pattern needs at least two templates to compare against.
        return estimated_in_point_s, 0.0

    with open_video(cfg.paths.source_movie) as cap:
        # NOTE(review): cap looks like a cv2.VideoCapture, whose get() reports
        # 0.0 for properties the backend cannot provide -- confirm for
        # open_video. Any non-positive, NaN or infinite rate is unusable.
        fps = float(cap.get(cv2.CAP_PROP_FPS))
        if not 0.0 < fps < float("inf"):
            fps = float(cfg.export.edl_frame_rate or 0.0)
        if not 0.0 < fps < float("inf"):
            # Without a valid frame rate the step below would divide by zero
            # or never advance; keep the estimate and report no confidence.
            return estimated_in_point_s, 0.0
        frame_step_s = 1.0 / fps

        window_s = (
            search_window_s
            if search_window_s is not None
            else cfg.cv.deep_scan.content_align_window_seconds
        )
        start_s = max(0.0, estimated_in_point_s - window_s)
        end_s = estimated_in_point_s + window_s
        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta

        best_in = estimated_in_point_s
        best_score = -1.0
        t = start_s
        while t <= end_s:
            score = _motion_phase_score(cap, t, motion_templates, cfg)
            if score > best_score + tie_delta:
                # Clearly better than anything seen so far.
                best_score = score
                best_in = t
            elif (
                score >= best_score - tie_delta
                and abs(t - estimated_in_point_s) < abs(best_in - estimated_in_point_s)
            ):
                # Near-tie: prefer the offset closest to the estimate.
                # best_score is left unchanged here, so the returned score can
                # differ by up to tie_delta from the score at the returned
                # in-point.
                best_in = t
            next_t = round(t + frame_step_s, 6)
            if next_t <= t:
                # Step vanished under the 6-decimal rounding; stop instead of
                # looping forever.
                break
            t = next_t

    # best_score stays at -1.0 when the window was empty; clamp to 0.0.
    return best_in, max(0.0, best_score)
def estimate_usable_source_duration( def estimate_usable_source_duration(
beat: TrailerBeat, beat: TrailerBeat,
in_point_s: float, in_point_s: float,
@@ -1190,6 +1234,7 @@ def run_global_scan(
for _, coarse_score, in_point_s in reranked_candidates[:refine_limit] for _, coarse_score, in_point_s in reranked_candidates[:refine_limit]
] ]
validation_templates = _prepare_validation_templates(b, cfg) validation_templates = _prepare_validation_templates(b, cfg)
motion_templates = _prepare_motion_templates(b, cfg)
logger.info( logger.info(
'Beat %d: content-reranked top %d / %d candidates.', 'Beat %d: content-reranked top %d / %d candidates.',
b.beat_id, b.beat_id,
@@ -1270,6 +1315,16 @@ def run_global_scan(
if matchable_duration_s > 0 else 0.0 if matchable_duration_s > 0 else 0.0
) )
motion_score = 0.0
if len(motion_templates) >= 2:
with open_video(cfg.paths.source_movie) as motion_cap:
motion_score = _motion_phase_score(
motion_cap,
adjusted_in_s,
motion_templates,
cfg,
)
if is_weighted_seed_candidate and scene is not None and content_score >= content_gate: if is_weighted_seed_candidate and scene is not None and content_score >= content_gate:
contiguous_usable_s = _contiguous_scene_coverage_duration( contiguous_usable_s = _contiguous_scene_coverage_duration(
b, b,
@@ -1299,11 +1354,15 @@ def run_global_scan(
final_score * (1.0 - scan_cfg.content_validation_weight) final_score * (1.0 - scan_cfg.content_validation_weight)
+ content_score * scan_cfg.content_validation_weight + content_score * scan_cfg.content_validation_weight
) )
if len(motion_templates) >= 2:
motion_score_clamped = max(0.0, min(1.0, motion_score))
final_score = final_score * 0.82 + motion_score_clamped * 0.18
if is_weighted_seed_candidate: if is_weighted_seed_candidate:
vision_provisional_score = ( vision_provisional_score = (
content_score * 0.55 content_score * 0.45
+ duration_coverage * 0.33 + duration_coverage * 0.33
+ coarse_score * 0.12 + coarse_score * 0.12
+ max(0.0, min(1.0, motion_score)) * 0.10
) )
final_score = max(final_score, vision_provisional_score) final_score = max(final_score, vision_provisional_score)
if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate: if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate:
@@ -1332,7 +1391,7 @@ def run_global_scan(
if duration_coverage < scan_cfg.min_duration_coverage: if duration_coverage < scan_cfg.min_duration_coverage:
rejected_short_candidates += 1 rejected_short_candidates += 1
logger.debug( logger.debug(
'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f', 'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
b.beat_id, b.beat_id,
adjusted_in_s, adjusted_in_s,
scene.scene_id if scene is not None else 'none', scene.scene_id if scene is not None else 'none',
@@ -1340,6 +1399,7 @@ def run_global_scan(
span_score, span_score,
coarse_score, coarse_score,
content_score, content_score,
motion_score,
duration_coverage, duration_coverage,
final_score, final_score,
) )
@@ -1364,7 +1424,7 @@ def run_global_scan(
continue continue
logger.debug( logger.debug(
'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f', 'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
b.beat_id, b.beat_id,
adjusted_in_s, adjusted_in_s,
scene.scene_id if scene is not None else 'none', scene.scene_id if scene is not None else 'none',
@@ -1372,6 +1432,7 @@ def run_global_scan(
span_score, span_score,
coarse_score, coarse_score,
content_score, content_score,
motion_score,
duration_coverage, duration_coverage,
final_score, final_score,
) )