Trim retimed segments when phase drifts

This commit is contained in:
Melbar
2026-05-02 23:04:41 +02:00
parent e293835a86
commit 2cc05e4737
2 changed files with 42 additions and 15 deletions
+5
View File
@@ -191,6 +191,11 @@ sichtbare Segment begrenzt; der gefundene Source-Inpoint wird dabei um den
Trailer-Offset des Segments verschoben. So geht die globale Aktionsbeschreibung
eines Beats nicht verloren, nur weil der scorebare Teil erst nach einer Blende
beginnt.
Der Segment-Offset zählt dabei nur über vorherige scorebare Bildinseln, nicht
über schwarze oder blendige Lücken. Nach dem Retiming wird die nutzbare
Source-Dauer erneut geschätzt; läuft die Source am Ende in eine sichtbar andere
Aktionsphase, wird der Clip gekürzt und der Rest bleibt Placeholder/Fade statt
einen falschen Bewegungsmoment zu zeigen.
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
+37 -15
View File
@@ -640,12 +640,24 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
from dataclasses import replace from dataclasses import replace
from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
from src.cv.scene_indexer import build_scene_index from src.cv.scene_indexer import build_scene_index
from src.cv.global_scan import align_in_point_by_content_and_motion from src.cv.global_scan import align_in_point_by_content_and_motion, estimate_usable_source_duration
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
beats_by_id = {beat.beat_id: beat for beat in beats} beats_by_id = {beat.beat_id: beat for beat in beats}
scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)} scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
def visible_content_offset(action_beat, segment_start_offset_s: float) -> float:
    """Return the scoreable picture time that precedes ``segment_start_offset_s``.

    The scoreable segments of ``action_beat`` (as yielded by
    ``_reference_scoreable_segments``) are walked in order. Every segment that
    ends at or before the offset contributes its full length; a segment that
    straddles the offset contributes only the part before it. Time between
    segments is never counted, so the result is the offset measured over
    scoreable content only.

    NOTE(review): the walk stops at the first segment that ends after the
    offset, so this presumably relies on the segments being sorted by start
    time -- confirm against ``_reference_scoreable_segments``.
    """
    accumulated_s = 0.0
    for island_start_s, island_end_s in _reference_scoreable_segments(action_beat, cfg):
        # Island lies completely before the offset: count it in full.
        if island_end_s <= segment_start_offset_s:
            accumulated_s += max(0.0, island_end_s - island_start_s)
            continue
        # Island straddles the offset: count only the leading part.
        if island_start_s < segment_start_offset_s:
            accumulated_s += max(0.0, segment_start_offset_s - island_start_s)
        # This island reaches past the offset, so nothing later is considered.
        break
    return accumulated_s
def realign_window(check_beat, scene_id: int, action_beat=None): def realign_window(check_beat, scene_id: int, action_beat=None):
scene = scenes_by_id.get(scene_id) scene = scenes_by_id.get(scene_id)
if scene is None: if scene is None:
@@ -655,9 +667,10 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
return None return None
start_s, end_s, semantic_score, reason = found start_s, end_s, semantic_score, reason = found
if action_beat is not None: if action_beat is not None:
offset_delta_s = max(0.0, check_beat.start_s - action_beat.start_s) segment_start_offset_s = max(0.0, check_beat.start_s - action_beat.start_s)
start_s += offset_delta_s content_offset_s = visible_content_offset(action_beat, segment_start_offset_s)
end_s += offset_delta_s start_s += content_offset_s
end_s += content_offset_s
window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0)) window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0))
aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion( aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion(
check_beat, check_beat,
@@ -666,12 +679,16 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
search_window_s=window_s, search_window_s=window_s,
) )
aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s))) aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s)))
usable_duration_s, usable_score = estimate_usable_source_duration(check_beat, aligned_in_s, cfg)
usable_duration_s = max(0.0, min(check_beat.duration_s, usable_duration_s))
if usable_duration_s < max(0.32, check_beat.duration_s * 0.45):
usable_duration_s = check_beat.duration_s
ok, verify_reason = validate_match_window_with_vision( ok, verify_reason = validate_match_window_with_vision(
check_beat, check_beat,
source_path=scene.source_path, source_path=scene.source_path,
scene_id=scene.scene_id, scene_id=scene.scene_id,
in_point_s=aligned_in_s, in_point_s=aligned_in_s,
out_point_s=aligned_in_s + check_beat.duration_s, out_point_s=aligned_in_s + usable_duration_s,
cfg=cfg, cfg=cfg,
) )
if not ok: if not ok:
@@ -683,8 +700,11 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
verify_reason, verify_reason,
) )
return None return None
score = max(combined_score, min(0.99, semantic_score * 0.70 + motion_score * 0.20 + content_score * 0.10)) score = max(
return scene, aligned_in_s, score, f"{reason}; {verify_reason}" combined_score,
min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08),
)
return scene, aligned_in_s, usable_duration_s, score, f"{reason}; {verify_reason}"
kept = [] kept = []
for result in results: for result in results:
@@ -745,7 +765,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
if repair is None: if repair is None:
new_segments.append(segment) new_segments.append(segment)
continue continue
repair_scene, aligned_in_s, score, repair_reason = repair repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
if abs(aligned_in_s - segment.in_point_s) <= 1.0 / cfg.export.edl_frame_rate: if abs(aligned_in_s - segment.in_point_s) <= 1.0 / cfg.export.edl_frame_rate:
new_segments.append(segment) new_segments.append(segment)
continue continue
@@ -755,7 +775,8 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
segment, segment,
scene_id=repair_scene.scene_id, scene_id=repair_scene.scene_id,
in_point_s=aligned_in_s, in_point_s=aligned_in_s,
out_point_s=aligned_in_s + segment.duration_s, out_point_s=aligned_in_s + usable_duration_s,
duration_s=usable_duration_s,
match_score=score, match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold, is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
)) ))
@@ -783,7 +804,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
if scene is not None and scene.duration_s > max(result.duration_s * 1.6, 6.0): if scene is not None and scene.duration_s > max(result.duration_s * 1.6, 6.0):
repair = realign_window(beat, result.scene_id) repair = realign_window(beat, result.scene_id)
if repair is not None: if repair is not None:
repair_scene, aligned_in_s, score, repair_reason = repair repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
if abs(aligned_in_s - result.in_point_s) > 1.0 / cfg.export.edl_frame_rate: if abs(aligned_in_s - result.in_point_s) > 1.0 / cfg.export.edl_frame_rate:
logger.info( logger.info(
"Beat %d: realigned semantically valid long scene by motion/action window (%s)", "Beat %d: realigned semantically valid long scene by motion/action window (%s)",
@@ -794,7 +815,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
result, result,
scene_id=repair_scene.scene_id, scene_id=repair_scene.scene_id,
in_point_s=aligned_in_s, in_point_s=aligned_in_s,
out_point_s=aligned_in_s + result.duration_s, out_point_s=aligned_in_s + usable_duration_s,
in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate), in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
match_score=score, match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold, is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
@@ -817,13 +838,14 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
if repair is None: if repair is None:
all_repaired = False all_repaired = False
break break
scene, aligned_in_s, score, repair_reason = repair scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
repair_reasons.append(repair_reason) repair_reasons.append(repair_reason)
new_segments.append(replace( new_segments.append(replace(
segment, segment,
scene_id=scene.scene_id, scene_id=scene.scene_id,
in_point_s=aligned_in_s, in_point_s=aligned_in_s,
out_point_s=aligned_in_s + segment.duration_s, out_point_s=aligned_in_s + usable_duration_s,
duration_s=usable_duration_s,
match_score=score, match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold, is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
)) ))
@@ -849,7 +871,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
else: else:
repair = realign_window(beat, result.scene_id) repair = realign_window(beat, result.scene_id)
if repair is not None: if repair is not None:
scene, aligned_in_s, score, repair_reason = repair scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
logger.info( logger.info(
"Beat %d: realigned inside matched scene by vision action window (%s)", "Beat %d: realigned inside matched scene by vision action window (%s)",
result.beat_id, result.beat_id,
@@ -859,7 +881,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
result, result,
scene_id=scene.scene_id, scene_id=scene.scene_id,
in_point_s=aligned_in_s, in_point_s=aligned_in_s,
out_point_s=aligned_in_s + result.duration_s, out_point_s=aligned_in_s + usable_duration_s,
in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate), in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
match_score=score, match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold, is_confirmed=score >= cfg.cv.deep_scan.match_threshold,