Trim retimed segments when phase drifts

This commit is contained in:
Melbar
2026-05-02 23:04:41 +02:00
parent e293835a86
commit 2cc05e4737
2 changed files with 42 additions and 15 deletions
+5
View File
@@ -191,6 +191,11 @@ sichtbare Segment begrenzt; der gefundene Source-Inpoint wird dabei um den
Trailer-Offset des Segments verschoben. So geht die globale Aktionsbeschreibung
eines Beats nicht verloren, nur weil der scorebare Teil erst nach einer Blende
beginnt.
Der Segment-Offset zählt dabei nur über vorherige scorebare Bildinseln, nicht
über schwarze Lücken oder Blenden. Nach dem Retiming wird die nutzbare
Source-Dauer erneut geschätzt; läuft die Source am Ende in eine sichtbar andere
Aktionsphase, wird der Clip gekürzt und der Rest bleibt Placeholder/Fade statt
einen falschen Bewegungsmoment zu zeigen.
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
+37 -15
View File
@@ -640,12 +640,24 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
from dataclasses import replace
from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
from src.cv.scene_indexer import build_scene_index
from src.cv.global_scan import align_in_point_by_content_and_motion
from src.cv.global_scan import align_in_point_by_content_and_motion, estimate_usable_source_duration
logger = logging.getLogger(__name__)
beats_by_id = {beat.beat_id: beat for beat in beats}
scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
def visible_content_offset(action_beat, segment_start_offset_s: float) -> float:
    """Return the scoreable time that lies before ``segment_start_offset_s``.

    Walks the ``(start_s, end_s)`` pairs yielded by
    ``_reference_scoreable_segments(action_beat, cfg)`` in the order they are
    produced. Every pair that ends at or before the offset contributes its
    full length; the first pair that ends after the offset contributes only
    the part preceding the offset (nothing if it starts at or after it), and
    iteration stops there.

    Args:
        action_beat: Beat whose scoreable segments are inspected.
        segment_start_offset_s: Offset up to which scoreable time is summed.

    Returns:
        The accumulated scoreable duration in seconds; ``0.0`` when no pair
        starts before the offset.
    """
    # NOTE(review): the early exit assumes the pairs arrive sorted by start
    # time and share a timebase with ``segment_start_offset_s`` (the visible
    # caller passes check_beat.start_s - action_beat.start_s) - confirm
    # against ``_reference_scoreable_segments``.
    visible_s = 0.0
    for island_start_s, island_end_s in _reference_scoreable_segments(action_beat, cfg):
        if island_end_s <= segment_start_offset_s:
            # Entirely before the offset: count the whole island.
            visible_s += max(0.0, island_end_s - island_start_s)
            continue
        if island_start_s < segment_start_offset_s:
            # Straddles the offset: count only the leading part.
            visible_s += max(0.0, segment_start_offset_s - island_start_s)
        # This island ends after the offset, so nothing later can count.
        break
    return visible_s
def realign_window(check_beat, scene_id: int, action_beat=None):
scene = scenes_by_id.get(scene_id)
if scene is None:
@@ -655,9 +667,10 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
return None
start_s, end_s, semantic_score, reason = found
if action_beat is not None:
offset_delta_s = max(0.0, check_beat.start_s - action_beat.start_s)
start_s += offset_delta_s
end_s += offset_delta_s
segment_start_offset_s = max(0.0, check_beat.start_s - action_beat.start_s)
content_offset_s = visible_content_offset(action_beat, segment_start_offset_s)
start_s += content_offset_s
end_s += content_offset_s
window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0))
aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion(
check_beat,
@@ -666,12 +679,16 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
search_window_s=window_s,
)
aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s)))
usable_duration_s, usable_score = estimate_usable_source_duration(check_beat, aligned_in_s, cfg)
usable_duration_s = max(0.0, min(check_beat.duration_s, usable_duration_s))
if usable_duration_s < max(0.32, check_beat.duration_s * 0.45):
usable_duration_s = check_beat.duration_s
ok, verify_reason = validate_match_window_with_vision(
check_beat,
source_path=scene.source_path,
scene_id=scene.scene_id,
in_point_s=aligned_in_s,
out_point_s=aligned_in_s + check_beat.duration_s,
out_point_s=aligned_in_s + usable_duration_s,
cfg=cfg,
)
if not ok:
@@ -683,8 +700,11 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
verify_reason,
)
return None
score = max(combined_score, min(0.99, semantic_score * 0.70 + motion_score * 0.20 + content_score * 0.10))
return scene, aligned_in_s, score, f"{reason}; {verify_reason}"
score = max(
combined_score,
min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08),
)
return scene, aligned_in_s, usable_duration_s, score, f"{reason}; {verify_reason}"
kept = []
for result in results:
@@ -745,7 +765,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
if repair is None:
new_segments.append(segment)
continue
repair_scene, aligned_in_s, score, repair_reason = repair
repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
if abs(aligned_in_s - segment.in_point_s) <= 1.0 / cfg.export.edl_frame_rate:
new_segments.append(segment)
continue
@@ -755,7 +775,8 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
segment,
scene_id=repair_scene.scene_id,
in_point_s=aligned_in_s,
out_point_s=aligned_in_s + segment.duration_s,
out_point_s=aligned_in_s + usable_duration_s,
duration_s=usable_duration_s,
match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
))
@@ -783,7 +804,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
if scene is not None and scene.duration_s > max(result.duration_s * 1.6, 6.0):
repair = realign_window(beat, result.scene_id)
if repair is not None:
repair_scene, aligned_in_s, score, repair_reason = repair
repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
if abs(aligned_in_s - result.in_point_s) > 1.0 / cfg.export.edl_frame_rate:
logger.info(
"Beat %d: realigned semantically valid long scene by motion/action window (%s)",
@@ -794,7 +815,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
result,
scene_id=repair_scene.scene_id,
in_point_s=aligned_in_s,
out_point_s=aligned_in_s + result.duration_s,
out_point_s=aligned_in_s + usable_duration_s,
in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
@@ -817,13 +838,14 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
if repair is None:
all_repaired = False
break
scene, aligned_in_s, score, repair_reason = repair
scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
repair_reasons.append(repair_reason)
new_segments.append(replace(
segment,
scene_id=scene.scene_id,
in_point_s=aligned_in_s,
out_point_s=aligned_in_s + segment.duration_s,
out_point_s=aligned_in_s + usable_duration_s,
duration_s=usable_duration_s,
match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
))
@@ -849,7 +871,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
else:
repair = realign_window(beat, result.scene_id)
if repair is not None:
scene, aligned_in_s, score, repair_reason = repair
scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
logger.info(
"Beat %d: realigned inside matched scene by vision action window (%s)",
result.beat_id,
@@ -859,7 +881,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
result,
scene_id=scene.scene_id,
in_point_s=aligned_in_s,
out_point_s=aligned_in_s + result.duration_s,
out_point_s=aligned_in_s + usable_duration_s,
in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
match_score=score,
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,