Compare commits
2 Commits
1a177d6b89
...
a5a84a9145
| Author | SHA1 | Date | |
|---|---|---|---|
| a5a84a9145 | |||
| 3ea5582b49 |
@@ -137,6 +137,11 @@ Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen
|
|||||||
groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets
|
groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets
|
||||||
verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller
|
verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller
|
||||||
als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls.
|
als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls.
|
||||||
|
Zusätzlich wird die Bewegungsphase über Frame-zu-Frame-Differenzen verglichen.
|
||||||
|
Dadurch kann der Matcher innerhalb derselben Source-Szene unterscheiden, ob
|
||||||
|
zwei Figuren noch sprechen, sich annähern, bereits im Kontakt sind oder sich
|
||||||
|
wieder voneinander lösen. Ein optisch ähnlicher Standbild-Treffer reicht damit
|
||||||
|
nicht mehr aus, wenn der Bewegungsverlauf nicht zur Referenz passt.
|
||||||
Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese
|
Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese
|
||||||
Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst
|
Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst
|
||||||
den Inpoint bestimmt.
|
den Inpoint bestimmt.
|
||||||
@@ -164,6 +169,11 @@ Zeitbereich nochmals gegen den Trailer-Beat prüfen. Starke Aktionsphasen wie
|
|||||||
Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch
|
Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch
|
||||||
im Source-Fenster beschrieben sein; fehlt diese Aktionsphase, wird der Treffer
|
im Source-Fenster beschrieben sein; fehlt diese Aktionsphase, wird der Treffer
|
||||||
nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist.
|
nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist.
|
||||||
|
Wenn die Szene selbst plausibel ist, aber der konkrete Source-Zeitpunkt diese
|
||||||
|
Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben
|
||||||
|
Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet
|
||||||
|
den Inpoint mit der Motion-Phase-Prüfung darauf neu aus. Erst wenn auch diese
|
||||||
|
In-Scene-Reparatur scheitert, wird der Treffer verworfen.
|
||||||
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
|
Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
|
||||||
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
|
FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
|
||||||
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
|
Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
|
||||||
|
|||||||
@@ -638,10 +638,56 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
from dataclasses import replace
|
from dataclasses import replace
|
||||||
from src.llm.vision_cache import validate_match_window_with_vision
|
from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
|
||||||
|
from src.cv.scene_indexer import build_scene_index
|
||||||
|
from src.cv.global_scan import align_in_point_by_content, align_in_point_by_motion
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
beats_by_id = {beat.beat_id: beat for beat in beats}
|
beats_by_id = {beat.beat_id: beat for beat in beats}
|
||||||
|
scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
|
||||||
|
|
||||||
|
def realign_window(check_beat, scene_id: int):
    """Try to re-anchor ``check_beat`` on its action phase inside one scene.

    Looks the scene up by ``scene_id``, asks the vision layer for the window
    that shows the beat's action, refines the in-point first by motion and
    then by content, clamps it to the scene and finally re-validates the
    resulting window with vision.

    Returns:
        ``(scene, in_point_s, score, reason)`` on success, or ``None`` when
        the scene is unknown, no action window is found, or the re-aligned
        window fails vision validation.
    """
    target_scene = scenes_by_id.get(scene_id)
    if target_scene is None:
        return None

    action_window = find_action_window_in_scene(check_beat, target_scene, cfg)
    if action_window is None:
        return None
    window_start_s, window_end_s, semantic_score, semantic_reason = action_window

    # Motion search half-width: 1.5x the action window length, kept in 1.0-4.0 s.
    motion_window_s = max(1.0, min(4.0, (window_end_s - window_start_s) * 1.5))
    motion_in_s, motion_score = align_in_point_by_motion(
        check_beat,
        window_start_s,
        cfg,
        search_window_s=motion_window_s,
    )

    # Content alignment only fine-tunes around the motion result (at most 0.8 s).
    content_in_s, content_score = align_in_point_by_content(
        check_beat,
        motion_in_s,
        cfg,
        search_window_s=min(motion_window_s, 0.8),
    )

    # Keep the in-point inside the scene and, where the scene is long enough,
    # early enough that the whole beat duration still fits before the scene end.
    latest_in_s = max(target_scene.start_s, target_scene.end_s - check_beat.duration_s)
    aligned_in_s = max(target_scene.start_s, min(content_in_s, latest_in_s))

    ok, verify_reason = validate_match_window_with_vision(
        check_beat,
        source_path=target_scene.source_path,
        scene_id=target_scene.scene_id,
        in_point_s=aligned_in_s,
        out_point_s=aligned_in_s + check_beat.duration_s,
        cfg=cfg,
    )
    if not ok:
        logger.info(
            "Beat %d: action-window realign rejected scene=%d in=%.3fs (%s)",
            check_beat.beat_id,
            target_scene.scene_id,
            aligned_in_s,
            verify_reason,
        )
        return None

    # The semantic/motion blend is capped at 0.99; a higher content score wins.
    blended_score = min(0.99, semantic_score * 0.75 + motion_score * 0.25)
    score = max(content_score, blended_score)
    return target_scene, aligned_in_s, score, f"{semantic_reason}; {verify_reason}"
|
||||||
|
|
||||||
kept = []
|
kept = []
|
||||||
for result in results:
|
for result in results:
|
||||||
beat = beats_by_id.get(result.beat_id)
|
beat = beats_by_id.get(result.beat_id)
|
||||||
@@ -684,6 +730,68 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
|
|||||||
if valid:
|
if valid:
|
||||||
kept.append(result)
|
kept.append(result)
|
||||||
else:
|
else:
|
||||||
|
if getattr(result, "segments", ()):
|
||||||
|
new_segments = []
|
||||||
|
all_repaired = True
|
||||||
|
repair_reasons = []
|
||||||
|
for segment in result.segments:
|
||||||
|
segment_beat = replace(
|
||||||
|
beat,
|
||||||
|
start_s=beat.start_s + segment.trailer_offset_s,
|
||||||
|
end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
|
||||||
|
)
|
||||||
|
repair = realign_window(segment_beat, segment.scene_id)
|
||||||
|
if repair is None:
|
||||||
|
all_repaired = False
|
||||||
|
break
|
||||||
|
scene, aligned_in_s, score, repair_reason = repair
|
||||||
|
repair_reasons.append(repair_reason)
|
||||||
|
new_segments.append(replace(
|
||||||
|
segment,
|
||||||
|
scene_id=scene.scene_id,
|
||||||
|
in_point_s=aligned_in_s,
|
||||||
|
out_point_s=aligned_in_s + segment.duration_s,
|
||||||
|
match_score=score,
|
||||||
|
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
|
||||||
|
))
|
||||||
|
if all_repaired and new_segments:
|
||||||
|
first = new_segments[0]
|
||||||
|
repaired_score = min(seg.match_score for seg in new_segments)
|
||||||
|
logger.info(
|
||||||
|
"Beat %d: realigned inside matched scene by vision action windows (%s)",
|
||||||
|
result.beat_id,
|
||||||
|
"; ".join(repair_reasons),
|
||||||
|
)
|
||||||
|
kept.append(replace(
|
||||||
|
result,
|
||||||
|
scene_id=first.scene_id,
|
||||||
|
in_point_s=first.in_point_s,
|
||||||
|
out_point_s=first.out_point_s,
|
||||||
|
in_point_frame=int(first.in_point_s * cfg.export.edl_frame_rate),
|
||||||
|
match_score=repaired_score,
|
||||||
|
is_confirmed=repaired_score >= cfg.cv.deep_scan.match_threshold,
|
||||||
|
segments=tuple(new_segments),
|
||||||
|
))
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
repair = realign_window(beat, result.scene_id)
|
||||||
|
if repair is not None:
|
||||||
|
scene, aligned_in_s, score, repair_reason = repair
|
||||||
|
logger.info(
|
||||||
|
"Beat %d: realigned inside matched scene by vision action window (%s)",
|
||||||
|
result.beat_id,
|
||||||
|
repair_reason,
|
||||||
|
)
|
||||||
|
kept.append(replace(
|
||||||
|
result,
|
||||||
|
scene_id=scene.scene_id,
|
||||||
|
in_point_s=aligned_in_s,
|
||||||
|
out_point_s=aligned_in_s + result.duration_s,
|
||||||
|
in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
|
||||||
|
match_score=score,
|
||||||
|
is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
|
||||||
|
))
|
||||||
|
continue
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Beat %d: rejected by vision action-phase verification (%s)",
|
"Beat %d: rejected by vision action-phase verification (%s)",
|
||||||
result.beat_id,
|
result.beat_id,
|
||||||
|
|||||||
+64
-3
@@ -827,6 +827,50 @@ def _motion_phase_score(
|
|||||||
return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35)
|
return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35)
|
||||||
|
|
||||||
|
|
||||||
|
def align_in_point_by_motion(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
) -> tuple[float, float]:
    """
    Align a candidate by matching the frame-to-frame motion pattern.

    This catches the common failure mode where the right source scene is found,
    but the in-point is a few seconds too early or late inside a repeated
    conversation/action beat.

    Args:
        beat: Trailer beat whose motion templates are compared.
        estimated_in_point_s: Coarse source in-point in seconds; the search
            runs from ``estimated_in_point_s - window`` (not below 0) to
            ``estimated_in_point_s + window`` in steps of one frame.
        cfg: Application config. Supplies the source movie path, the fallback
            frame rate (``cfg.export.edl_frame_rate``), the default window and
            the tie-break delta.
        search_window_s: Half-width of the search range in seconds. ``None``
            uses ``cfg.cv.deep_scan.content_align_window_seconds``.

    Returns:
        ``(in_point_s, score)``. The estimate is returned unchanged with score
        ``0.0`` when fewer than two motion templates exist or when no usable
        (finite, positive) frame rate is available. The score is the peak
        score seen during the search, clamped to be non-negative; because of
        the tie-break it is not necessarily the score measured exactly at the
        returned in-point.
    """
    import math  # local import keeps this block self-contained

    motion_templates = _prepare_motion_templates(beat, cfg)
    if len(motion_templates) < 2:
        return estimated_in_point_s, 0.0

    with open_video(cfg.paths.source_movie) as cap:
        # Prefer the container frame rate and fall back to the export rate.
        # A zero, negative or non-finite rate would give a zero-division or a
        # step that never reaches the end of the window, so it is rejected.
        fps = float(cap.get(cv2.CAP_PROP_FPS))
        if not (math.isfinite(fps) and fps > 0.0):
            fps = float(cfg.export.edl_frame_rate)
        if not (math.isfinite(fps) and fps > 0.0):
            return estimated_in_point_s, 0.0
        frame_step_s = 1.0 / fps

        window_s = (
            search_window_s
            if search_window_s is not None
            else cfg.cv.deep_scan.content_align_window_seconds
        )
        start_s = max(0.0, estimated_in_point_s - window_s)
        end_s = estimated_in_point_s + window_s
        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta

        best_in = estimated_in_point_s
        best_score = -1.0
        step_index = 0
        t = start_s
        while t <= end_s:
            score = _motion_phase_score(cap, t, motion_templates, cfg)
            if score > best_score + tie_delta:
                # Clearly better than anything seen so far.
                best_score = score
                best_in = t
            elif (
                score >= best_score - tie_delta
                and abs(t - estimated_in_point_s) < abs(best_in - estimated_in_point_s)
            ):
                # Near-tie: prefer the time closest to the estimate. The peak
                # score is kept so successive near-ties cannot drift downwards.
                best_in = t
            # Derive each sample time from the step index instead of adding to
            # a rounded running sum, so rounding error does not accumulate.
            step_index += 1
            t = round(start_s + step_index * frame_step_s, 6)

    return best_in, max(0.0, best_score)
|
||||||
|
|
||||||
|
|
||||||
def estimate_usable_source_duration(
|
def estimate_usable_source_duration(
|
||||||
beat: TrailerBeat,
|
beat: TrailerBeat,
|
||||||
in_point_s: float,
|
in_point_s: float,
|
||||||
@@ -1190,6 +1234,7 @@ def run_global_scan(
|
|||||||
for _, coarse_score, in_point_s in reranked_candidates[:refine_limit]
|
for _, coarse_score, in_point_s in reranked_candidates[:refine_limit]
|
||||||
]
|
]
|
||||||
validation_templates = _prepare_validation_templates(b, cfg)
|
validation_templates = _prepare_validation_templates(b, cfg)
|
||||||
|
motion_templates = _prepare_motion_templates(b, cfg)
|
||||||
logger.info(
|
logger.info(
|
||||||
'Beat %d: content-reranked top %d / %d candidates.',
|
'Beat %d: content-reranked top %d / %d candidates.',
|
||||||
b.beat_id,
|
b.beat_id,
|
||||||
@@ -1270,6 +1315,16 @@ def run_global_scan(
|
|||||||
if matchable_duration_s > 0 else 0.0
|
if matchable_duration_s > 0 else 0.0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
motion_score = 0.0
|
||||||
|
if len(motion_templates) >= 2:
|
||||||
|
with open_video(cfg.paths.source_movie) as motion_cap:
|
||||||
|
motion_score = _motion_phase_score(
|
||||||
|
motion_cap,
|
||||||
|
adjusted_in_s,
|
||||||
|
motion_templates,
|
||||||
|
cfg,
|
||||||
|
)
|
||||||
|
|
||||||
if is_weighted_seed_candidate and scene is not None and content_score >= content_gate:
|
if is_weighted_seed_candidate and scene is not None and content_score >= content_gate:
|
||||||
contiguous_usable_s = _contiguous_scene_coverage_duration(
|
contiguous_usable_s = _contiguous_scene_coverage_duration(
|
||||||
b,
|
b,
|
||||||
@@ -1299,11 +1354,15 @@ def run_global_scan(
|
|||||||
final_score * (1.0 - scan_cfg.content_validation_weight)
|
final_score * (1.0 - scan_cfg.content_validation_weight)
|
||||||
+ content_score * scan_cfg.content_validation_weight
|
+ content_score * scan_cfg.content_validation_weight
|
||||||
)
|
)
|
||||||
|
if len(motion_templates) >= 2:
|
||||||
|
motion_score_clamped = max(0.0, min(1.0, motion_score))
|
||||||
|
final_score = final_score * 0.82 + motion_score_clamped * 0.18
|
||||||
if is_weighted_seed_candidate:
|
if is_weighted_seed_candidate:
|
||||||
vision_provisional_score = (
|
vision_provisional_score = (
|
||||||
content_score * 0.55
|
content_score * 0.45
|
||||||
+ duration_coverage * 0.33
|
+ duration_coverage * 0.33
|
||||||
+ coarse_score * 0.12
|
+ coarse_score * 0.12
|
||||||
|
+ max(0.0, min(1.0, motion_score)) * 0.10
|
||||||
)
|
)
|
||||||
final_score = max(final_score, vision_provisional_score)
|
final_score = max(final_score, vision_provisional_score)
|
||||||
if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate:
|
if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate:
|
||||||
@@ -1332,7 +1391,7 @@ def run_global_scan(
|
|||||||
if duration_coverage < scan_cfg.min_duration_coverage:
|
if duration_coverage < scan_cfg.min_duration_coverage:
|
||||||
rejected_short_candidates += 1
|
rejected_short_candidates += 1
|
||||||
logger.debug(
|
logger.debug(
|
||||||
'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
|
'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
|
||||||
b.beat_id,
|
b.beat_id,
|
||||||
adjusted_in_s,
|
adjusted_in_s,
|
||||||
scene.scene_id if scene is not None else 'none',
|
scene.scene_id if scene is not None else 'none',
|
||||||
@@ -1340,6 +1399,7 @@ def run_global_scan(
|
|||||||
span_score,
|
span_score,
|
||||||
coarse_score,
|
coarse_score,
|
||||||
content_score,
|
content_score,
|
||||||
|
motion_score,
|
||||||
duration_coverage,
|
duration_coverage,
|
||||||
final_score,
|
final_score,
|
||||||
)
|
)
|
||||||
@@ -1364,7 +1424,7 @@ def run_global_scan(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
|
'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
|
||||||
b.beat_id,
|
b.beat_id,
|
||||||
adjusted_in_s,
|
adjusted_in_s,
|
||||||
scene.scene_id if scene is not None else 'none',
|
scene.scene_id if scene is not None else 'none',
|
||||||
@@ -1372,6 +1432,7 @@ def run_global_scan(
|
|||||||
span_score,
|
span_score,
|
||||||
coarse_score,
|
coarse_score,
|
||||||
content_score,
|
content_score,
|
||||||
|
motion_score,
|
||||||
duration_coverage,
|
duration_coverage,
|
||||||
final_score,
|
final_score,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -595,3 +595,73 @@ def validate_match_window_with_vision(
|
|||||||
if missing_actions and score < threshold:
|
if missing_actions and score < threshold:
|
||||||
return False, f"{reason} missing_actions={sorted(missing_actions)}"
|
return False, f"{reason} missing_actions={sorted(missing_actions)}"
|
||||||
return True, reason
|
return True, reason
|
||||||
|
|
||||||
|
|
||||||
|
def find_action_window_in_scene(
    beat: TrailerBeat,
    scene: Scene,
    cfg: AppConfig,
) -> tuple[float, float, float, str] | None:
    """
    Search one already selected source scene for the beat's action phase.

    This is used after CV picked the right broad scene but the wrong time
    inside that scene. It stays automatic and cached: windows are described
    evenly across the scene until the per-run vision budget is consumed.

    Args:
        beat: Trailer beat whose strong action groups must appear in the
            source window.
        scene: Source scene to search.
        cfg: Application config; the ``cfg.vision`` settings control whether
            the search runs, the description budget, the number of windows
            and the similarity threshold.

    Returns:
        ``(start_s, end_s, score, reason)`` for the highest-scoring window
        that shows every strong action of the beat and reaches the score
        threshold (the first such window wins ties), or ``None`` when vision
        is disabled, the scene has no duration, the beat cannot be described,
        the beat has no strong action, or no window qualifies.
    """
    if not cfg.vision.enabled or scene.duration_s <= 0:
        return None

    cache = _load_cache(cfg)
    # Single-element list so the remaining budget is shared across calls.
    budget = [max(0, cfg.vision.max_new_descriptions_per_run)]
    beat_desc = _describe_sample(
        kind="beat",
        item_id=beat.beat_id,
        label=f"trailer beat {beat.beat_id} action search",
        video_path=beat.trailer_path,
        start_s=beat.start_s,
        end_s=beat.end_s,
        cfg=cfg,
        cache=cache,
        budget=budget,
    )
    if not beat_desc:
        return None

    beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
    if not beat_actions:
        # Persist the cache before leaving: the beat description above may be
        # new (presumably _describe_sample stores it in ``cache`` -- TODO
        # confirm) and would otherwise be lost on this early exit.
        _save_cache(cfg, cache)
        return None

    max_windows = max(
        cfg.vision.seed_points_per_scene,
        cfg.vision.max_new_descriptions_per_run,
    )
    # Stricter than the plain similarity threshold; constant for all windows.
    threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)

    best: tuple[float, float, float, str] | None = None
    for start_s, end_s in _scene_window_ranges(scene, beat, max_windows):
        desc = _describe_sample(
            kind="action_window",
            item_id=scene.scene_id,
            label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
            video_path=scene.source_path,
            start_s=start_s,
            end_s=end_s,
            cfg=cfg,
            cache=cache,
            budget=budget,
        )
        if not desc:
            continue

        score, reason = _semantic_match_score(beat_desc, desc)
        source_actions = _semantic_action_groups(desc)
        missing_actions = beat_actions - source_actions
        if missing_actions:
            # Every strong action of the beat must be present in the window.
            continue
        if score < threshold:
            continue
        if best is None or score > best[2]:
            best = (start_s, end_s, score, reason)

    _save_cache(cfg, cache)
    return best
|
||||||
|
|||||||
Reference in New Issue
Block a user