diff --git a/HANDOVER.md b/HANDOVER.md index 01a55c7..1aae663 100644 --- a/HANDOVER.md +++ b/HANDOVER.md @@ -88,6 +88,20 @@ Wenn das fehlschlägt: existieren, sonst ruft Vision live ab (kostet Credits; braucht Netz). 3. `match_results.json.bak` zurückspielen, falls der Cache zerschossen ist. +## Aktuelle Coverage (vor neuestem Lauf) + +``` +total beats: 25 +matched: 20 (5 confirmed, 15 provisional) +unmatched: beats 0, 2, 21, 23, 24 +``` + +Beat 0 ist das SHO-Logo (kein Source-Match möglich, korrekt). +Beats 23/24 haben keine sichtbaren Inseln (Endcredits/Title) — auch +korrekt unmatched. +Beat 2 und Beat 21 sind die echten Recovery-Kandidaten; die neue +Recovery-Stufe versucht sie beim nächsten `match`-Lauf nachzuziehen. + ## Offene Risiken / Bekannte Schwächen - Die Schwelle `0.06` für "Beat-Kontext gewinnt" in `realign_window` ist diff --git a/README.md b/README.md index 52ec881..20edfa2 100644 --- a/README.md +++ b/README.md @@ -310,6 +310,21 @@ beim Verbindungsaufbau. Schlägt die Vision-Verifikation während der finalen Filter-/Repair-Stufe trotzdem dauerhaft fehl, wird der bisherige gecachte Treffer für diesen Beat behalten statt verworfen — ein Netzproblem darf keinen schon korrekt gefundenen Match aus dem Cache löschen. +Die Phasen-Reparatur an gefundenen Treffern läuft nicht mehr nur in „langen" +Source-Szenen, sondern überall dort, wo die Szene mehr als nur das +Segment-Fenster trägt. Eine korrigierte Position wird übernommen, sobald sie +das Bildinhalt-Validate besteht UND nicht spürbar schlechter scort als das +Original (≤ 0.02 Verlust). Bereits bestätigte Treffer in eng zugeschnittenen +Szenen werden bewusst nicht angefasst, damit ein guter Match nicht durch eine +nominell gleichwertige Alternative ausgetauscht wird. 
+Beats, die nach dem CV-Lauf weder als Vollmatch noch als Segmentmatch landen, +durchlaufen anschließend eine Recovery-Stufe: Vibe-Check (Histogramm/pHash) +liefert Top-K Kandidatenszenen, die semantische Action-Window-Suche prüft +darin die Phase des sichtbaren Trailerbeat-Anteils, und der CV-Aligner setzt +den Inpoint frame-genau. Übernommen wird nur ein Kandidat, der dieselbe +Vision-Phasenvalidierung wie der Hauptpfad besteht. Beats ohne sichtbares +Bildmaterial (Logos, Titel-Karten, durchgehende Fades) werden gar nicht erst +gesucht — sie sind bewusst kein Match. Lange Trailerbeats werden nicht mehr automatisch über ihre gesamte Beat-Länge gegen einen einzigen Source-Clip validiert. Sobald nach einem sichtbaren Source-Abschnitt eine anhaltende Schwarzblende oder Titel-/Credit-Insel beginnt, diff --git a/cli.py b/cli.py index ff2241c..0281354 100644 --- a/cli.py +++ b/cli.py @@ -632,6 +632,166 @@ def _merge_best_results(existing: list, candidates: list, cfg) -> list: return sorted(by_id.values(), key=lambda r: r.beat_id) +def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list: + """Try a vision-led search for beats that ended up without a match. + + For each unmatched beat that has scoreable visual content (i.e. not pure + fade/title-card material), this pass: + 1. Asks the vibe-check (CV histogram + pHash) for the top-K candidate + scenes. + 2. For each candidate, runs the semantic action-window search with the + beat's own description, preferring windows whose phase matches the + visible part of the beat. + 3. Refines the in-point with the regular CV content/motion aligner. + 4. Validates the resulting window with the vision phase check, exactly + like the main filter. + 5. Adds the best validated candidate as a MatchResult (confirmed only if its score reaches match_threshold). + + Confirmed and provisional matches both stay subject to the same thresholds + used elsewhere; this only adds matches that pass the same quality gates. 
+ """ + if not cfg.vision.enabled or not beats: + return results + + from dataclasses import replace + from src.cv.global_scan import align_in_point_by_content_and_motion, estimate_usable_source_duration + from src.cv.scene_indexer import build_scene_index + from src.cv.vibe_check import run_vibe_check + from src.core.models import MatchResult + from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision + + logger = logging.getLogger(__name__) + matched_ids = {r.beat_id for r in results} + unmatched = [b for b in beats if b.beat_id not in matched_ids] + if not unmatched: + return results + + scenes = build_scene_index(cfg) + if not scenes: + return results + + new_results = list(results) + for beat in unmatched: + try: + islands = _reference_scoreable_segments(beat, cfg) + except Exception: + islands = [] + if not islands: + # Pure fade/title material — no recovery possible by design. + continue + + # Use the longest visible island as the target for the recovery search. 
+ anchor_start_s, anchor_end_s = max(islands, key=lambda iv: iv[1] - iv[0]) + from dataclasses import replace as _replace + anchor_beat = _replace( + beat, + start_s=beat.start_s + anchor_start_s, + end_s=beat.start_s + anchor_end_s, + ) + + try: + hits = run_vibe_check( + beat, + scenes, + top_k=max(cfg.cv.deep_scan.scene_seed_top_k, cfg.cv.vibe_check.top_k_candidates), + hist_method=cfg.cv.vibe_check.hist_compare_method, + phash_max_distance=64, + ) + except Exception as exc: + logger.warning("Beat %d: recovery vibe-check failed (%s)", beat.beat_id, exc) + continue + + scenes_by_id = {s.scene_id: s for s in scenes} + best = None # (score, scene, in_s, dur_s, reason) + seen = set() + for hit in hits[: cfg.cv.deep_scan.scene_seed_top_k]: + scene = scenes_by_id.get(hit.scene_id) + if scene is None or scene.scene_id in seen: + continue + seen.add(scene.scene_id) + + try: + found = find_action_window_in_scene(anchor_beat, scene, cfg) + except Exception as exc: + logger.debug("Beat %d: action window failed for scene %d (%s)", beat.beat_id, scene.scene_id, exc) + continue + if found is None: + continue + start_s, end_s, semantic_score, reason = found + + window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0)) + try: + aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion( + anchor_beat, + start_s, + cfg, + search_window_s=window_s, + ) + except Exception as exc: + logger.debug("Beat %d: align failed for scene %d (%s)", beat.beat_id, scene.scene_id, exc) + continue + aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - anchor_beat.duration_s))) + + try: + usable_duration_s, usable_score = estimate_usable_source_duration(anchor_beat, aligned_in_s, cfg) + except Exception: + usable_duration_s, usable_score = anchor_beat.duration_s, 0.0 + usable_duration_s = max(0.0, min(anchor_beat.duration_s, usable_duration_s)) + if usable_duration_s < max(0.32, anchor_beat.duration_s * 0.45): + usable_duration_s = 
anchor_beat.duration_s + + try: + ok, verify_reason = validate_match_window_with_vision( + anchor_beat, + source_path=scene.source_path, + scene_id=scene.scene_id, + in_point_s=aligned_in_s, + out_point_s=aligned_in_s + usable_duration_s, + cfg=cfg, + ) + except Exception as exc: + logger.debug("Beat %d: validate failed scene=%d (%s)", beat.beat_id, scene.scene_id, exc) + continue + if not ok: + continue + + final_score = max( + combined_score, + min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08), + ) + if final_score < cfg.cv.deep_scan.provisional_match_threshold: + continue + candidate = (final_score, scene, aligned_in_s, usable_duration_s, f"recovery; {reason}; {verify_reason}") + if best is None or candidate[0] > best[0]: + best = candidate + + if best is None: + continue + score, scene, aligned_in_s, usable_duration_s, repair_reason = best + logger.info( + "Beat %d: recovered via vision action search scene=%d in=%.3fs score=%.3f (%s)", + beat.beat_id, + scene.scene_id, + aligned_in_s, + score, + repair_reason, + ) + new_results.append(MatchResult( + beat_id=beat.beat_id, + scene_id=scene.scene_id, + source_path=scene.source_path, + in_point_s=aligned_in_s, + out_point_s=aligned_in_s + usable_duration_s, + in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate), + match_score=score, + match_location=(0, 0), + is_confirmed=score >= cfg.cv.deep_scan.match_threshold, + segments=tuple(), + )) + + return sorted(new_results, key=lambda r: r.beat_id) + + def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) -> list: """Drop vision-enabled matches whose final action phase contradicts the beat.""" if not cfg.vision.enabled or not results: @@ -785,7 +945,16 @@ def _filter_repair_one(result, beat, beats_by_id, scenes_by_id, kept, cfg, reali changed = False for segment in result.segments: scene = scenes_by_id.get(segment.scene_id) - if scene is None or scene.duration_s <= 
max(segment.duration_s * 1.6, 6.0): + # Allow phase-realign whenever the scene has any meaningful + # slack beyond the segment, not only for "long" scenes. + # Short scenes don't need realigning because the segment + # essentially is the scene. + if scene is None or scene.duration_s <= segment.duration_s + 0.5: + new_segments.append(segment) + continue + # For already-confirmed segments, skip the realign to avoid + # destabilizing a strong original match. + if segment.is_confirmed and scene.duration_s <= max(segment.duration_s * 1.6, 6.0): new_segments.append(segment) continue segment_beat = replace( @@ -801,6 +970,11 @@ def _filter_repair_one(result, beat, beats_by_id, scenes_by_id, kept, cfg, reali if abs(aligned_in_s - segment.in_point_s) <= 1.0 / cfg.export.edl_frame_rate: new_segments.append(segment) continue + # Don't commit a repair that scores meaningfully worse than + # the original; phase realign should improve, not regress. + if score < segment.match_score - 0.02: + new_segments.append(segment) + continue changed = True repair_reasons.append(repair_reason) new_segments.append(replace( @@ -833,11 +1007,22 @@ def _filter_repair_one(result, beat, beats_by_id, scenes_by_id, kept, cfg, reali repaired = True else: scene = scenes_by_id.get(result.scene_id) - if scene is not None and scene.duration_s > max(result.duration_s * 1.6, 6.0): + wide_scene = ( + scene is not None + and scene.duration_s > result.duration_s + 0.5 + ) + already_confirmed_in_tight_scene = ( + result.is_confirmed + and scene is not None + and scene.duration_s <= max(result.duration_s * 1.6, 6.0) + ) + if wide_scene and not already_confirmed_in_tight_scene: repair = realign_window(beat, result.scene_id) if repair is not None: repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair - if abs(aligned_in_s - result.in_point_s) > 1.0 / cfg.export.edl_frame_rate: + moved = abs(aligned_in_s - result.in_point_s) > 1.0 / cfg.export.edl_frame_rate + improved = score >= 
result.match_score - 0.02 + if moved and improved: logger.info( "Beat %d: realigned semantically valid long scene by motion/action window (%s)", result.beat_id, @@ -1271,6 +1456,7 @@ def cmd_match(args: argparse.Namespace, cfg) -> list: ) results = _attach_visual_segments(results, beats, cfg) results = _filter_semantically_invalid_vision_matches(results, beats, cfg) + results = _recover_unmatched_beats_via_vision(results, beats, cfg) # A targeted one-beat match should improve the cache without deleting # automatic matches for other beats.