Filter cached vision action windows
This commit is contained in:
@@ -908,7 +908,7 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
|
|||||||
Confirmed and provisional matches both stay subject to the same thresholds
|
Confirmed and provisional matches both stay subject to the same thresholds
|
||||||
used elsewhere; this only adds matches that pass the same quality gates.
|
used elsewhere; this only adds matches that pass the same quality gates.
|
||||||
"""
|
"""
|
||||||
if not cfg.vision.enabled or not beats:
|
if not beats:
|
||||||
return results
|
return results
|
||||||
|
|
||||||
from dataclasses import replace
|
from dataclasses import replace
|
||||||
@@ -977,6 +977,79 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
|
|||||||
|
|
||||||
scenes_by_id = {s.scene_id: s for s in scenes}
|
scenes_by_id = {s.scene_id: s for s in scenes}
|
||||||
best = None # (score, scene, in_s, dur_s, reason)
|
best = None # (score, scene, in_s, dur_s, reason)
|
||||||
|
try:
|
||||||
|
from src.llm.vision_cache import (
|
||||||
|
_load_cache,
|
||||||
|
_semantic_action_groups,
|
||||||
|
_semantic_match_score,
|
||||||
|
_STRONG_ACTION_GROUPS,
|
||||||
|
)
|
||||||
|
cache = _load_cache(cfg)
|
||||||
|
items = cache.get("items", {})
|
||||||
|
beat_desc = ""
|
||||||
|
if isinstance(items, dict):
|
||||||
|
for item in items.values():
|
||||||
|
if (
|
||||||
|
isinstance(item, dict)
|
||||||
|
and item.get("kind") == "beat"
|
||||||
|
and item.get("item_id") == beat.beat_id
|
||||||
|
):
|
||||||
|
beat_desc = str(item.get("description", ""))
|
||||||
|
break
|
||||||
|
beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS if beat_desc else set()
|
||||||
|
identity_vocab = {
|
||||||
|
"woman", "women", "man", "men", "girl", "boy", "child",
|
||||||
|
"blonde", "hair", "face", "mouth", "eyes", "profile",
|
||||||
|
"close-up", "closeup",
|
||||||
|
}
|
||||||
|
beat_identity = {term for term in identity_vocab if term in beat_desc.lower()}
|
||||||
|
distinctive_identity = {
|
||||||
|
term for term in ("woman", "women", "blonde", "mouth", "face")
|
||||||
|
if term in beat_desc.lower()
|
||||||
|
}
|
||||||
|
if beat_actions and isinstance(items, dict):
|
||||||
|
for item in items.values():
|
||||||
|
if not isinstance(item, dict) or item.get("kind") != "action_window":
|
||||||
|
continue
|
||||||
|
scene = scenes_by_id.get(item.get("item_id"))
|
||||||
|
desc = str(item.get("description", ""))
|
||||||
|
source_actions = _semantic_action_groups(desc)
|
||||||
|
if scene is None or not beat_actions <= source_actions:
|
||||||
|
continue
|
||||||
|
source_text = desc.lower()
|
||||||
|
positive_source_text = source_text.split('"negatives"', 1)[0]
|
||||||
|
identity_overlap = {term for term in beat_identity if term in source_text}
|
||||||
|
if len(beat_identity) >= 2 and len(identity_overlap) < 2:
|
||||||
|
continue
|
||||||
|
if distinctive_identity and not any(term in positive_source_text for term in distinctive_identity):
|
||||||
|
continue
|
||||||
|
if "mouth" in beat_desc.lower() and "mouth" not in positive_source_text:
|
||||||
|
continue
|
||||||
|
if "dark interior" in beat_desc.lower() and (
|
||||||
|
"interior" not in positive_source_text or "dark" not in positive_source_text
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
score, reason = _semantic_match_score(beat_desc, desc)
|
||||||
|
if score < max(0.60, cfg.cv.deep_scan.provisional_match_threshold):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
in_s = float(item.get("start_s"))
|
||||||
|
out_s = float(item.get("end_s"))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
continue
|
||||||
|
duration_s = max(0.32, min(anchor_beat.duration_s, out_s - in_s))
|
||||||
|
candidate = (
|
||||||
|
min(0.99, score),
|
||||||
|
scene,
|
||||||
|
in_s,
|
||||||
|
duration_s,
|
||||||
|
f"cached vision action; {reason}",
|
||||||
|
)
|
||||||
|
if best is None or candidate[0] > best[0]:
|
||||||
|
best = candidate
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug("Beat %d: cached vision fallback failed (%s)", beat.beat_id, exc)
|
||||||
|
|
||||||
seen = set()
|
seen = set()
|
||||||
for hit in hits[: cfg.cv.deep_scan.scene_seed_top_k]:
|
for hit in hits[: cfg.cv.deep_scan.scene_seed_top_k]:
|
||||||
scene = scenes_by_id.get(hit.scene_id)
|
scene = scenes_by_id.get(hit.scene_id)
|
||||||
@@ -1003,7 +1076,10 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
|
|||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.debug("Beat %d: align failed for scene %d (%s)", beat.beat_id, scene.scene_id, exc)
|
logger.debug("Beat %d: align failed for scene %d (%s)", beat.beat_id, scene.scene_id, exc)
|
||||||
continue
|
aligned_in_s = start_s
|
||||||
|
combined_score = semantic_score
|
||||||
|
content_score = 0.0
|
||||||
|
motion_score = 0.0
|
||||||
aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - anchor_beat.duration_s)))
|
aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - anchor_beat.duration_s)))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -1033,6 +1109,8 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
|
|||||||
combined_score,
|
combined_score,
|
||||||
min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08),
|
min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08),
|
||||||
)
|
)
|
||||||
|
if semantic_score >= max(0.60, cfg.cv.deep_scan.provisional_match_threshold):
|
||||||
|
final_score = max(final_score, semantic_score)
|
||||||
if final_score < cfg.cv.deep_scan.provisional_match_threshold:
|
if final_score < cfg.cv.deep_scan.provisional_match_threshold:
|
||||||
continue
|
continue
|
||||||
candidate = (final_score, scene, aligned_in_s, usable_duration_s, f"recovery; {reason}; {verify_reason}")
|
candidate = (final_score, scene, aligned_in_s, usable_duration_s, f"recovery; {reason}; {verify_reason}")
|
||||||
|
|||||||
@@ -194,6 +194,32 @@ Die Vision-Recovery läuft nicht nur für komplett fehlende Beats, sondern auch
|
|||||||
für schwache unbestätigte Treffer. Gerade Low-Light-Beats dürfen nicht an einem
|
für schwache unbestätigte Treffer. Gerade Low-Light-Beats dürfen nicht an einem
|
||||||
falschen dunklen CV-Treffer hängen bleiben, wenn der Cache semantisch eine
|
falschen dunklen CV-Treffer hängen bleiben, wenn der Cache semantisch eine
|
||||||
bessere Handlungsphase kennt.
|
bessere Handlungsphase kennt.
|
||||||
|
Bei langen Source-Szenen prüft die Action-Window-Suche immer den Szenenanfang
|
||||||
|
und mehrere frühe Fenster, bevor sie gleichmäßig über die ganze Szene sampelt.
|
||||||
|
Damit gehen kurze Trailer-Aktionen am Anfang einer langen Szene nicht unter,
|
||||||
|
wenn der Rest der Szene aus Credits, Schwarzbild oder ruhigen Folgeframes
|
||||||
|
besteht.
|
||||||
|
Wenn ein Action-Window die starke Beat-Aktion explizit enthält, darf es eine
|
||||||
|
etwas niedrigere Textähnlichkeit haben; die Handlung zählt dann stärker als
|
||||||
|
Nebenwörter zu Licht, Bildausschnitt oder Stimmung.
|
||||||
|
Bereits gecachte Action-Windows einer Szene bleiben gültige Kandidaten, auch
|
||||||
|
wenn sich das aktuelle Sampling-Raster ändert. So verliert der Matcher keine
|
||||||
|
teuren Vision-Hinweise und muss dieselben Fenster nicht erneut beschreiben.
|
||||||
|
Wenn neue Vision-Calls deaktiviert sind, darf die Recovery vorhandene Cache-
|
||||||
|
Beschreibungen trotzdem lesen; das erzeugt keine API-Kosten und verhindert,
|
||||||
|
dass alte schwache CV-Treffer stehen bleiben.
|
||||||
|
Schlägt die CV-Feinjustierung bei einem semantisch klaren Low-Light-Fenster
|
||||||
|
fehl, bleibt das Action-Window als provisorischer Treffer erhalten. CV darf
|
||||||
|
einen dunklen Treffer verfeinern, aber nicht einen eindeutigen Cache-Hinweis
|
||||||
|
komplett verwerfen.
|
||||||
|
Zusätzlich kann Recovery vorhandene gecachte Action-Windows direkt über alle
|
||||||
|
Szenen ranken. Dieser schnelle Pfad vermeidet einen teuren Vollscan, wenn der
|
||||||
|
Cache bereits eine starke Aktion wie Hand-am-Mund, Kuss oder Blickwechsel
|
||||||
|
enthält.
|
||||||
|
Eindeutige Begriffe aus der Beat-Beschreibung wirken als harte Filter für
|
||||||
|
Vision-Fenster: `mouth` muss im Kandidaten wiederkehren, `dark interior` darf
|
||||||
|
nicht auf Outdoor-Material fallen, und markante Personenmerkmale wie `blonde`
|
||||||
|
bleiben bindend.
|
||||||
|
|
||||||
Der zusätzliche Hi-Res-Phasenrefine bleibt lokal um den bereits validierten
|
Der zusätzliche Hi-Res-Phasenrefine bleibt lokal um den bereits validierten
|
||||||
Inpoint und übernimmt nur klare Verbesserungen. Er darf keine ganze lange
|
Inpoint und übernimmt nur klare Verbesserungen. Er darf keine ganze lange
|
||||||
|
|||||||
+118
-28
@@ -434,12 +434,20 @@ def _scene_window_ranges(scene: Scene, beat: TrailerBeat, max_windows: int) -> l
|
|||||||
|
|
||||||
usable_start = scene.start_s
|
usable_start = scene.start_s
|
||||||
usable_end = max(scene.start_s, scene.end_s - window_s)
|
usable_end = max(scene.start_s, scene.end_s - window_s)
|
||||||
if max_windows == 1:
|
starts = [usable_start]
|
||||||
starts = [usable_start + (usable_end - usable_start) * 0.5]
|
early_step = max(0.5, window_s * 0.75)
|
||||||
else:
|
for idx in range(1, min(max_windows, 4)):
|
||||||
step = (usable_end - usable_start) / max(1, max_windows - 1)
|
starts.append(min(usable_end, usable_start + early_step * idx))
|
||||||
starts = [usable_start + step * idx for idx in range(max_windows)]
|
remaining = max_windows - len(starts)
|
||||||
return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in starts]
|
if remaining > 0:
|
||||||
|
if remaining == 1:
|
||||||
|
starts.append(usable_start + (usable_end - usable_start) * 0.5)
|
||||||
|
else:
|
||||||
|
step = (usable_end - usable_start) / max(1, remaining - 1)
|
||||||
|
starts.extend(usable_start + step * idx for idx in range(remaining))
|
||||||
|
|
||||||
|
deduped = sorted({round(max(usable_start, min(usable_end, s)), 3) for s in starts})
|
||||||
|
return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in deduped[:max_windows]]
|
||||||
|
|
||||||
|
|
||||||
def _cached_scene_descriptions(
|
def _cached_scene_descriptions(
|
||||||
@@ -749,11 +757,11 @@ def find_action_window_in_scene(
|
|||||||
inside that scene. It stays automatic and cached: windows are described
|
inside that scene. It stays automatic and cached: windows are described
|
||||||
evenly across the scene until the per-run vision budget is consumed.
|
evenly across the scene until the per-run vision budget is consumed.
|
||||||
"""
|
"""
|
||||||
if not cfg.vision.enabled or scene.duration_s <= 0:
|
if scene.duration_s <= 0:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
cache = _load_cache(cfg)
|
cache = _load_cache(cfg)
|
||||||
budget = [max(0, cfg.vision.max_new_descriptions_per_run)]
|
budget = [max(0, cfg.vision.max_new_descriptions_per_run) if cfg.vision.enabled else 0]
|
||||||
beat_desc = _describe_sample(
|
beat_desc = _describe_sample(
|
||||||
kind="beat",
|
kind="beat",
|
||||||
item_id=beat.beat_id,
|
item_id=beat.beat_id,
|
||||||
@@ -772,37 +780,37 @@ def find_action_window_in_scene(
|
|||||||
if not beat_actions:
|
if not beat_actions:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
max_windows = max(
|
|
||||||
cfg.vision.seed_points_per_scene,
|
|
||||||
cfg.vision.max_new_descriptions_per_run,
|
|
||||||
)
|
|
||||||
best: tuple[float, float, float, str] | None = None
|
best: tuple[float, float, float, str] | None = None
|
||||||
for start_s, end_s in _scene_window_ranges(scene, beat, max_windows):
|
|
||||||
desc = _describe_sample(
|
def consider_candidate(start_s: float, end_s: float, desc: str) -> None:
|
||||||
kind="action_window",
|
nonlocal best
|
||||||
item_id=scene.scene_id,
|
|
||||||
label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
|
|
||||||
video_path=scene.source_path,
|
|
||||||
start_s=start_s,
|
|
||||||
end_s=end_s,
|
|
||||||
cfg=cfg,
|
|
||||||
cache=cache,
|
|
||||||
budget=budget,
|
|
||||||
)
|
|
||||||
if not desc:
|
if not desc:
|
||||||
continue
|
return
|
||||||
|
beat_text = beat_desc.lower()
|
||||||
|
source_text = desc.lower()
|
||||||
|
positive_source_text = source_text.split('"negatives"', 1)[0]
|
||||||
|
if "mouth" in beat_text and "mouth" not in positive_source_text:
|
||||||
|
return
|
||||||
|
if "dark interior" in beat_text and (
|
||||||
|
"interior" not in positive_source_text or "dark" not in positive_source_text
|
||||||
|
):
|
||||||
|
return
|
||||||
|
if "blonde" in beat_text and "blonde" not in positive_source_text:
|
||||||
|
return
|
||||||
score, reason = _semantic_match_score(beat_desc, desc)
|
score, reason = _semantic_match_score(beat_desc, desc)
|
||||||
source_actions = _semantic_action_groups(desc)
|
source_actions = _semantic_action_groups(desc)
|
||||||
missing_actions = _missing_action_groups(beat_actions, source_actions)
|
missing_actions = _missing_action_groups(beat_actions, source_actions)
|
||||||
if missing_actions:
|
if missing_actions:
|
||||||
continue
|
return
|
||||||
threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
|
threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
|
||||||
|
if beat_actions and beat_actions <= source_actions:
|
||||||
|
threshold = min(threshold, max(0.52, cfg.vision.similarity_threshold + 0.05))
|
||||||
if score < threshold:
|
if score < threshold:
|
||||||
continue
|
return
|
||||||
phase_adjustment, phase_reason = _action_phase_adjustment(beat_desc, desc)
|
phase_adjustment, phase_reason = _action_phase_adjustment(beat_desc, desc)
|
||||||
adjusted_score = max(0.0, min(1.0, score + phase_adjustment))
|
adjusted_score = max(0.0, min(1.0, score + phase_adjustment))
|
||||||
if adjusted_score < threshold:
|
if adjusted_score < threshold:
|
||||||
continue
|
return
|
||||||
candidate = (
|
candidate = (
|
||||||
start_s,
|
start_s,
|
||||||
end_s,
|
end_s,
|
||||||
@@ -814,5 +822,87 @@ def find_action_window_in_scene(
|
|||||||
):
|
):
|
||||||
best = candidate
|
best = candidate
|
||||||
|
|
||||||
|
max_windows = max(
|
||||||
|
cfg.vision.seed_points_per_scene,
|
||||||
|
cfg.vision.max_new_descriptions_per_run,
|
||||||
|
)
|
||||||
|
ranges = _scene_window_ranges(scene, beat, max_windows)
|
||||||
|
cached_desc_by_range: dict[tuple[float, float], str] = {}
|
||||||
|
cached_items = cache.get("items", {})
|
||||||
|
if isinstance(cached_items, dict):
|
||||||
|
for item in cached_items.values():
|
||||||
|
if not isinstance(item, dict) or item.get("kind") != "action_window":
|
||||||
|
continue
|
||||||
|
if item.get("item_id") != scene.scene_id:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
start_s = float(item.get("start_s"))
|
||||||
|
end_s = float(item.get("end_s"))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
continue
|
||||||
|
if scene.start_s <= start_s < scene.end_s and end_s > start_s:
|
||||||
|
key = (round(start_s, 3), round(min(scene.end_s, end_s), 3))
|
||||||
|
ranges.append(key)
|
||||||
|
description = item.get("description", "")
|
||||||
|
if isinstance(description, str) and description.strip():
|
||||||
|
cached_desc_by_range[key] = description
|
||||||
|
consider_candidate(key[0], key[1], description)
|
||||||
|
ranges = sorted({(round(start_s, 3), round(end_s, 3)) for start_s, end_s in ranges})
|
||||||
|
|
||||||
|
for start_s, end_s in ranges:
|
||||||
|
desc = cached_desc_by_range.get((round(start_s, 3), round(end_s, 3)))
|
||||||
|
if desc is None:
|
||||||
|
desc = _describe_sample(
|
||||||
|
kind="action_window",
|
||||||
|
item_id=scene.scene_id,
|
||||||
|
label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
|
||||||
|
video_path=scene.source_path,
|
||||||
|
start_s=start_s,
|
||||||
|
end_s=end_s,
|
||||||
|
cfg=cfg,
|
||||||
|
cache=cache,
|
||||||
|
budget=budget,
|
||||||
|
)
|
||||||
|
if not desc:
|
||||||
|
continue
|
||||||
|
consider_candidate(start_s, end_s, desc)
|
||||||
|
|
||||||
_save_cache(cfg, cache)
|
_save_cache(cfg, cache)
|
||||||
|
if best is None and isinstance(cached_items, dict):
|
||||||
|
for item in cached_items.values():
|
||||||
|
if not isinstance(item, dict) or item.get("kind") != "action_window":
|
||||||
|
continue
|
||||||
|
if item.get("item_id") != scene.scene_id:
|
||||||
|
continue
|
||||||
|
desc = item.get("description", "")
|
||||||
|
if not isinstance(desc, str) or not desc.strip():
|
||||||
|
continue
|
||||||
|
beat_text = beat_desc.lower()
|
||||||
|
source_text = desc.lower()
|
||||||
|
positive_source_text = source_text.split('"negatives"', 1)[0]
|
||||||
|
if "mouth" in beat_text and "mouth" not in positive_source_text:
|
||||||
|
continue
|
||||||
|
if "dark interior" in beat_text and (
|
||||||
|
"interior" not in positive_source_text or "dark" not in positive_source_text
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
if "blonde" in beat_text and "blonde" not in positive_source_text:
|
||||||
|
continue
|
||||||
|
source_actions = _semantic_action_groups(desc)
|
||||||
|
if not beat_actions or not beat_actions <= source_actions:
|
||||||
|
continue
|
||||||
|
score, reason = _semantic_match_score(beat_desc, desc)
|
||||||
|
if score < max(0.38, cfg.vision.similarity_threshold + 0.05):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
start_s = float(item.get("start_s"))
|
||||||
|
end_s = float(item.get("end_s"))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
continue
|
||||||
|
return (
|
||||||
|
start_s,
|
||||||
|
min(scene.end_s, end_s),
|
||||||
|
min(0.99, score),
|
||||||
|
f"{reason} phase=cached_action_window raw={score:.3f}",
|
||||||
|
)
|
||||||
return best
|
return best
|
||||||
|
|||||||
Reference in New Issue
Block a user