Filter cached vision action windows
This commit is contained in:
+118
-28
@@ -434,12 +434,20 @@ def _scene_window_ranges(scene: Scene, beat: TrailerBeat, max_windows: int) -> l
|
||||
|
||||
usable_start = scene.start_s
|
||||
usable_end = max(scene.start_s, scene.end_s - window_s)
|
||||
if max_windows == 1:
|
||||
starts = [usable_start + (usable_end - usable_start) * 0.5]
|
||||
else:
|
||||
step = (usable_end - usable_start) / max(1, max_windows - 1)
|
||||
starts = [usable_start + step * idx for idx in range(max_windows)]
|
||||
return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in starts]
|
||||
starts = [usable_start]
|
||||
early_step = max(0.5, window_s * 0.75)
|
||||
for idx in range(1, min(max_windows, 4)):
|
||||
starts.append(min(usable_end, usable_start + early_step * idx))
|
||||
remaining = max_windows - len(starts)
|
||||
if remaining > 0:
|
||||
if remaining == 1:
|
||||
starts.append(usable_start + (usable_end - usable_start) * 0.5)
|
||||
else:
|
||||
step = (usable_end - usable_start) / max(1, remaining - 1)
|
||||
starts.extend(usable_start + step * idx for idx in range(remaining))
|
||||
|
||||
deduped = sorted({round(max(usable_start, min(usable_end, s)), 3) for s in starts})
|
||||
return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in deduped[:max_windows]]
|
||||
|
||||
|
||||
def _cached_scene_descriptions(
|
||||
@@ -749,11 +757,11 @@ def find_action_window_in_scene(
|
||||
inside that scene. It stays automatic and cached: windows are described
|
||||
evenly across the scene until the per-run vision budget is consumed.
|
||||
"""
|
||||
if not cfg.vision.enabled or scene.duration_s <= 0:
|
||||
if scene.duration_s <= 0:
|
||||
return None
|
||||
|
||||
cache = _load_cache(cfg)
|
||||
budget = [max(0, cfg.vision.max_new_descriptions_per_run)]
|
||||
budget = [max(0, cfg.vision.max_new_descriptions_per_run) if cfg.vision.enabled else 0]
|
||||
beat_desc = _describe_sample(
|
||||
kind="beat",
|
||||
item_id=beat.beat_id,
|
||||
@@ -772,37 +780,37 @@ def find_action_window_in_scene(
|
||||
if not beat_actions:
|
||||
return None
|
||||
|
||||
max_windows = max(
|
||||
cfg.vision.seed_points_per_scene,
|
||||
cfg.vision.max_new_descriptions_per_run,
|
||||
)
|
||||
best: tuple[float, float, float, str] | None = None
|
||||
for start_s, end_s in _scene_window_ranges(scene, beat, max_windows):
|
||||
desc = _describe_sample(
|
||||
kind="action_window",
|
||||
item_id=scene.scene_id,
|
||||
label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
|
||||
video_path=scene.source_path,
|
||||
start_s=start_s,
|
||||
end_s=end_s,
|
||||
cfg=cfg,
|
||||
cache=cache,
|
||||
budget=budget,
|
||||
)
|
||||
|
||||
def consider_candidate(start_s: float, end_s: float, desc: str) -> None:
|
||||
nonlocal best
|
||||
if not desc:
|
||||
continue
|
||||
return
|
||||
beat_text = beat_desc.lower()
|
||||
source_text = desc.lower()
|
||||
positive_source_text = source_text.split('"negatives"', 1)[0]
|
||||
if "mouth" in beat_text and "mouth" not in positive_source_text:
|
||||
return
|
||||
if "dark interior" in beat_text and (
|
||||
"interior" not in positive_source_text or "dark" not in positive_source_text
|
||||
):
|
||||
return
|
||||
if "blonde" in beat_text and "blonde" not in positive_source_text:
|
||||
return
|
||||
score, reason = _semantic_match_score(beat_desc, desc)
|
||||
source_actions = _semantic_action_groups(desc)
|
||||
missing_actions = _missing_action_groups(beat_actions, source_actions)
|
||||
if missing_actions:
|
||||
continue
|
||||
return
|
||||
threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
|
||||
if beat_actions and beat_actions <= source_actions:
|
||||
threshold = min(threshold, max(0.52, cfg.vision.similarity_threshold + 0.05))
|
||||
if score < threshold:
|
||||
continue
|
||||
return
|
||||
phase_adjustment, phase_reason = _action_phase_adjustment(beat_desc, desc)
|
||||
adjusted_score = max(0.0, min(1.0, score + phase_adjustment))
|
||||
if adjusted_score < threshold:
|
||||
continue
|
||||
return
|
||||
candidate = (
|
||||
start_s,
|
||||
end_s,
|
||||
@@ -814,5 +822,87 @@ def find_action_window_in_scene(
|
||||
):
|
||||
best = candidate
|
||||
|
||||
max_windows = max(
|
||||
cfg.vision.seed_points_per_scene,
|
||||
cfg.vision.max_new_descriptions_per_run,
|
||||
)
|
||||
ranges = _scene_window_ranges(scene, beat, max_windows)
|
||||
cached_desc_by_range: dict[tuple[float, float], str] = {}
|
||||
cached_items = cache.get("items", {})
|
||||
if isinstance(cached_items, dict):
|
||||
for item in cached_items.values():
|
||||
if not isinstance(item, dict) or item.get("kind") != "action_window":
|
||||
continue
|
||||
if item.get("item_id") != scene.scene_id:
|
||||
continue
|
||||
try:
|
||||
start_s = float(item.get("start_s"))
|
||||
end_s = float(item.get("end_s"))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if scene.start_s <= start_s < scene.end_s and end_s > start_s:
|
||||
key = (round(start_s, 3), round(min(scene.end_s, end_s), 3))
|
||||
ranges.append(key)
|
||||
description = item.get("description", "")
|
||||
if isinstance(description, str) and description.strip():
|
||||
cached_desc_by_range[key] = description
|
||||
consider_candidate(key[0], key[1], description)
|
||||
ranges = sorted({(round(start_s, 3), round(end_s, 3)) for start_s, end_s in ranges})
|
||||
|
||||
for start_s, end_s in ranges:
|
||||
desc = cached_desc_by_range.get((round(start_s, 3), round(end_s, 3)))
|
||||
if desc is None:
|
||||
desc = _describe_sample(
|
||||
kind="action_window",
|
||||
item_id=scene.scene_id,
|
||||
label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
|
||||
video_path=scene.source_path,
|
||||
start_s=start_s,
|
||||
end_s=end_s,
|
||||
cfg=cfg,
|
||||
cache=cache,
|
||||
budget=budget,
|
||||
)
|
||||
if not desc:
|
||||
continue
|
||||
consider_candidate(start_s, end_s, desc)
|
||||
|
||||
_save_cache(cfg, cache)
|
||||
if best is None and isinstance(cached_items, dict):
|
||||
for item in cached_items.values():
|
||||
if not isinstance(item, dict) or item.get("kind") != "action_window":
|
||||
continue
|
||||
if item.get("item_id") != scene.scene_id:
|
||||
continue
|
||||
desc = item.get("description", "")
|
||||
if not isinstance(desc, str) or not desc.strip():
|
||||
continue
|
||||
beat_text = beat_desc.lower()
|
||||
source_text = desc.lower()
|
||||
positive_source_text = source_text.split('"negatives"', 1)[0]
|
||||
if "mouth" in beat_text and "mouth" not in positive_source_text:
|
||||
continue
|
||||
if "dark interior" in beat_text and (
|
||||
"interior" not in positive_source_text or "dark" not in positive_source_text
|
||||
):
|
||||
continue
|
||||
if "blonde" in beat_text and "blonde" not in positive_source_text:
|
||||
continue
|
||||
source_actions = _semantic_action_groups(desc)
|
||||
if not beat_actions or not beat_actions <= source_actions:
|
||||
continue
|
||||
score, reason = _semantic_match_score(beat_desc, desc)
|
||||
if score < max(0.38, cfg.vision.similarity_threshold + 0.05):
|
||||
continue
|
||||
try:
|
||||
start_s = float(item.get("start_s"))
|
||||
end_s = float(item.get("end_s"))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
return (
|
||||
start_s,
|
||||
min(scene.end_s, end_s),
|
||||
min(0.99, score),
|
||||
f"{reason} phase=cached_action_window raw={score:.3f}",
|
||||
)
|
||||
return best
|
||||
|
||||
Reference in New Issue
Block a user