Improve vision matching for dissolve-heavy beats
This commit is contained in:
+136
-6
@@ -261,6 +261,107 @@ def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
|
||||
return [scene.start_s + step * idx for idx in range(max_points)]
|
||||
|
||||
|
||||
def _scene_window_ranges(scene: Scene, beat: TrailerBeat, max_windows: int) -> list[tuple[float, float]]:
    """Return ``(start_s, end_s)`` sub-ranges of ``scene`` sized for ``beat``.

    Each window is as long as the beat, but at least one second and never
    longer than the scene itself. Every window end is clipped to
    ``scene.end_s``.

    * No windows are produced when ``max_windows`` or the scene duration is
      not positive.
    * A scene that is at most 0.2s longer than one window yields a single
      window anchored at the scene start.
    * With ``max_windows == 1`` the single window start is placed midway
      between the earliest and the latest possible start.
    * Otherwise ``max_windows`` starts are spread evenly from the scene start
      to the latest start that still fits a whole window.
    """
    scene_len = scene.duration_s
    if max_windows <= 0 or scene_len <= 0:
        return []

    span = min(scene_len, max(1.0, beat.duration_s))
    first_start = scene.start_s
    scene_end = scene.end_s

    def clipped(begin: float) -> tuple[float, float]:
        # Never let a window run past the end of the scene.
        return begin, min(scene_end, begin + span)

    if scene_len <= span + 0.2:
        # Scene is barely longer than one window: sliding would add nothing.
        return [clipped(first_start)]

    latest_start = max(first_start, scene_end - span)
    slack = latest_start - first_start
    if max_windows == 1:
        return [clipped(first_start + slack * 0.5)]

    stride = slack / max(1, max_windows - 1)
    return [clipped(first_start + stride * n) for n in range(max_windows)]
|
||||
|
||||
|
||||
def _cached_scene_descriptions(
    cache: dict,
    scenes_by_id: dict[int, Scene],
    cfg: AppConfig,
) -> list[tuple[Scene, str]]:
    """Collect ``(scene, description)`` pairs already present in ``cache``.

    Only entries under ``cache["items"]`` are considered, and only when all of
    the following hold:

    * the entry key contains ``":<provider>:<model>:"`` for the vision
      provider and model currently configured in ``cfg.vision``;
    * the entry is a dict whose ``"kind"`` is ``"scene"``;
    * its ``"item_id"`` is an int that maps to a scene in ``scenes_by_id``;
    * its ``"description"`` is a string that is not blank.

    Pairs are returned in the iteration order of the cached items. An empty
    list is returned when ``cache["items"]`` is missing or is not a dict.
    """
    entries = cache.get("items", {})
    if not isinstance(entries, dict):
        return []

    marker = f":{cfg.vision.provider}:{cfg.vision.model}:"
    found: list[tuple[Scene, str]] = []
    for cache_key, entry in entries.items():
        # Skip descriptions produced by a different provider/model.
        if marker not in str(cache_key):
            continue
        if not (isinstance(entry, dict) and entry.get("kind") == "scene"):
            continue
        ident = entry.get("item_id")
        if not isinstance(ident, int):
            continue
        matched = scenes_by_id.get(ident)
        if matched is None:
            continue
        text = entry.get("description", "")
        if isinstance(text, str) and text.strip():
            found.append((matched, text))
    return found
|
||||
|
||||
|
||||
def _add_window_seed_descriptions(
    *,
    beat: TrailerBeat,
    beat_desc: str,
    ranked: list[tuple[float, Scene, str]],
    cfg: AppConfig,
    cache: dict,
    budget: list[int],
    ranked_by_scene: dict[int, tuple[float, Scene, str]],
) -> list[tuple[float, float]]:
    """Describe sub-windows of the top-ranked long scenes and turn matches into seeds.

    The first ``max(1, cfg.vision.max_seed_scenes)`` entries of ``ranked`` are
    probed. Scenes no longer than ``max(beat.duration_s * 1.6, 6.0)`` seconds
    are skipped. Each remaining scene is split into windows by
    ``_scene_window_ranges``; every window is described via
    ``_describe_sample`` and compared with ``beat_desc`` using
    ``_text_similarity``.

    A window whose similarity reaches ``cfg.vision.similarity_threshold``:

    * updates ``ranked_by_scene`` in place with a ``"window"`` entry when its
      boosted score (similarity + 0.30, capped at 0.99) beats the stored one;
    * contributes a ``(window_start_s, weighted_score)`` seed to the result.

    Work stops as soon as ``budget[0]`` is no longer positive. ``budget`` is
    forwarded to ``_describe_sample``, which presumably consumes it -- confirm
    against that helper.

    Returns:
        The seed points gathered from matching windows, in probe order.
    """
    seeds: list[tuple[float, float]] = []
    if budget[0] <= 0:
        return seeds

    probe_count = max(1, cfg.vision.max_seed_scenes)
    per_scene = max(1, min(6, cfg.vision.seed_points_per_scene // 2))

    for _rank_score, candidate, _origin in ranked[:probe_count]:
        if budget[0] <= 0:
            break
        # Short scenes are skipped here; only long ones are probed window by window.
        if candidate.duration_s <= max(beat.duration_s * 1.6, 6.0):
            continue

        for win_start, win_end in _scene_window_ranges(candidate, beat, per_scene):
            if budget[0] <= 0:
                break

            window_text = _describe_sample(
                kind="scene_window",
                item_id=candidate.scene_id,
                label=f"source scene {candidate.scene_id} window {win_start:.2f}-{win_end:.2f}",
                video_path=candidate.source_path,
                start_s=win_start,
                end_s=win_end,
                cfg=cfg,
                cache=cache,
                budget=budget,
            )
            if not window_text:
                continue

            similarity = _text_similarity(beat_desc, window_text)
            if similarity < cfg.vision.similarity_threshold:
                continue

            boosted = min(0.99, similarity + 0.30)
            previous = ranked_by_scene.get(candidate.scene_id)
            if previous is None or boosted > previous[0]:
                ranked_by_scene[candidate.scene_id] = (boosted, candidate, "window")

            logger.info(
                "Beat %d: vision window seed scene=%d start=%.3fs score=%.3f",
                beat.beat_id,
                candidate.scene_id,
                win_start,
                boosted,
            )

            # Seed weight never drops below the coarse candidate threshold
            # and never exceeds 0.99.
            floor = cfg.cv.deep_scan.coarse_candidate_threshold
            scaled = cfg.vision.seed_score * (0.78 + min(1.0, boosted) * 0.22)
            seeds.append((win_start, max(floor, min(0.99, scaled))))

    return seeds
|
||||
|
||||
|
||||
def build_vision_seed_in_points(
|
||||
beats: Sequence[TrailerBeat],
|
||||
scenes: Sequence[Scene],
|
||||
@@ -308,7 +409,7 @@ def build_vision_seed_in_points(
|
||||
phash_max_distance=64,
|
||||
)
|
||||
|
||||
ranked: list[tuple[float, Scene]] = []
|
||||
ranked_by_scene: dict[int, tuple[float, Scene, str]] = {}
|
||||
for hit in hits:
|
||||
scene = scenes_by_id.get(hit.scene_id)
|
||||
if scene is None:
|
||||
@@ -328,16 +429,45 @@ def build_vision_seed_in_points(
|
||||
continue
|
||||
score = _text_similarity(beat_desc, scene_desc)
|
||||
if score >= cfg.vision.similarity_threshold:
|
||||
ranked.append((score, scene))
|
||||
ranked_by_scene[scene.scene_id] = (min(0.99, score + 0.25), scene, "vision")
|
||||
|
||||
ranked.sort(key=lambda item: item[0], reverse=True)
|
||||
points: list[tuple[float, float]] = []
|
||||
for score, scene in ranked[:cfg.vision.max_seed_scenes]:
|
||||
# Keep the strongest low-level visual candidates as seeds as well.
|
||||
# Text descriptions can miss timing-specific repeats inside one
|
||||
# scene; the deep scan still has to validate every seed frame.
|
||||
vibe_score = max(0.0, min(1.0, float(hit.combined_score)))
|
||||
existing = ranked_by_scene.get(scene.scene_id)
|
||||
if existing is None or vibe_score > existing[0]:
|
||||
ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe")
|
||||
|
||||
for scene, scene_desc in _cached_scene_descriptions(cache, scenes_by_id, cfg):
|
||||
score = _text_similarity(beat_desc, scene_desc)
|
||||
if score < cfg.vision.similarity_threshold:
|
||||
continue
|
||||
semantic_score = min(0.99, score + 0.25)
|
||||
existing = ranked_by_scene.get(scene.scene_id)
|
||||
if existing is None or semantic_score > existing[0]:
|
||||
ranked_by_scene[scene.scene_id] = (semantic_score, scene, "cache")
|
||||
|
||||
ranked = sorted(ranked_by_scene.values(), key=lambda item: item[0], reverse=True)
|
||||
window_points = _add_window_seed_descriptions(
|
||||
beat=beat,
|
||||
beat_desc=beat_desc,
|
||||
ranked=ranked,
|
||||
cfg=cfg,
|
||||
cache=cache,
|
||||
budget=budget,
|
||||
ranked_by_scene=ranked_by_scene,
|
||||
)
|
||||
ranked = sorted(ranked_by_scene.values(), key=lambda item: item[0], reverse=True)
|
||||
seed_limit = min(len(ranked), max(cfg.vision.max_seed_scenes, cfg.vision.max_seed_scenes * 2))
|
||||
points: list[tuple[float, float]] = [*window_points]
|
||||
for score, scene, source in ranked[:seed_limit]:
|
||||
logger.info(
|
||||
"Beat %d: vision seed scene=%d score=%.3f",
|
||||
"Beat %d: vision seed scene=%d score=%.3f source=%s",
|
||||
beat.beat_id,
|
||||
scene.scene_id,
|
||||
score,
|
||||
source,
|
||||
)
|
||||
weighted_score = max(
|
||||
cfg.cv.deep_scan.coarse_candidate_threshold,
|
||||
|
||||
Reference in New Issue
Block a user