Improve segmented vision matching quality
This commit is contained in:
@@ -232,7 +232,12 @@ Bei aktivierter Vision wird für gezielte Match-Läufe trotzdem zuerst ein
|
||||
schneller seed-basierter CV-Prepass ausgeführt. Er überspringt den vollen
|
||||
FFmpeg-Stream nur vorläufig und akzeptiert einen Treffer erst nach derselben
|
||||
Bild-/Phasenvalidierung wie der normale Matcher. Nur nicht gelöste Beats fallen
|
||||
danach auf den vollständigen Scan zurück.
|
||||
danach auf den vollständigen Scan zurück. Die Qualitätsparameter für lokale
|
||||
Vision-Szenenscans und Refine-Kandidaten bleiben dabei erhalten; der Prepass ist
|
||||
eine Reihenfolge-Optimierung, kein Qualitätsdeckel.
|
||||
OpenRouter-/Vision-Rate-Limits werden mit progressiv längeren Pausen erneut
|
||||
versucht. Billing-, Credit- oder Token-Guthaben-Fehler werden dagegen sofort als
|
||||
echter Blocker gemeldet, weil Warten dort nicht hilft.
|
||||
Lange Trailerbeats werden nicht mehr automatisch über ihre gesamte Beat-Länge
|
||||
gegen einen einzigen Source-Clip validiert. Sobald nach einem sichtbaren
|
||||
Source-Abschnitt eine anhaltende Schwarzblende oder Titel-/Credit-Insel beginnt,
|
||||
@@ -246,6 +251,12 @@ Insel; der HTML-Report setzt diese Source-Segmente frame-lockend zusammen und
|
||||
füllt nur echte Zwischenlücken mit Schwarz. Dadurch können per Blende verbundene
|
||||
Trailer-Einstellungen innerhalb eines Beats getrennt gematcht werden, ohne die
|
||||
globale Scene Detection aggressiver oder beat-spezifisch zu kuratieren.
|
||||
Beats mit mehreren sichtbaren Inseln werden direkt segmentiert gesucht, statt
|
||||
zuerst als ein künstlich zusammenhängender Source-Clip über den ganzen Film zu
|
||||
laufen. Jede Insel nutzt dieselbe gestufte Vision-/CV-Validierung wie ein
|
||||
normaler Beat; der zusammengesetzte Report bleibt beat-synchron. Wenn der
|
||||
schnelle validierte Vision-Prepass für eine Insel keinen Treffer liefert, darf
|
||||
diese Insel weiterhin in den vollständigen Scan fallen.
|
||||
Falls ein kompletter Beat keinen belastbaren Einzelclip ergibt, versucht der
|
||||
Matcher dieselbe Segmentlogik automatisch als Fallback: sichtbare Inseln werden
|
||||
einzeln global gesucht und anschließend wieder zu einem Beat-Ergebnis
|
||||
|
||||
@@ -623,6 +623,47 @@ def _attach_visual_segments(results: list, beats: list, cfg) -> list:
|
||||
return expanded
|
||||
|
||||
|
||||
def _fast_vision_match_cfg(cfg):
|
||||
"""Return a vision-seed prepass config that still keeps quality settings."""
|
||||
from dataclasses import replace
|
||||
|
||||
return replace(
|
||||
cfg,
|
||||
cv=replace(
|
||||
cfg.cv,
|
||||
deep_scan=replace(cfg.cv.deep_scan, skip_coarse_scan_with_weighted_seeds=True),
|
||||
),
|
||||
vision=replace(
|
||||
cfg.vision,
|
||||
fullscan_fallback=False,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def _run_segment_match(segment_beat, continuity, cfg, allow_fullscan: bool = True):
    """Match a single visual island, trying the vision prepass first.

    When ``cfg.vision.enabled`` is truthy, ``run_matching`` is called with the
    prepass config from ``_fast_vision_match_cfg``; a non-empty result is
    returned right away. If that yields nothing (or vision is disabled),
    ``run_matching`` is called with the unmodified ``cfg`` -- unless
    ``allow_fullscan`` is false, in which case an empty list is returned.

    ``continuity`` is forwarded unchanged as ``seed_in_points`` to every
    ``run_matching`` call.
    """
    from src.pipeline.matcher import run_matching

    if cfg.vision.enabled:
        prepass_hits = run_matching(
            _fast_vision_match_cfg(cfg),
            [segment_beat],
            seed_in_points=continuity,
        )
        if prepass_hits:
            return prepass_hits

    if allow_fullscan:
        return run_matching(cfg, [segment_beat], seed_in_points=continuity)

    return []
|
||||
|
||||
|
||||
def _match_unmatched_visual_segments(
|
||||
results: list,
|
||||
beats: list,
|
||||
@@ -634,7 +675,6 @@ def _match_unmatched_visual_segments(
|
||||
from dataclasses import replace
|
||||
from src.core.models import MatchResult, MatchSegment
|
||||
from src.cv.frame_extractor import get_video_info
|
||||
from src.cv.global_scan import run_global_scan
|
||||
|
||||
matched_ids = {r.beat_id for r in results}
|
||||
expanded = list(results)
|
||||
@@ -667,11 +707,7 @@ def _match_unmatched_visual_segments(
|
||||
)
|
||||
segment_matches = []
|
||||
if beat.beat_id not in skip_global_segment_scan_for:
|
||||
segment_matches = run_global_scan(
|
||||
[segment_beat],
|
||||
cfg,
|
||||
seed_in_points=continuity,
|
||||
)
|
||||
segment_matches = _run_segment_match(segment_beat, continuity, cfg, allow_fullscan=True)
|
||||
if not segment_matches:
|
||||
local_segment = _local_same_scene_segment_match(
|
||||
segment_beat,
|
||||
@@ -799,7 +835,13 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
|
||||
all_beats = _load_beats(cfg)
|
||||
beats = _select_beats(all_beats, getattr(args, "beat", None))
|
||||
cached = _normalize_cached_results(all_beats, _load_results(cfg), cfg) if _results_cache_path(cfg).exists() else []
|
||||
multi_island_beat_ids = {
|
||||
beat.beat_id
|
||||
for beat in beats
|
||||
if len(_reference_scoreable_segments(beat, cfg)) > 1
|
||||
}
|
||||
scan_beats, single_island_trims = _trim_beats_to_single_visual_island(beats, cfg)
|
||||
scan_beats = [b for b in scan_beats if b.beat_id not in multi_island_beat_ids]
|
||||
seed_in_points = (
|
||||
_continuity_seed_in_points(args.beat, all_beats, cached, cfg)
|
||||
if getattr(args, "beat", None) is not None
|
||||
@@ -807,14 +849,7 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
|
||||
)
|
||||
results = []
|
||||
if cfg.vision.enabled:
|
||||
fast_cfg = replace(
|
||||
cfg,
|
||||
cv=replace(
|
||||
cfg.cv,
|
||||
deep_scan=replace(cfg.cv.deep_scan, skip_coarse_scan_with_weighted_seeds=True),
|
||||
),
|
||||
vision=replace(cfg.vision, fullscan_fallback=False),
|
||||
)
|
||||
fast_cfg = _fast_vision_match_cfg(cfg)
|
||||
results = run_matching(
|
||||
fast_cfg,
|
||||
scan_beats,
|
||||
|
||||
@@ -13,6 +13,7 @@ import base64
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from dataclasses import asdict
|
||||
@@ -38,6 +39,17 @@ Return only compact JSON with these keys:
|
||||
subject, setting, composition, action_phase, distinctive_objects, lighting_color, negatives.
|
||||
Focus on stable visual facts and spatial layout. Ignore timecode overlays, subtitles, logos, compression, aspect ratio, and color grading differences."""
|
||||
|
||||
_RETRYABLE_HTTP_CODES = {408, 409, 425, 429, 500, 502, 503, 504}
|
||||
_CREDIT_ERROR_PATTERNS = (
|
||||
"insufficient credit",
|
||||
"insufficient credits",
|
||||
"no credits",
|
||||
"out of credits",
|
||||
"billing",
|
||||
"quota exceeded",
|
||||
"payment required",
|
||||
)
|
||||
|
||||
|
||||
def _cache_path(cfg: AppConfig) -> Path:
    """Return the path ``vision_descriptions.json`` inside ``cfg.paths.cache_dir``."""
    cache_dir = cfg.paths.cache_dir
    return cache_dir / "vision_descriptions.json"
|
||||
@@ -133,13 +145,44 @@ def _call_vision_model(label: str, image_urls: list[str], cfg: AppConfig) -> str
|
||||
|
||||
url = f"{vision.base_url.rstrip('/')}/chat/completions"
|
||||
req = urllib.request.Request(url, data=body, headers=headers, method="POST")
|
||||
delays_s = (8.0, 20.0, 45.0, 90.0)
|
||||
for attempt in range(len(delays_s) + 1):
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=vision.timeout_seconds) as resp:
|
||||
data = json.loads(resp.read().decode("utf-8"))
|
||||
return str(data["choices"][0]["message"]["content"]).strip()
|
||||
except urllib.error.HTTPError as exc:
|
||||
body_text = exc.read().decode(errors="replace")
|
||||
lowered = body_text.lower()
|
||||
if exc.code == 402 or any(pattern in lowered for pattern in _CREDIT_ERROR_PATTERNS):
|
||||
raise RuntimeError(f"Vision HTTP {exc.code} from {url}:\n{body_text}") from exc
|
||||
if exc.code not in _RETRYABLE_HTTP_CODES or attempt >= len(delays_s):
|
||||
raise RuntimeError(f"Vision HTTP {exc.code} from {url}:\n{body_text}") from exc
|
||||
delay_s = delays_s[attempt]
|
||||
logger.warning(
|
||||
"Vision HTTP %d for %s; waiting %.0fs before retry %d/%d.",
|
||||
exc.code,
|
||||
label,
|
||||
delay_s,
|
||||
attempt + 1,
|
||||
len(delays_s),
|
||||
)
|
||||
time.sleep(delay_s)
|
||||
except urllib.error.URLError as exc:
|
||||
if attempt >= len(delays_s):
|
||||
raise RuntimeError(f"Vision request failed for {url}: {exc}") from exc
|
||||
delay_s = delays_s[attempt]
|
||||
logger.warning(
|
||||
"Vision request failed for %s (%s); waiting %.0fs before retry %d/%d.",
|
||||
label,
|
||||
exc.reason,
|
||||
delay_s,
|
||||
attempt + 1,
|
||||
len(delays_s),
|
||||
)
|
||||
time.sleep(delay_s)
|
||||
|
||||
raise RuntimeError(f"Vision request failed unexpectedly for {url}")
|
||||
|
||||
|
||||
def _description_key(kind: str, item_id: int, start_s: float, end_s: float, cfg: AppConfig) -> str:
|
||||
|
||||
@@ -175,6 +175,9 @@ def run_matching(
|
||||
logger.info("Trailer: %s", cfg.paths.reference_trailer.name)
|
||||
logger.info("Beats : %d", len(beats))
|
||||
logger.info("=" * 60)
|
||||
if not beats:
|
||||
logger.info("No beats requested; skipping scene and global scans.")
|
||||
return []
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Phase 0: Scene index
|
||||
|
||||
Reference in New Issue
Block a user