Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 730b5ef3c0 | |||
| f20f89b06b |
+1
-1
File diff suppressed because one or more lines are too long
+5
-5
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
File diff suppressed because one or more lines are too long
+355
-16
@@ -198,6 +198,158 @@ def _fixed_content_features(frame: np.ndarray, cfg: AppConfig) -> tuple[np.ndarr
|
||||
)
|
||||
|
||||
|
||||
def _hires_phase_feature(frame: np.ndarray) -> np.ndarray:
    """Build the high-resolution luma image used for intra-scene phase matching.

    The frame is passed through ``_trim_dark_borders``, the top and bottom 5%
    of the remaining rows are discarded, and the result is converted from BGR
    to grayscale, histogram-equalised and resized to 320x160 (width x height)
    using area interpolation.

    Rationale, per the original author's note: the standard 160x80 pipeline
    features lose the subtle pixel differences between talking-head phases
    (mouth open vs. closed); this larger feature is meant to retain them.

    Args:
        frame: Input image. It is treated as BGR because it is converted with
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The equalised grayscale image returned by ``cv2.resize`` for a target
        size of ``(320, 160)``.
    """
    borderless = _trim_dark_borders(frame)
    # Only the height is used below; the width is deliberately discarded.
    height, _ = borderless.shape[:2]
    top = int(height * 0.05)
    bottom = int(height * 0.95)
    luma = cv2.cvtColor(borderless[top:bottom, :], cv2.COLOR_BGR2GRAY)
    return cv2.resize(cv2.equalizeHist(luma), (320, 160), interpolation=cv2.INTER_AREA)
|
||||
|
||||
|
||||
def _hires_spatial_hist(frame_feature: np.ndarray) -> np.ndarray:
    """Concatenate per-cell intensity histograms over an 8x8 grid.

    The feature image is divided into 8 rows by 8 columns of equally sized
    cells (cell size is the integer quotient of each dimension by 8, so any
    remainder rows/columns at the bottom/right edge are not covered). Each
    cell contributes a 24-bin histogram over the value range [0, 256),
    normalised by its own sum plus a small epsilon.

    Args:
        frame_feature: Single-channel feature image, e.g. the output of
            ``_hires_phase_feature``.

    Returns:
        A 1-D float array holding the 64 normalised histograms in row-major
        cell order (8 * 8 * 24 values).
    """
    cells_per_side = 8
    rows, cols = frame_feature.shape[:2]
    tile_h = rows // cells_per_side
    tile_w = cols // cells_per_side

    def _tile_hist(row_idx: int, col_idx: int) -> np.ndarray:
        # Normalised 24-bin histogram of one grid cell.
        tile = frame_feature[
            row_idx * tile_h:(row_idx + 1) * tile_h,
            col_idx * tile_w:(col_idx + 1) * tile_w,
        ]
        counts = cv2.calcHist([tile], [0], None, [24], [0, 256]).astype(np.float32).flatten()
        # The epsilon keeps an all-zero histogram from dividing by zero.
        return counts / (float(np.sum(counts)) + 1e-6)

    return np.concatenate([
        _tile_hist(row_idx, col_idx)
        for row_idx in range(cells_per_side)
        for col_idx in range(cells_per_side)
    ])
|
||||
|
||||
|
||||
def _hires_phase_score(
    ref_feature: np.ndarray,
    ref_spatial: np.ndarray,
    src_frame: np.ndarray,
) -> float:
    """Score how well a source frame matches a hi-res reference template.

    The source frame is first converted with ``_hires_phase_feature``; three
    signals are then blended:

    1. Full-frame normalised cross-correlation (weight 0.25).
    2. Normalised cross-correlation over a centred window spanning the middle
       40% of the height and of the width (weight 0.45). The original author
       describes this as the face region that matters for talking heads.
    3. ``_hist_intersection`` between the reference and source spatial
       histograms (weight 0.30).

    Args:
        ref_feature: Reference feature image from ``_hires_phase_feature``.
        ref_spatial: Spatial histogram of ``ref_feature`` from
            ``_hires_spatial_hist``.
        src_frame: Raw source frame to compare against the reference.

    Returns:
        The weighted sum of the three signals.
    """
    candidate = _hires_phase_feature(src_frame)

    def _ncc(image: np.ndarray, template: np.ndarray) -> float:
        # Element [0][0] of the TM_CCOEFF_NORMED response map.
        return float(cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)[0][0])

    whole_frame_ncc = _ncc(candidate, ref_feature)

    # Centred window: +/-20% of each dimension around the midpoint of the
    # reference feature, applied identically to both images.
    rows, cols = ref_feature.shape[:2]
    mid_y, mid_x = rows // 2, cols // 2
    half_h, half_w = int(rows * 0.20), int(cols * 0.20)
    window = (
        slice(mid_y - half_h, mid_y + half_h),
        slice(mid_x - half_w, mid_x + half_w),
    )
    center_ncc = _ncc(candidate[window], ref_feature[window])

    layout_similarity = _hist_intersection(ref_spatial, _hires_spatial_hist(candidate))
    return whole_frame_ncc * 0.25 + center_ncc * 0.45 + layout_similarity * 0.30
|
||||
|
||||
|
||||
def _hires_phase_refine(
    beat: TrailerBeat,
    in_point_s: float,
    scene_start_s: float,
    scene_end_s: float,
    cfg: AppConfig,
) -> float:
    """Re-scan a source scene at high resolution to correct the in-point phase.

    Intended as a final refinement once the standard pipeline has picked the
    scene: per the original author's note, low-resolution features cannot
    tell apart different phases of the same shot (e.g. mouth open vs. closed
    in a talking-head close-up).

    Reference templates are sampled from the trailer beat over its matchable
    duration, keeping only scoreable frames with mean luma >= 50 and
    contrast >= 40. Every scan position between ``scene_start_s`` and
    ``scene_end_s`` is then scored against all templates, and the position
    with the best ``0.7 * mean + 0.3 * min`` score wins.

    NOTE(review): the scan grid starts at ``scene_start_s``, so
    ``in_point_s`` itself is only evaluated when it happens to lie on that
    grid -- confirm this is intended.

    Args:
        beat: Trailer beat supplying the reference frames (``trailer_path``,
            ``start_s``) and the ``beat_id`` used in log messages.
        in_point_s: Current source in-point; returned unchanged when nothing
            better can be determined.
        scene_start_s: Start of the source scene to scan, in seconds.
        scene_end_s: End of the source scene to scan, in seconds.
        cfg: Application configuration (``export.edl_frame_rate``,
            ``paths.source_movie``, plus whatever the helpers read).

    Returns:
        The best-scoring scan time, or ``in_point_s`` when no reference
        template qualifies or no scan position could be fully scored.
    """
    matchable_s = estimate_matchable_reference_duration(beat, cfg, sample_step_s=0.04)

    # A single step drives both template sampling and the source scan. A
    # zero/None EDL frame rate falls back to 24 fps (previously only the scan
    # step had this guard, so template sampling could raise first), and the
    # step is never finer than 0.04 s.
    step_s = max(1.0 / (cfg.export.edl_frame_rate or 24.0), 0.04)

    # Build hi-res templates from only the stable, bright reference frames
    # before any fade begins. Fading frames have dropping brightness that
    # would penalise correct source positions where those offsets map to
    # bright content in the source.
    ref_templates: list[tuple[float, np.ndarray, np.ndarray, float]] = []
    t = 0.0
    while t <= matchable_s:
        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
        if frame is not None and _is_scoreable_reference_frame(frame, cfg):
            mean_l, _p90_l, contrast = _reference_visibility_stats(frame, cfg)
            # Only use clearly visible frames (skip dimming fade frames).
            if mean_l >= 50.0 and contrast >= 40.0:
                feat = _hires_phase_feature(frame)
                ref_templates.append((t, feat, _hires_spatial_hist(feat), mean_l))
        t = round(t + step_s, 6)

    if not ref_templates:
        return in_point_s

    # For very short matchable durations (fast fades / cross-dissolves),
    # keep only the brightest template. When the beat fades quickly the
    # later templates are dim and penalise every bright source candidate
    # equally, destroying phase discrimination. A single bright anchor
    # gives maximum selectivity. max() returns the earliest template on a
    # luma tie, matching the previous stable descending sort.
    if matchable_s < 1.0 and len(ref_templates) > 1:
        ref_templates = [max(ref_templates, key=lambda tpl: tpl[3])]
        logger.debug(
            'Beat %d: hi-res using single brightest template at offset %.3fs (luma %.1f)',
            beat.beat_id, ref_templates[0][0], ref_templates[0][3],
        )

    # The luma field is only needed for the selection above.
    scan_templates = [(off, feat, sp) for off, feat, sp, _ in ref_templates]
    max_ref_offset = max(off for off, _, _ in scan_templates)

    best_t = in_point_s
    best_score = -1.0

    with open_video(cfg.paths.source_movie) as cap:
        t = scene_start_s
        # Stop once the latest template offset would fall past the scene end.
        while t + max_ref_offset <= scene_end_s:
            scores: list[float] = []
            for off, ref_feat, ref_spatial in scan_templates:
                src_frame = grab_frame_at(cap, t + off)
                if src_frame is None:
                    # A missing frame invalidates this scan position.
                    break
                scores.append(_hires_phase_score(ref_feat, ref_spatial, src_frame))
            else:
                if scores:
                    # Blend the mean with the worst template so one badly
                    # matching offset still pulls the position down.
                    combined = (sum(scores) / len(scores)) * 0.7 + min(scores) * 0.3
                    if combined > best_score:
                        best_score = combined
                        best_t = t
            t = round(t + step_s, 6)

    if best_t != in_point_s:
        logger.info(
            'Beat %d: hi-res phase refine moved in-point %.3fs -> %.3fs '
            '(delta=%.3fs, score=%.4f)',
            beat.beat_id, in_point_s, best_t,
            best_t - in_point_s, best_score,
        )
    return best_t
|
||||
|
||||
|
||||
def _fixed_content_pair_score(
|
||||
ref_features: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
|
||||
source_frame: np.ndarray,
|
||||
@@ -388,12 +540,36 @@ def _rerank_candidates_by_content(
|
||||
reranked: list[tuple[float, float, float]] = []
|
||||
with open_video(cfg.paths.source_movie) as cap:
|
||||
for coarse_score, t_sec in candidates:
|
||||
content_score = _fixed_content_sequence_score(cap, t_sec, templates, cfg)
|
||||
# If the candidate lands just before a scene boundary, also evaluate
|
||||
# the start of the next scene. A coarse-scan offset can place the
|
||||
# in-point a few frames into the preceding (wrong) scene, causing
|
||||
# the content and coverage scores to be artificially low even though
|
||||
# the next scene is the correct visual match.
|
||||
eval_t = t_sec
|
||||
if scenes is not None:
|
||||
cur_scene = _find_scene_for_time(scenes, t_sec, cfg)
|
||||
if cur_scene is not None:
|
||||
remaining = float(cur_scene.end_s) - t_sec
|
||||
next_idx = next(
|
||||
(i + 1 for i, s in enumerate(scenes) if s.scene_id == cur_scene.scene_id),
|
||||
None,
|
||||
)
|
||||
if (
|
||||
remaining < cfg.cv.deep_scan.scene_boundary_epsilon_s * 4
|
||||
and next_idx is not None
|
||||
and next_idx < len(scenes)
|
||||
):
|
||||
next_scene_start = float(scenes[next_idx].start_s)
|
||||
alt_content = _fixed_content_sequence_score(cap, next_scene_start, templates, cfg)
|
||||
cur_content = _fixed_content_sequence_score(cap, t_sec, templates, cfg)
|
||||
if alt_content > cur_content:
|
||||
eval_t = next_scene_start
|
||||
content_score = _fixed_content_sequence_score(cap, eval_t, templates, cfg)
|
||||
coverage_score = 1.0
|
||||
if scenes is not None and matchable_duration_s and matchable_duration_s > 0:
|
||||
usable_s = _contiguous_scene_coverage_duration(
|
||||
beat,
|
||||
t_sec,
|
||||
eval_t,
|
||||
scenes,
|
||||
matchable_duration_s,
|
||||
cfg,
|
||||
@@ -404,7 +580,7 @@ def _rerank_candidates_by_content(
|
||||
+ coarse_score * 0.18
|
||||
+ coverage_score * 0.20
|
||||
)
|
||||
reranked.append((rank_score, coarse_score, t_sec))
|
||||
reranked.append((rank_score, coarse_score, eval_t))
|
||||
|
||||
return sorted(reranked, key=lambda item: item[0], reverse=True)
|
||||
|
||||
@@ -772,6 +948,8 @@ def _content_alignment_score(
|
||||
in_point_s: float,
|
||||
templates: list[tuple[float, np.ndarray]],
|
||||
cfg: AppConfig,
|
||||
fps: float | None = None,
|
||||
frame_cache: dict[int, np.ndarray] | None = None,
|
||||
) -> float:
|
||||
if not templates:
|
||||
return -1.0
|
||||
@@ -782,7 +960,13 @@ def _content_alignment_score(
|
||||
early_scores: list[float] = []
|
||||
|
||||
for offset_s, template in templates:
|
||||
frame = grab_frame_at(cap, in_point_s + offset_s)
|
||||
t0 = in_point_s + offset_s
|
||||
if frame_cache is not None and fps is not None:
|
||||
idx = int(round(t0 * fps))
|
||||
frame = frame_cache.get(idx)
|
||||
else:
|
||||
frame = grab_frame_at(cap, t0)
|
||||
|
||||
if frame is None:
|
||||
return -1.0
|
||||
|
||||
@@ -840,6 +1024,20 @@ def align_in_point_by_content(
|
||||
end_s = estimated_in_point_s + window_s
|
||||
tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta
|
||||
|
||||
min_offset = min(off for off, _ in templates)
|
||||
max_offset = max(off for off, _ in templates)
|
||||
req_start_s = max(0.0, start_s + min_offset - frame_step_s)
|
||||
req_end_s = end_s + max_offset + frame_step_s
|
||||
|
||||
frame_cache = {}
|
||||
t_req = req_start_s
|
||||
while t_req <= req_end_s:
|
||||
idx = int(round(t_req * fps))
|
||||
frame = grab_frame_at(cap, t_req)
|
||||
if frame is not None:
|
||||
frame_cache[idx] = frame
|
||||
t_req = round(t_req + frame_step_s, 6)
|
||||
|
||||
best_in = estimated_in_point_s
|
||||
best_score = -1.0
|
||||
t = start_s
|
||||
@@ -852,7 +1050,7 @@ def align_in_point_by_content(
|
||||
active_templates = []
|
||||
else:
|
||||
active_templates = templates
|
||||
score = _content_alignment_score(cap, t, active_templates, cfg) if active_templates else -1.0
|
||||
score = _content_alignment_score(cap, t, active_templates, cfg, fps=fps, frame_cache=frame_cache) if active_templates else -1.0
|
||||
if score > best_score + tie_delta:
|
||||
best_score = score
|
||||
best_in = t
|
||||
@@ -868,11 +1066,23 @@ def _motion_phase_score(
|
||||
in_point_s: float,
|
||||
motion_templates: list[tuple[float, float, np.ndarray, tuple[int, ...]]],
|
||||
cfg: AppConfig,
|
||||
fps: float | None = None,
|
||||
frame_cache: dict[int, np.ndarray] | None = None,
|
||||
) -> float:
|
||||
scores: list[float] = []
|
||||
for offset_s, step_s, ref_delta, template_shape in motion_templates:
|
||||
f0 = grab_frame_at(cap, in_point_s + offset_s)
|
||||
f1 = grab_frame_at(cap, in_point_s + offset_s + step_s)
|
||||
t0 = in_point_s + offset_s
|
||||
t1 = in_point_s + offset_s + step_s
|
||||
|
||||
if frame_cache is not None and fps is not None:
|
||||
idx0 = int(round(t0 * fps))
|
||||
idx1 = int(round(t1 * fps))
|
||||
f0 = frame_cache.get(idx0)
|
||||
f1 = frame_cache.get(idx1)
|
||||
else:
|
||||
f0 = grab_frame_at(cap, t0)
|
||||
f1 = grab_frame_at(cap, t1)
|
||||
|
||||
if f0 is None or f1 is None:
|
||||
return -1.0
|
||||
src0 = _fixed_feature(f0, template_shape, cfg)
|
||||
@@ -913,11 +1123,25 @@ def align_in_point_by_motion(
|
||||
end_s = estimated_in_point_s + window_s
|
||||
tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta
|
||||
|
||||
min_offset = min(off for off, _, _, _ in motion_templates)
|
||||
max_offset = max(off + step for off, step, _, _ in motion_templates)
|
||||
req_start_s = max(0.0, start_s + min_offset - frame_step_s)
|
||||
req_end_s = end_s + max_offset + frame_step_s
|
||||
|
||||
frame_cache = {}
|
||||
t_req = req_start_s
|
||||
while t_req <= req_end_s:
|
||||
idx = int(round(t_req * fps))
|
||||
frame = grab_frame_at(cap, t_req)
|
||||
if frame is not None:
|
||||
frame_cache[idx] = frame
|
||||
t_req = round(t_req + frame_step_s, 6)
|
||||
|
||||
best_in = estimated_in_point_s
|
||||
best_score = -1.0
|
||||
t = start_s
|
||||
while t <= end_s:
|
||||
score = _motion_phase_score(cap, t, motion_templates, cfg)
|
||||
score = _motion_phase_score(cap, t, motion_templates, cfg, fps=fps, frame_cache=frame_cache)
|
||||
if score > best_score + tie_delta:
|
||||
best_score = score
|
||||
best_in = t
|
||||
@@ -933,6 +1157,7 @@ def align_in_point_by_content_and_motion(
|
||||
estimated_in_point_s: float,
|
||||
cfg: AppConfig,
|
||||
search_window_s: float | None = None,
|
||||
scene_end_s: float | None = None,
|
||||
) -> tuple[float, float, float, float]:
|
||||
"""
|
||||
Align a candidate using still-frame content and motion phase together.
|
||||
@@ -959,23 +1184,57 @@ def align_in_point_by_content_and_motion(
|
||||
end_s = estimated_in_point_s + window_s
|
||||
tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta
|
||||
|
||||
min_t_offset = min(off for off, _ in templates) if templates else 0.0
|
||||
max_t_offset = max(off for off, _ in templates) if templates else 0.0
|
||||
min_m_offset = min(off for off, _, _, _ in motion_templates) if motion_templates else 0.0
|
||||
max_m_offset = max(off + step for off, step, _, _ in motion_templates) if motion_templates else 0.0
|
||||
|
||||
min_offset = min(min_t_offset, min_m_offset)
|
||||
max_offset = max(max_t_offset, max_m_offset)
|
||||
req_start_s = max(0.0, start_s + min_offset - frame_step_s)
|
||||
req_end_s = end_s + max_offset + frame_step_s
|
||||
|
||||
frame_cache = {}
|
||||
t_req = req_start_s
|
||||
while t_req <= req_end_s:
|
||||
idx = int(round(t_req * fps))
|
||||
frame = grab_frame_at(cap, t_req)
|
||||
if frame is not None:
|
||||
frame_cache[idx] = frame
|
||||
t_req = round(t_req + frame_step_s, 6)
|
||||
|
||||
best_in = estimated_in_point_s
|
||||
best_score = -1.0
|
||||
best_content = -1.0
|
||||
best_motion = -1.0
|
||||
t = start_s
|
||||
while t <= end_s:
|
||||
content_score = _content_alignment_score(cap, t, templates, cfg)
|
||||
if scene_end_s is not None:
|
||||
avail_s = scene_end_s - t
|
||||
if avail_s > 0:
|
||||
active_templates = [(off, tpl) for off, tpl in templates if off <= avail_s]
|
||||
active_motion = [(off, step, delta, shape) for off, step, delta, shape in motion_templates if off + step <= avail_s]
|
||||
else:
|
||||
active_templates = []
|
||||
active_motion = []
|
||||
else:
|
||||
active_templates = templates
|
||||
active_motion = motion_templates
|
||||
|
||||
content_score = _content_alignment_score(cap, t, active_templates, cfg, fps=fps, frame_cache=frame_cache) if active_templates else -1.0
|
||||
motion_score = (
|
||||
_motion_phase_score(cap, t, motion_templates, cfg)
|
||||
if len(motion_templates) >= 2
|
||||
_motion_phase_score(cap, t, active_motion, cfg, fps=fps, frame_cache=frame_cache)
|
||||
if len(active_motion) >= 2
|
||||
else content_score
|
||||
)
|
||||
if content_score < 0 or motion_score < 0:
|
||||
t = round(t + frame_step_s, 6)
|
||||
continue
|
||||
raw_score = content_score * 0.64 + motion_score * 0.36
|
||||
anchor_penalty = min(0.18, abs(t - estimated_in_point_s) * 0.05)
|
||||
# The previous anchor_penalty of 0.05 per second was stronger than the
|
||||
# actual variance in raw_score, preventing phase correction. We reduce it
|
||||
# so that it only acts as a tie-breaker.
|
||||
anchor_penalty = min(0.18, abs(t - estimated_in_point_s) * 0.005)
|
||||
score = raw_score - anchor_penalty
|
||||
if score > best_score + tie_delta:
|
||||
best_score = score
|
||||
@@ -1027,6 +1286,18 @@ def estimate_usable_source_duration(
|
||||
frame = grab_frame_at(cap, in_point_s + offset_s)
|
||||
if frame is None:
|
||||
break
|
||||
|
||||
# If the template is scoreable (has content) but the source frame is dark,
|
||||
# this is a bad match. We should not let dark source frames
|
||||
# provide high correlation to dark templates.
|
||||
# templates are already pre-processed into feature images (grayscale/edges),
|
||||
# so we can't use _is_scoreable_reference_frame on them directly.
|
||||
# Instead, we rely on the fact that _prepare_beat_templates already
|
||||
# filtered out non-scoreable frames.
|
||||
if _is_dark_reference_frame(frame, cfg):
|
||||
scores.append((offset_s, 0.0))
|
||||
continue
|
||||
|
||||
scores.append((offset_s, _match_score(frame, template, cfg)))
|
||||
|
||||
if not scores:
|
||||
@@ -1038,12 +1309,14 @@ def estimate_usable_source_duration(
|
||||
|
||||
last_good = 0.0
|
||||
bad_run = 0
|
||||
bad_run_start_offset: float | None = None
|
||||
good_scores: list[float] = []
|
||||
|
||||
for offset_s, score in scores:
|
||||
if score >= min_score:
|
||||
last_good = offset_s
|
||||
bad_run = 0
|
||||
bad_run_start_offset = None
|
||||
good_scores.append(score)
|
||||
continue
|
||||
|
||||
@@ -1051,7 +1324,34 @@ def estimate_usable_source_duration(
|
||||
continue
|
||||
|
||||
bad_run += 1
|
||||
if bad_run_start_offset is None:
|
||||
bad_run_start_offset = offset_s
|
||||
if bad_run >= 3:
|
||||
# Before killing the span, check whether the remaining scores form a
|
||||
# stable plateau. This handles scenes where a grading/exposure
|
||||
# difference between trailer and source causes a gradual score drop
|
||||
# rather than a hard cut. A genuine cut produces chaotic scores;
|
||||
# a grading mismatch produces a flat, low-but-consistent plateau.
|
||||
# Conditions: low variance (std < 0.025), scores above pure-black
|
||||
# (mean > 0.20), and the warmup baseline was meaningful (>= 0.30).
|
||||
tail_scores = [s for o, s in scores if o >= bad_run_start_offset]
|
||||
if (
|
||||
len(tail_scores) >= 3
|
||||
and float(np.std(tail_scores)) < 0.025
|
||||
and float(np.mean(tail_scores)) > 0.20
|
||||
and baseline >= 0.30
|
||||
):
|
||||
logger.debug(
|
||||
'Beat %d: stable plateau detected at offset %.3fs '
|
||||
'(tail mean=%.3f std=%.3f) — extending span to full duration.',
|
||||
beat.beat_id, bad_run_start_offset,
|
||||
float(np.mean(tail_scores)), float(np.std(tail_scores)),
|
||||
)
|
||||
last_good = scores[-1][0]
|
||||
good_scores.extend(tail_scores)
|
||||
break
|
||||
logger.debug('Beat %d: Match died at offset %.3fs. Score %.3f < min_score %.3f. Bad run count: %d',
|
||||
beat.beat_id, offset_s, score, min_score, bad_run)
|
||||
break
|
||||
|
||||
tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps)
|
||||
@@ -1113,7 +1413,10 @@ def refine_in_point_with_sequence(
|
||||
Returns:
|
||||
(best_in_point_s, sequence_score)
|
||||
"""
|
||||
return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s, scene_end_s)
|
||||
best_in, best_score, _, _ = align_in_point_by_content_and_motion(
|
||||
beat, estimated_in_point_s, cfg, search_window_s, scene_end_s
|
||||
)
|
||||
return best_in, best_score
|
||||
|
||||
|
||||
def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig):
|
||||
@@ -1451,7 +1754,7 @@ def run_global_scan(
|
||||
max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
|
||||
)
|
||||
content_score = original_content_score
|
||||
content_in_s, align_content_score = align_in_point_by_content(
|
||||
content_in_s, _, align_content_score, _ = align_in_point_by_content_and_motion(
|
||||
b,
|
||||
adjusted_in_s,
|
||||
cfg,
|
||||
@@ -1495,7 +1798,7 @@ def run_global_scan(
|
||||
cfg,
|
||||
)
|
||||
|
||||
motion_in_s, align_motion_score = align_in_point_by_motion(
|
||||
motion_in_s, _, _, align_motion_score = align_in_point_by_content_and_motion(
|
||||
b,
|
||||
adjusted_in_s,
|
||||
cfg,
|
||||
@@ -1504,6 +1807,7 @@ def run_global_scan(
|
||||
if local_align_window_s is not None
|
||||
else min(1.0, cfg.cv.deep_scan.content_align_window_seconds)
|
||||
),
|
||||
scene_end_s=rough_scene_end_s,
|
||||
)
|
||||
|
||||
if align_motion_score >= original_motion_score + 0.015:
|
||||
@@ -1561,7 +1865,12 @@ def run_global_scan(
|
||||
)
|
||||
if len(motion_templates) >= 2:
|
||||
motion_score_clamped = max(0.0, min(1.0, motion_score))
|
||||
final_score = final_score * 0.82 + motion_score_clamped * 0.18
|
||||
blended = final_score * 0.82 + motion_score_clamped * 0.18
|
||||
# Do not let motion blending drag the score below the
|
||||
# content-validated level. A weak motion score often just
|
||||
# means the shot contains a camera pan or slow zoom; it
|
||||
# should not veto an otherwise well-supported content match.
|
||||
final_score = max(blended, final_score - 0.015)
|
||||
if is_weighted_seed_candidate:
|
||||
vision_provisional_score = (
|
||||
content_score * 0.45
|
||||
@@ -1741,6 +2050,36 @@ def run_global_scan(
|
||||
best_result.match_score,
|
||||
)
|
||||
|
||||
# Final hi-res phase refinement: scan the full source scene at
|
||||
# higher resolution to correct phase mismatches that the standard
|
||||
# 160×80 features cannot resolve (e.g. talking-head close-ups).
|
||||
final_in_s = best_result.in_point_s
|
||||
final_scene = _find_scene_for_time(scenes, final_in_s, cfg)
|
||||
if final_scene is not None:
|
||||
refined_phase_in_s = _hires_phase_refine(
|
||||
b,
|
||||
final_in_s,
|
||||
float(final_scene.start_s),
|
||||
float(final_scene.end_s),
|
||||
cfg,
|
||||
)
|
||||
if refined_phase_in_s != final_in_s:
|
||||
final_in_s = refined_phase_in_s
|
||||
# Recompute out-point preserving the duration
|
||||
final_out_s = final_in_s + best_result.duration_s
|
||||
if final_scene is not None:
|
||||
final_out_s = min(final_out_s, float(final_scene.end_s))
|
||||
best_result = MatchResult(
|
||||
beat_id=b.beat_id,
|
||||
scene_id=best_result.scene_id,
|
||||
source_path=cfg.paths.source_movie,
|
||||
in_point_s=final_in_s,
|
||||
out_point_s=final_out_s,
|
||||
in_point_frame=int(final_in_s * source_fps),
|
||||
match_score=best_result.match_score,
|
||||
is_confirmed=is_confirmed,
|
||||
)
|
||||
|
||||
results.append(MatchResult(
|
||||
beat_id=b.beat_id,
|
||||
scene_id=best_result.scene_id,
|
||||
|
||||
Reference in New Issue
Block a user