Fix forehead_touch action group + always-fresh cutter assets

1. Action-group classifier conflated object touches and person touches.
   "man touches the red door with a small object" was being tagged as forehead_touch because "touch" was in forehead_touch's needle set. That made the realign pass yank Beat 16 from scene 451 (correct: man painting red door, IV stand) over to scene 623 (woman/man in bed) — a totally wrong shot at score 0.344.

   Fix: removed the generic "touch*" verbs from forehead_touch's needle set. forehead_touch is now added in _semantic_action_groups() only when a touch verb is paired with an explicit body-part target (forehead, face, cheek, head, hand, ...) and not paired with an object target (door, handle, brush, tool, lock, ...).

   Effect on Beat 16 after `match --beat 16 --vision`: scene 623 in=5476.28 score=0.344 -> scene 451 in=3912.48 score=0.626.

2. Cutter-report stills/clips were keyed by source-video mtime, so a match-position change without a video change served stale frames from the previous match. Dropped the mtime cache; both extractors now render fresh every time. Slower (about a minute per full regeneration) but correct.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-05 05:23:24 +02:00
parent dbadc3fc26
commit 8aa6fe8323
52 changed files with 73 additions and 46 deletions
@@ -19,7 +19,7 @@ video { width: 100%; border-radius: 6px; box-shadow: 0 4px 6px rgba(0,0,0,0.5);
 .code-hint { background: #000; padding: 10px; border-radius: 4px; font-family: monospace; font-size: 0.9em; margin-top: 15px; color: #a3e635; }
 </style></head><body>
 <h1>AI Trailer Generator — Match Report</h1>
-<div class='stats'>Total Beats: 25 | Matched: 19</div>
+<div class='stats'>Total Beats: 25 | Matched: 20</div>
 <script>
 function syncBeat(row) {
  const vids = row.querySelectorAll('video');
@@ -348,14 +348,18 @@ document.addEventListener('DOMContentLoaded', () => document.querySelectorAll('.
 <h3>Beat 016</h3>
 <p><b>Type:</b> UNKNOWN</p>
 <p><b>Trailer:</b> 61.48s &rarr; 64.48s</p>
-<p class='status-miss'>NO MATCH</p>
+<p style='color: #fbbf24; font-weight: bold; font-size: 1.1em;'>PROVISIONAL MATCH</p>
 <p><b>Scene ID:</b> 451</p>
 <p><b>Movie In:</b> 3912.48s</p>
 <p><b>Source Dur:</b> 2.80s</p>
 <p><b>Unmatched Tail:</b> 0.12s placeholder</p>
 <p><b>Score:</b> <span class='score'>0.626</span></p>
 <p style='color: #fbbf24; font-size: 0.9em;'>Some trailer frames are still unmatched; report fills only those gaps with placeholder black.</p>
 <p style='color: #fbbf24; font-size: 0.9em;'>⚠️ Score below 0.80. Verify visually.</p>
 <div class='code-hint'>python cli.py rematch --beat 16</div>
 </div>
 <div class='videos'>
-<div class='video-container'>
+<div class='compare'><p>Frame-Locked Compare</p><video src='beat_016_compare.mp4' controls loop muted autoplay></video></div>
 <div class='video-col'><p>Reference Trailer</p><video src='beat_016_ref.mp4' controls loop muted autoplay></video></div>
 <div class='video-col'><p>Matched Source</p><div style='width: 100%; aspect-ratio: 16/9; background: #222; display: flex; align-items: center; justify-content: center; border-radius: 6px; color: #555;'>No Match</div></div>
 </div>
 </div>
 </div>
 <div class='beat-row'>
@@ -136,18 +136,12 @@ CLIP_WIDTH = 480
 CLIP_MAX_DURATION_S = 30.0
 def _stale(out: Path, src: Path) -> bool:
    """Report whether ``out`` has to be regenerated from ``src``.

    ``out`` counts as fresh only when it exists, is non-empty, and its
    modification time is not older than that of ``src``. Every other case,
    including any ``OSError`` raised while inspecting either path (for
    example a missing ``src``), is reported as stale.
    """
    try:
        if not out.exists():
            return True
        out_info = out.stat()
        # A zero-byte output is never considered usable.
        if not out_info.st_size > 0:
            return True
        return not (out_info.st_mtime >= src.stat().st_mtime)
    except OSError:
        # Failing to stat either file is treated the same as "needs rebuild".
        return True
 def extract_still(video_path: Path, t_s: float, out: Path) -> bool:
    """Always render fresh. The match position can change without the source
    video changing, so a mtime-based cache would silently serve stale frames
    from the previous match. The cutter expects bit-current previews."""
    if not video_path.exists():
        return False
    if not _stale(out, video_path):
        return True
    out.parent.mkdir(parents=True, exist_ok=True)
    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
@@ -166,10 +160,9 @@ def extract_still(video_path: Path, t_s: float, out: Path) -> bool:
 def extract_clip(video_path: Path, start_s: float, duration_s: float, out: Path) -> bool:
    """Always render fresh — see extract_still for rationale."""
    if not video_path.exists():
        return False
    if not _stale(out, video_path):
        return True
    out.parent.mkdir(parents=True, exist_ok=True)
    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
@@ -53,7 +53,11 @@ _CREDIT_ERROR_PATTERNS = (
 _ACTION_GROUPS = {
    "kiss": {"kiss", "kisses", "kissing", "kissed"},
-    "forehead_touch": {"forehead", "foreheads", "touch", "touches", "touching", "touched"},
+    # "touch" is intentionally NOT in forehead_touch — a generic "touch" can
    # mean "touches the door / handle / brush" which is unrelated to person
    # contact. forehead_touch is added in _semantic_action_groups() only
    # when an explicit body-part target is present.
    "forehead_touch": {"forehead", "foreheads"},
    "approach": {"approach", "approaches", "approaching", "closer", "lean", "leans", "leaning"},
    "talk": {"talk", "talking", "speak", "speaking", "conversation", "conversing"},
    "hand": {"hand", "hands", "holding", "holds", "raise", "raises", "raising", "lift", "lifting"},
@@ -61,6 +65,21 @@ _ACTION_GROUPS = {
    "look_down": {"down", "lowering", "lowers"},
    "turn": {"turn", "turns", "turning"},
 }
 # Nouns that mark a "touch"-family verb as contact with an inanimate thing
 # (door, handle, brush, tool, ...) rather than with another person. When
 # _semantic_action_groups() finds any of them next to a touch verb, it does
 # not add forehead_touch.
 # NOTE(review): the lookup there is `w in phase`; if `phase` is a plain
 # string that is a substring test, so "stand" would also hit "standing" —
 # confirm against _action_phase_text().
 _OBJECT_TOUCH_TARGETS = {
    # doors and their hardware
    "door", "doors", "handle", "knob", "lock", "mechanism",
    # fixtures and structures
    "button", "pole", "rail", "stand", "surface", "switch", "wall",
    # hand-held items
    "blade", "bottle", "brush", "cup", "glass", "knife", "phone", "tool",
    "weapon",
    # generic fallback
    "object",
 }
 # Body-part nouns that mark a "touch"-family verb as person-on-person
 # contact (forehead/face/skin/...). _semantic_action_groups() adds
 # forehead_touch only when one of these is present and no object target is.
 # NOTE(review): the lookup there is `w in phase`; if `phase` is a plain
 # string that is a substring test, so "hair" would also hit "chair" and
 # "hand" would hit "handle" — confirm against _action_phase_text().
 _PERSON_TOUCH_TARGETS = {
    # head and neck
    "forehead", "foreheads", "face", "faces", "cheek", "cheeks",
    "head", "hair", "lip", "lips", "neck",
    # upper body and limbs
    "arm", "arms", "hand", "hands", "shoulder", "shoulders", "chest",
    # general
    "body", "skin",
 }
 # Names of the action groups that are treated as "strong" actions.
 # NOTE(review): how a strong group is weighted is decided by code outside
 # this view, and "cutting" has no visible entry in _ACTION_GROUPS here —
 # confirm both against the full module.
 _STRONG_ACTION_GROUPS = {"kiss", "forehead_touch", "approach", "hand", "cutting"}
@@ -285,6 +304,17 @@ def _semantic_action_groups(text: str) -> set[str]:
        for name, needles in _ACTION_GROUPS.items()
        if terms & needles
    }
    # Distinguish person-on-person touches from object touches. "touches the
    # red door" must NOT count as forehead_touch; "touches her forehead"
    # must. We look at the action_phase first (most specific), fall back to
    # the full description.
    phase = _action_phase_text(text)
    touch_present = any(w in phase for w in ("touch", "touches", "touching", "touched"))
    if touch_present:
        person_target = any(w in phase for w in _PERSON_TOUCH_TARGETS)
        object_target = any(w in phase for w in _OBJECT_TOUCH_TARGETS)
        if person_target and not object_target:
            groups.add("forehead_touch")
    if "moving closer" in lowered or "move closer" in lowered:
        groups.add("approach")
    if "face-to-face" in lowered or "faces facing" in lowered: