Commit ·
2e7f9a4
1
Parent(s): 894b188
Fix xregen truncating total audio duration when clip < segment window
Browse files
When an xregen model generates a shorter clip than the original segment
window (e.g. MMAudio 8s on a HunyuanFoley 15s segment), _stitch_wavs
trims the last segment's wav expecting it to cover the full window.
A short wav gets min-clipped, making the final stitched audio shorter
than total_dur_s.
Fix: in _xregen_splice, after prepending leading silence to align to
seg_start, append trailing silence to pad the wav to the full original
segment duration (seg_end - seg_start). _stitch_wavs then trims it
correctly and the output is always exactly total_dur_s long.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -1785,20 +1785,34 @@ def _xregen_splice(new_wav_raw: np.ndarray, src_sr: int,
|
|
| 1785 |
slot_wavs = _load_seg_wavs(meta["wav_paths"])
|
| 1786 |
new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
|
| 1787 |
|
| 1788 |
-
# Align new_wav so sample index 0 corresponds to seg_start in video time
|
| 1789 |
-
#
|
| 1790 |
-
#
|
| 1791 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1792 |
if clip_start_s is not None:
|
| 1793 |
-
seg_start = meta["segments"][seg_idx]
|
| 1794 |
-
|
|
|
|
|
|
|
|
|
|
| 1795 |
if offset_s < 0:
|
| 1796 |
pad_samples = int(round(abs(offset_s) * slot_sr))
|
| 1797 |
-
silence = np.zeros(
|
| 1798 |
-
|
| 1799 |
-
|
| 1800 |
-
|
| 1801 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1802 |
|
| 1803 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1804 |
new_wav, seg_idx, meta, slot_id
|
|
|
|
| 1785 |
slot_wavs = _load_seg_wavs(meta["wav_paths"])
|
| 1786 |
new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
|
| 1787 |
|
| 1788 |
+
# Align new_wav so sample index 0 corresponds to seg_start in video time,
|
| 1789 |
+
# and the wav is long enough to cover the full original segment window.
|
| 1790 |
+
#
|
| 1791 |
+
# _stitch_wavs trims each wav relative to its seg_start, expecting the wav
|
| 1792 |
+
# to cover the full segment window (seg_end - seg_start). xregen models
|
| 1793 |
+
# may generate a shorter clip (e.g. MMAudio 8 s on a 15 s segment), which
|
| 1794 |
+
# causes _stitch_wavs to trim short and produce truncated output.
|
| 1795 |
+
#
|
| 1796 |
+
# Steps:
|
| 1797 |
+
# 1. Prepend silence if the clip started after seg_start.
|
| 1798 |
+
# 2. Append silence if the wav is still shorter than the full segment window.
|
| 1799 |
if clip_start_s is not None:
|
| 1800 |
+
seg_start, seg_end = meta["segments"][seg_idx]
|
| 1801 |
+
full_seg_samples = int(round((seg_end - seg_start) * slot_sr))
|
| 1802 |
+
|
| 1803 |
+
# Step 1: prepend silence to align to seg_start
|
| 1804 |
+
offset_s = seg_start - clip_start_s # negative when clip starts after seg_start
|
| 1805 |
if offset_s < 0:
|
| 1806 |
pad_samples = int(round(abs(offset_s) * slot_sr))
|
| 1807 |
+
silence = np.zeros((new_wav.shape[0], pad_samples), dtype=new_wav.dtype)
|
| 1808 |
+
new_wav = np.concatenate([silence, new_wav], axis=1)
|
| 1809 |
+
|
| 1810 |
+
# Step 2: append silence to fill the full segment window
|
| 1811 |
+
current_samples = new_wav.shape[1]
|
| 1812 |
+
if current_samples < full_seg_samples:
|
| 1813 |
+
tail = np.zeros((new_wav.shape[0], full_seg_samples - current_samples),
|
| 1814 |
+
dtype=new_wav.dtype)
|
| 1815 |
+
new_wav = np.concatenate([new_wav, tail], axis=1)
|
| 1816 |
|
| 1817 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1818 |
new_wav, seg_idx, meta, slot_id
|