File size: 10,155 Bytes
85752bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# /data/rczhang/PencilFolder/multi-shot/data/
# ├── Data1/                     # movie1
# │   ├── meta                    # metadata file (video captions and ID info)
# │   ├── 1/
# │   │   └── 1.mp4              # video file
# │   ├── 2/
# │   │   ├── 1.mp4              # video file
# │   │   └── 2.mp4              # video file
# │   └── ID/
# │       └── ID1/
# │           ├── 1.png          # character ID image
# │           ├── 2.png          # character ID image
# │           └── 3.png          # character ID image
# │
# ├── Data2/                     # movie2
# │   ├── meta                    # metadata file
# │   ├── 1/
# │   │   ├── 1.mp4              # video file
# │   │   └── 2.mp4              # video file
# │   ├── 2/
# │   │   └── 1.mp4              # video file
# │   └── ID/
# │       ├── ID1/
# │       │   ├── 1.png          # character ID image
# │       │   ├── 2.png          # character ID image
# │       │   └── 3.png          # character ID image
# │       └── ID2/
# │           ├── 1.png          # character ID image
# │           ├── 2.png          # character ID image
# │           └── 3.png          # character ID image
# │
# └── Data3/                     # movie3
#     ├── meta                    # metadata file
#     ├── 1/
#     │   └── 1.mp4              # video file
#     └── ID/
#         └── ID1/
#             ├── 1.png          # character ID image
#             ├── 2.png          # character ID image
#             └── 3.png          # character ID image
import argparse
import json
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import imageio.v2 as imageio
import numpy as np
from PIL import Image


# Lower-cased file suffixes that list_id_images() accepts as ID reference images.
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".bmp"}
# Default value of the --target_fps command-line option (output frame rate).
DEFAULT_FPS = 16
# Default value of the --height command-line option (output frame height in pixels).
TARGET_HEIGHT = 480
# Default value of the --width command-line option (output frame width in pixels).
TARGET_WIDTH = 832


def read_json_lines(path: Path) -> List[Dict[str, Any]]:
    """Parse a JSON-Lines file into a list, one decoded value per line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. The remaining lines are decoded with
    ``json.loads`` and returned in file order.

    Args:
        path: File to read; it is opened as UTF-8 text.

    Returns:
        The decoded values, in the order they appear in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]


def normalize_list(value: Any) -> List[str]:
    """Coerce an optional scalar-or-list value into a list of strings.

    Args:
        value: ``None``, a ``list``, or any other single value.

    Returns:
        ``[]`` for ``None``; ``str()`` of every element for a ``list``;
        otherwise a one-element list holding ``str(value)``. Only ``list``
        instances are expanded element-wise — other containers (tuples,
        sets, ...) are stringified as a whole.
    """
    if value is None:
        return []
    items = value if isinstance(value, list) else [value]
    return [str(item) for item in items]


def sort_key_numeric(path: Path) -> Tuple[int, int, str]:
    """Return a sort key that orders numeric file stems by value.

    Paths whose stem consists only of decimal digits sort first, ordered by
    their integer value (so ``2`` comes before ``10``); every other path
    sorts after them, ordered lexicographically by stem.

    Args:
        path: Path whose ``stem`` (final component without suffix) is keyed.

    Returns:
        ``(0, int(stem), stem)`` for an all-decimal stem, otherwise
        ``(1, 0, stem)``.
    """
    stem = path.stem
    # Use isdecimal() rather than isdigit(): isdigit() is also true for
    # characters such as superscript digits, which int() cannot parse and
    # would turn into a ValueError. isdecimal() matches what int() accepts.
    if stem.isdecimal():
        return (0, int(stem), stem)
    return (1, 0, stem)


def list_shot_videos(segment_dir: Path) -> List[Path]:
    """Return the ``.mp4`` files directly inside *segment_dir*, in shot order.

    Only regular files whose suffix is ``.mp4`` (compared case-insensitively)
    are kept; sub-directories are not searched. The result is ordered with
    ``sort_key_numeric``.

    Args:
        segment_dir: Directory holding the shot videos of one segment.

    Returns:
        The matching paths, sorted.
    """
    mp4_files: List[Path] = []
    for entry in segment_dir.iterdir():
        if entry.is_file() and entry.suffix.lower() == ".mp4":
            mp4_files.append(entry)
    mp4_files.sort(key=sort_key_numeric)
    return mp4_files


def list_id_images(id_dir: Path) -> List[Path]:
    """Return the image files directly inside *id_dir*, in sorted order.

    A file counts as an image when its lower-cased suffix is in
    ``IMAGE_EXTS``. Sub-directories are not searched. The result is ordered
    with ``sort_key_numeric``.

    Args:
        id_dir: Directory holding the reference images of one ID.

    Returns:
        The matching paths, sorted.
    """
    image_files: List[Path] = []
    for entry in id_dir.iterdir():
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTS:
            image_files.append(entry)
    image_files.sort(key=sort_key_numeric)
    return image_files


def resize_center_crop(image: Image.Image, target_h: int, target_w: int) -> Image.Image:
    """Scale *image* to cover the target box, then crop the centered window.

    The image is resized with a single uniform scale factor — the larger of
    the width ratio and the height ratio — so aspect ratio is kept and both
    dimensions reach at least the target size (up to rounding). A
    ``target_w`` x ``target_h`` window centered in the resized image is then
    cropped out.

    Args:
        image: Source image.
        target_h: Height of the returned image in pixels.
        target_w: Width of the returned image in pixels.

    Returns:
        The resized and cropped image.
    """
    src_w, src_h = image.size
    # The larger ratio guarantees the scaled image covers the target box.
    scale = max(target_w / src_w, target_h / src_h)
    scaled_w = int(round(src_w * scale))
    scaled_h = int(round(src_h * scale))
    resized = image.resize((scaled_w, scaled_h), Image.Resampling.LANCZOS)
    # Clamp at zero so a rounding shortfall never produces a negative offset.
    x0 = max(0, (scaled_w - target_w) // 2)
    y0 = max(0, (scaled_h - target_h) // 2)
    crop_box = (x0, y0, x0 + target_w, y0 + target_h)
    return resized.crop(crop_box)


def iter_resampled_frames(video_path: Path, target_fps: int) -> Iterable[np.ndarray]:
    """Yield frames of *video_path* resampled to roughly *target_fps*.

    Resampling is done by frame selection only: evenly spaced frame indices
    are computed with ``np.linspace`` (truncated to ints) and the frames at
    those indices are yielded, so source frames may be dropped or repeated.
    No interpolation is performed.

    The source frame count comes from ``reader.count_frames()``; if that
    raises, it is estimated from the ``duration`` metadata. When no positive
    count is available, the whole video is decoded into memory and the
    selection is made over the decoded list.

    Args:
        video_path: Video file to read with imageio.
        target_fps: Frame rate the yielded sequence should correspond to;
            also used as the source rate when the metadata has no usable
            ``fps`` value.

    Yields:
        The selected frames, as returned by the imageio reader.
    """
    reader = imageio.get_reader(str(video_path))
    try:
        meta = reader.get_meta_data()
        # A missing or falsy (e.g. 0) fps falls back to the target rate.
        source_fps = meta.get("fps", target_fps) or target_fps
        frame_total: Optional[int] = None
        try:
            frame_total = reader.count_frames()
        except Exception:
            # Counting failed; estimate from the container duration if present.
            clip_seconds = meta.get("duration")
            if clip_seconds:
                frame_total = int(round(clip_seconds * source_fps))

        if frame_total is None or frame_total <= 0:
            # No usable count: decode everything, then pick from the list.
            decoded = [frame for frame in reader]
            if not decoded:
                return
            frame_total = len(decoded)
            wanted = max(1, int(round(frame_total * target_fps / source_fps)))
            picks = np.linspace(0, frame_total - 1, num=wanted, dtype=int)
            for pick in picks:
                yield decoded[int(pick)]
            return

        seconds = frame_total / source_fps if source_fps > 0 else frame_total / target_fps
        wanted = max(1, int(round(seconds * target_fps)))
        picks = np.linspace(0, frame_total - 1, num=wanted, dtype=int)
        for pick in picks:
            yield reader.get_data(int(pick))
    finally:
        # Runs when the generator finishes, is closed, or raises.
        reader.close()


def pick_id_names(id_list: List[str], id_root: Path) -> List[str]:
    """Choose the ID names for a segment.

    If *id_list* is non-empty it is returned with duplicates removed, keeping
    the first occurrence of each name; *id_root* is not consulted in that
    case. Otherwise the names of the sub-directories of *id_root* are
    returned, ordered with ``sort_key_numeric``.

    Args:
        id_list: ID names listed explicitly for the segment (may be empty).
        id_root: Directory whose sub-directories are the fallback IDs.

    Returns:
        The chosen ID names; ``[]`` when *id_list* is empty and *id_root*
        does not exist or has no sub-directories.
    """
    if id_list:
        # dict keys keep insertion order, so this is an order-preserving dedupe.
        return list(dict.fromkeys(id_list))
    if not id_root.exists():
        return []
    return [
        child.name
        for child in sorted(id_root.iterdir(), key=sort_key_numeric)
        if child.is_dir()
    ]


def build_text(captions: List[str]) -> str:
    """Join the non-blank captions into one space-separated string.

    Args:
        captions: Caption strings; empty or whitespace-only entries are
            ignored and the rest are stripped before joining.

    Returns:
        The stripped captions joined by single spaces, or the literal
        ``"unknown"`` when no usable caption remains.
    """
    pieces: List[str] = []
    for caption in captions:
        if not caption:
            continue
        trimmed = caption.strip()
        if trimmed:
            pieces.append(trimmed)
    if not pieces:
        return "unknown"
    return " ".join(pieces)


def process_segment(
    data_dir: Path,
    segment_index: int,
    meta_entry: Dict[str, Any],
    output_dir: Path,
    target_fps: int,
    height: int,
    width: int,
) -> Optional[Dict[str, Any]]:
    """Concatenate one segment's shots into a single video and describe it.

    The shot videos in ``data_dir/<segment_index>/`` are read in sorted
    order, resampled to *target_fps*, resized/cropped to *width* x *height*
    and written back-to-back to ``output_dir/<segment_index>.mp4``.

    Args:
        data_dir: Movie directory containing the segment folders and ``ID/``.
        segment_index: Name of the segment folder (as an integer).
        meta_entry: Metadata for the segment; the ``"caption"`` and ``"ID"``
            keys are read, each as a single value or a list.
        output_dir: Directory for the output video; created if missing.
        target_fps: Frame rate of the output video.
        height: Output frame height in pixels.
        width: Output frame width in pixels.

    Returns:
        A dict with ``disk_path``, ``text``, ``shot_captions``, ``id_dir``
        and ``id_names``, or ``None`` (nothing written) when the segment
        folder is missing, holds no ``.mp4`` files, no ID names can be
        determined, or the ``ID`` directory does not exist.
    """
    segment_dir = data_dir / str(segment_index)
    if not segment_dir.exists():
        return None

    shot_videos = list_shot_videos(segment_dir)
    if not shot_videos:
        return None

    # One caption per shot: surplus shots reuse the last caption, and with no
    # captions at all every shot gets an empty string.
    captions = normalize_list(meta_entry.get("caption"))
    if captions:
        last_pos = len(captions) - 1
        shot_captions: List[str] = [
            captions[min(pos, last_pos)] for pos in range(len(shot_videos))
        ]
    else:
        shot_captions = [""] * len(shot_videos)

    id_root = data_dir / "ID"
    id_names = pick_id_names(normalize_list(meta_entry.get("ID")), id_root)
    if not id_names or not id_root.exists():
        return None

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{segment_index}.mp4"
    writer = imageio.get_writer(str(output_path), fps=target_fps)
    try:
        for shot_path in shot_videos:
            for frame in iter_resampled_frames(shot_path, target_fps):
                rgb = Image.fromarray(frame).convert("RGB")
                fitted = resize_center_crop(rgb, height, width)
                writer.append_data(np.asarray(fitted))
    finally:
        writer.close()

    return {
        "disk_path": str(output_path),
        "text": build_text(shot_captions),
        "shot_captions": shot_captions,
        "id_dir": str(id_root),
        "id_names": id_names,
    }


def build_dataset(
    data_root: Path,
    output_root: Path,
    output_json: Path,
    target_fps: int,
    height: int,
    width: int,
    limit: Optional[int],
) -> Path:
    """Process every movie directory under *data_root* and write the index.

    Each sub-directory of *data_root* that has a non-empty ``meta`` file is
    treated as one movie. The n-th entry of the meta file (counting from 1)
    is paired with the segment folder named ``n`` and handed to
    ``process_segment``. Segments for which that returns ``None`` are left
    out. The remaining entries are stored under the key
    ``"<movie dir name>/<n>"`` and dumped to *output_json*.

    Args:
        data_root: Directory containing the movie directories.
        output_root: Root for processed videos; each movie gets a
            sub-directory with the same name as its source directory.
        output_json: Path of the JSON index to write; parent directories are
            created as needed.
        target_fps: Frame rate of the processed videos.
        height: Output frame height in pixels.
        width: Output frame width in pixels.
        limit: Stop once this many segments have been recorded; ``None``
            means no limit. The check runs only after a segment is recorded
            and after each movie whose meta file had entries, so a limit of
            zero or less does not mean "process nothing".

    Returns:
        *output_json*, after the file has been written.
    """
    dataset: Dict[str, Dict[str, Any]] = {}

    def limit_reached() -> bool:
        # Keys are unique per (movie, segment), so len(dataset) is the number
        # of segments recorded so far.
        return limit is not None and len(dataset) >= limit

    movie_dirs = [child for child in data_root.iterdir() if child.is_dir()]
    movie_dirs.sort(key=sort_key_numeric)

    for movie_dir in movie_dirs:
        meta_path = movie_dir / "meta"
        if not meta_path.exists():
            continue

        meta_entries = read_json_lines(meta_path)
        if not meta_entries:
            continue

        movie_output_dir = output_root / movie_dir.name

        for segment_number, meta_entry in enumerate(meta_entries, start=1):
            entry = process_segment(
                data_dir=movie_dir,
                segment_index=segment_number,
                meta_entry=meta_entry,
                output_dir=movie_output_dir,
                target_fps=target_fps,
                height=height,
                width=width,
            )
            if entry is not None:
                dataset[f"{movie_dir.name}/{segment_number}"] = entry
                if limit_reached():
                    break

        if limit_reached():
            break

    output_json.parent.mkdir(parents=True, exist_ok=True)
    with output_json.open("w", encoding="utf-8") as sink:
        json.dump(dataset, sink, ensure_ascii=False, indent=2)

    return output_json


def main() -> int:
    """Parse the command-line options, build the dataset, report the result.

    Returns:
        ``0`` after the dataset JSON has been written and its path printed.
    """
    parser = argparse.ArgumentParser(description="Preprocess multi-shot data into videodataset JSON.")

    # (flag, default, help) for the string-valued path options.
    path_options = (
        (
            "--data_root",
            "/data/rczhang/PencilFolder/multi-shot/data",
            "Root folder that contains Data* directories.",
        ),
        (
            "--output_root",
            "/data/rczhang/PencilFolder/multi-shot/processed",
            "Where to save processed videos.",
        ),
        (
            "--output_json",
            "/data/rczhang/PencilFolder/multi-shot/processed/dataset.json",
            "Output dataset JSON path.",
        ),
    )
    for flag, default, help_text in path_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)

    # (flag, default) for the integer video options.
    int_options = (
        ("--target_fps", DEFAULT_FPS),
        ("--height", TARGET_HEIGHT),
        ("--width", TARGET_WIDTH),
    )
    for flag, default in int_options:
        parser.add_argument(flag, type=int, default=default)

    parser.add_argument("--limit", type=int, default=None, help="Limit processed segments for quick tests.")
    args = parser.parse_args()

    saved_path = build_dataset(
        data_root=Path(args.data_root),
        output_root=Path(args.output_root),
        output_json=Path(args.output_json),
        target_fps=args.target_fps,
        height=args.height,
        width=args.width,
        limit=args.limit,
    )
    print(f"Saved dataset JSON to: {saved_path}")
    return 0


# Run main() only when this file is executed as a script, and use its return
# value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())