| |
| |
| |
| |
| |
|
|
| from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS |
|
|
|
|
def download_single_example(file_id: str | None = None):
    """
    Download a single interaction example (~50-100MB).

    Builds a filesystem handle for the "improvised" / "dev" subset restricted
    to preferred vendors. If no ``file_id`` is supplied, one ID is drawn with
    ``fs.sample_random_file_ids``; the example is then fetched with
    ``fs.gather_file_id_data_from_s3``.

    If auto-sampling yields no IDs, a message is printed and the function
    returns without downloading anything.

    :param file_id: Specific file ID to download, or None to auto-sample
    """
    config = DatasetConfig(label="improvised", split="dev", preferred_vendors_only=True)
    fs = SeamlessInteractionFS(config=config)

    if file_id is None:
        sampled_ids = fs.sample_random_file_ids(num_samples=1)
        # Guard the index below: an empty sample would otherwise surface as an
        # opaque IndexError instead of a readable message.
        if not sampled_ids:
            print("❌ No file IDs available to sample")
            return
        file_id = sampled_ids[0]
        print(f"🎲 Auto-sampled file ID: {file_id}")

    fs.gather_file_id_data_from_s3(file_id)
    print(f"✅ Downloaded single example: {file_id}")
|
|
|
|
def download_interaction_pair(
    interaction_key: str | None = None,
    *,
    num_pairs: int = 4,
    local_dir: str = "/root/autodl-tmp/seamless",
):
    """
    Download interaction pairs from the same session (~100-200MB per pair).

    With an explicit ``interaction_key`` the first pair returned for that key
    is downloaded. Without one, ``num_pairs`` pairs are requested from
    ``fs.get_interaction_pairs`` and all of their file IDs are downloaded in
    a single batch.

    If no file IDs are found, a message is printed and nothing is downloaded.

    :param interaction_key: Interaction key (V00_S0809_I00000582) or None to auto-sample
    :param num_pairs: Number of pairs to request when auto-sampling
    :param local_dir: Value forwarded to ``DatasetConfig`` as ``local_dir``
    """
    config = DatasetConfig(
        label="improvised",
        split="dev",
        preferred_vendors_only=True,
        local_dir=local_dir,
    )
    fs = SeamlessInteractionFS(config=config)

    if interaction_key is None:
        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
        # Progress message: number of pairs fetched and the implied file-ID
        # count (two per pair). Nothing has been downloaded at this point, so
        # no completion message is printed here.
        print(f"✅ 获取了{len(pairs)}个pairs (共{len(pairs)*2}个file IDs)")
        # Flatten the pairs into one list of file IDs for the batch download.
        file_ids = [fid for pair in pairs for fid in pair]
    else:
        pairs = fs.get_interaction_pairs(interaction_keys=[interaction_key])
        file_ids = pairs[0] if pairs else []
        print(f"📍 Using interaction key: {interaction_key} -> {file_ids}")

    if not file_ids:
        print("❌ No interaction pairs found")
        return

    fs.download_batch_from_s3(file_ids)
    print(f"✅ Downloaded interaction pair: {file_ids}")
|
|
|
|
def download_samples_1gb(file_ids: list[str] | None = None, num_samples: int = 10):
    """
    Download a batch of samples, roughly 1GB at the default of 10 interactions.

    Uses the "improvised" / "test" subset with preferred vendors only, a
    fixed seed of 42 and 4 workers. When ``file_ids`` is None, ``num_samples``
    IDs are drawn with ``fs.sample_random_file_ids`` and a short preview of
    them is printed before the batch download starts.

    :param file_ids: Specific file IDs to download, or None to auto-sample
    :param num_samples: Number of samples to download (if auto-sampling)
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="improvised",
            split="test",
            preferred_vendors_only=True,
            seed=42,
            num_workers=4,
        )
    )

    if file_ids is None:
        file_ids = fs.sample_random_file_ids(num_samples=num_samples)
        print(f"🎲 Auto-sampled {len(file_ids)} file IDs from preferred vendors")
        # Show at most three IDs; the trailing ellipsis marks a truncated list.
        if len(file_ids) > 3:
            shown = file_ids[:3]
            print(f"Sample IDs: {shown}...")
        else:
            print(f"Sample IDs: {file_ids}")

    fs.download_batch_from_s3(file_ids)
    # The size figure is a rough estimate of 100MB per sample.
    print(f"✅ Downloaded {len(file_ids)} samples (~{len(file_ids) * 100}MB)")
|
|
|
|
def download_session_exploration(
    session_key: str | None = None, interactions_per_session: int = 4
):
    """
    Download the interactions of one session group (~400MB per session).

    Uses the "naturalistic" / "dev" subset with preferred vendors only and 4
    workers. Session groups are requested from ``fs.get_session_groups``,
    either for the given ``session_key`` or for one auto-sampled session, and
    only the first returned group is downloaded.

    If that group is missing or empty, a message is printed and nothing is
    downloaded.

    :param session_key: Session key (V00_S0809) or None to auto-sample
    :param interactions_per_session: Target interactions per session
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="naturalistic",
            split="dev",
            preferred_vendors_only=True,
            num_workers=4,
        )
    )

    auto_sample = session_key is None
    if auto_sample:
        groups = fs.get_session_groups(
            num_sessions=1, interactions_per_session=interactions_per_session
        )
    else:
        groups = fs.get_session_groups(
            session_keys=[session_key],
            interactions_per_session=interactions_per_session,
        )

    # Only the first group is used; fall back to an empty list when none came back.
    if groups:
        all_file_ids = groups[0]
    else:
        all_file_ids = []

    if auto_sample:
        print(f"🎲 Auto-sampled session: {len(all_file_ids)} interactions")
    else:
        print(
            f"📍 Using session key: {session_key} -> {len(all_file_ids)} interactions"
        )

    if not all_file_ids:
        print("❌ No session interactions found")
        return

    fs.download_batch_from_s3(all_file_ids)
    print(f"✅ Downloaded session with {len(all_file_ids)} interactions")
|
|
|
|
def main():
    """
    Print an overview of the download options, then run one of them.

    The overview lists the four download helpers, the vendor preference
    applied when auto-sampling, and example key formats. After printing it,
    ``download_interaction_pair`` is called with its default arguments.
    """
    overview = (
        "🔍 S3 Download Options with Intelligent Sampling:",
        "1. Single example (~100MB) - Quick exploration",
        "2. Interaction pair (~200MB) - Conversational dynamics",
        "3. Sample set (~1GB) - Initial prototyping",
        "4. Session exploration (~400MB/session) - Deep context study",
        "",
        "💡 All options auto-sample from preferred vendors if no keys provided",
        " Preferred: V00, V01 (smaller files)",
        " Avoided: V03 (larger 100MB-800MB videos)",
        "",
        "📍 You can also specify exact keys:",
        " Interaction key: V00_S0809_I00000582",
        " Session key: V00_S0809",
    )
    # Empty entries produce the blank separator lines of the overview.
    for line in overview:
        print(line)

    download_interaction_pair()
| |
| |
| |
| |
| |
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|