| |
| |
| |
| |
| |
|
|
| from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS |
|
|
|
|
def download_1gb_sample_archive() -> None:
    """
    Download ~1GB of samples using selective archives.

    Traditional archive-based approach for quick exploration on laptops.
    Configures the "improvised" label and "dev" split with 4 workers, then
    requests batch 0 restricted to ``archive_list=[0]``.
    """
    config = DatasetConfig(label="improvised", split="dev", num_workers=4)
    fs = SeamlessInteractionFS(config=config)

    # Only the archive indices listed here are requested from batch 0,
    # which is what keeps this sample small.
    fs.download_batch_from_hf(batch_idx=0, archive_list=[0])
    print("✅ Downloaded ~1GB sample from HF (archive-based)")
|
|
|
|
def download_single_batch() -> None:
    """
    Download a complete batch (~50-100GB).

    Good for substantial local exploration and development.
    Configures the "improvised" label and "dev" split with 8 workers, then
    requests batch 0 without an archive restriction.
    """
    config = DatasetConfig(label="improvised", split="dev", num_workers=8)
    fs = SeamlessInteractionFS(config=config)

    # No archive_list is passed, so the call is not limited to selected
    # archives of the batch.
    fs.download_batch_from_hf(batch_idx=0)
    print("✅ Downloaded single batch (~50-100GB)")
|
|
|
|
def download_multiple_batches() -> None:
    """
    Download multiple batches for training datasets.

    Suitable for model training and large-scale analysis.
    Configures the "improvised" label and "train" split with 8 workers and
    requests batches 0, 1 and 2 one after another, reporting each one as it
    completes.
    """
    config = DatasetConfig(label="improvised", split="train", num_workers=8)
    fs = SeamlessInteractionFS(config=config)

    # The same filesystem object is reused for every batch; only the batch
    # index changes between calls.
    for batch_idx in range(3):
        fs.download_batch_from_hf(batch_idx=batch_idx)
        print(f"✅ Downloaded batch {batch_idx}")

    print("✅ Downloaded multiple batches (~150GB+)")
|
|
|
|
def download_different_splits() -> None:
    """
    Download data from different splits and labels.

    Covers the improvised and naturalistic labels across the dev and test
    splits. For each (label, split, batch) entry, a fresh configuration with
    4 workers is built and the batch is requested restricted to
    ``archive_list=[0, 1, 2]``.
    """
    # (label, split, batch_idx) combinations to fetch, in order.
    splits_to_download = [
        ("improvised", "dev", 0),
        ("naturalistic", "dev", 0),
        ("improvised", "test", 0),
        ("naturalistic", "test", 0),
    ]

    for label, split, batch_idx in splits_to_download:
        # The label is fixed on the config; the split is supplied per call.
        config = DatasetConfig(label=label, num_workers=4)
        fs = SeamlessInteractionFS(config=config)

        # Only a few archives per combination, so this stays a sample.
        fs.download_batch_from_hf(
            split=split, batch_idx=batch_idx, archive_list=[0, 1, 2]
        )
        print(f"✅ Downloaded {label}/{split} sample")

    print("✅ Downloaded samples from different splits")
|
|
|
|
def download_whole_dataset() -> None:
    """
    Download the complete dataset (~27TB).

    ⚠️ CAUTION: This will download the entire dataset!
    Only use on high-capacity storage with fast internet.

    Asks for confirmation on stdin first. Anything other than "y" or "yes"
    (case-insensitive, surrounding whitespace ignored) prints a cancellation
    message and returns without downloading.
    """
    labels = ["improvised", "naturalistic"]
    splits = ["train", "dev", "test"]

    confirm = input(
        "Are you sure you want to download the entire dataset (~27TB)? (y/n): "
    )
    # Normalize the answer so stray whitespace or unusual capitalization
    # (e.g. " Y", "yEs") is still treated as consent.
    if confirm.strip().lower() not in {"y", "yes"}:
        print("Download cancelled.")
        return

    for label in labels:
        for split in splits:
            print(f"Downloading all {label}/{split} batches...")
            config = DatasetConfig(label=label, num_workers=16)
            fs = SeamlessInteractionFS(config=config)
            # NOTE(review): batch_idx=None presumably selects every batch of
            # the split, matching the message above - confirm against
            # SeamlessInteractionFS.download_batch_from_hf.
            fs.download_batch_from_hf(
                split=split,
                batch_idx=None,
            )

    print("✅ Downloaded complete dataset (~27TB)")
|
|
|
|
def main() -> None:
    """
    Demonstrate HuggingFace-based flexible download options.

    Prints the menu of available download options, then runs only the ~1GB
    sample download; the other options are listed for reference and must be
    called explicitly.
    """
    print("📦 HuggingFace Download Options:")
    print("1. Sample set (~1GB) - Traditional archive-based")
    print("2. Single batch (~50-100GB)")
    print("3. Multiple batches (~150GB+)")
    print("4. Different splits (improvised/naturalistic, train/dev/test)")
    print("5. Whole dataset (~27TB)")

    # Default to the smallest option so running the script is cheap.
    download_1gb_sample_archive()
|
|
|
|
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    main()
|
|