import argparse
import os
import sys
from pathlib import Path
from typing import Union
|
|
|
|
def validate_dataset(data_path: Union[str, Path]) -> bool:
    """
    Validate that each image in the dataset directory has a corresponding text file and vice versa.

    Files are paired by stem (the file name without its extension). Only
    regular files located directly inside the directory are considered, and
    extensions are matched case-insensitively for both images and text files.
    A summary of the findings is printed to stdout.

    Args:
        data_path: Path to the dataset directory.

    Returns:
        True if the dataset is valid, False otherwise. A dataset is invalid
        when the path is not a directory, when an image has no text file (or
        a text file has no image), or when several images or several text
        files share one stem, because the pairing is then ambiguous.
    """
    data_dir = Path(data_path)
    if not data_dir.is_dir():
        # A missing or non-directory path can never be a valid dataset;
        # report it instead of letting iterdir() raise.
        print(f"Dataset directory not found: {data_dir}")
        return False

    image_extensions = {'.png', '.jpg', '.jpeg', '.webp'}

    images = []
    texts = []
    for file in data_dir.iterdir():
        # Skip sub-directories (and anything else that is not a regular
        # file) even if their name happens to end in a known extension.
        if not file.is_file():
            continue
        suffix = file.suffix.lower()
        if suffix in image_extensions:
            images.append(file.stem)
        elif suffix == '.txt':
            texts.append(file.stem)

    print(f"Found {len(images)} images and {len(texts)} text files")

    image_stems = set(images)
    text_stems = set(texts)
    missing_texts = image_stems - text_stems
    missing_images = text_stems - image_stems

    # A stem that occurs more than once on either side (e.g. a.png + a.jpg)
    # cannot be paired one-to-one, so it makes the dataset invalid.
    duplicate_images = _repeated_stems(images)
    duplicate_texts = _repeated_stems(texts)

    if missing_texts:
        print(f"Missing text files for: {missing_texts}")
    if missing_images:
        print(f"Missing images for: {missing_images}")
    if duplicate_images:
        print(f"Multiple images share the same name: {sorted(duplicate_images)}")
    if duplicate_texts:
        print(f"Multiple text files share the same name: {sorted(duplicate_texts)}")

    is_valid = (
        not missing_texts
        and not missing_images
        and not duplicate_images
        and not duplicate_texts
    )
    # Only announce success when the returned verdict is actually True.
    if is_valid:
        print("Dataset structure is valid!")

    return is_valid


def _repeated_stems(stems):
    """Return the set of stems that occur more than once in *stems*."""
    seen = set()
    repeated = set()
    for stem in stems:
        if stem in seen:
            repeated.add(stem)
        else:
            seen.add(stem)
    return repeated
|
|
if __name__ == "__main__":
    # Command-line entry point: validate the directory given via --path.
    parser = argparse.ArgumentParser(description="Validate image-text dataset structure.")
    parser.add_argument(
        "--path",
        type=str,
        required=True,
        help="Path to the dataset directory to validate.",
    )
    args = parser.parse_args()
    # Propagate the verdict as the process exit status (0 = valid,
    # 1 = invalid) so shell scripts and CI jobs can react to a failed
    # validation instead of always seeing success.
    sys.exit(0 if validate_dataset(args.path) else 1)