File size: 1,566 Bytes
da76488 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | import argparse
import os
from pathlib import Path
from typing import Union
def validate_dataset(data_path: Union[str, Path]) -> bool:
    """
    Validate that each image in the dataset directory has a corresponding text file and vice versa.

    Entries are paired by file stem (name without the final suffix). Image
    suffixes are matched case-insensitively; the text suffix must be exactly
    ``.txt``. Only the direct children of the directory are inspected.

    Args:
        data_path: Path to the dataset directory.

    Returns:
        True if the dataset is valid, False otherwise. A dataset is invalid
        when an image has no text file, a text file has no image, or several
        images share the same stem (e.g. ``a.png`` and ``a.jpg``).

    Raises:
        OSError: Propagated from ``Path.iterdir`` when the directory cannot
            be listed (for example, it does not exist).
    """
    data_dir = Path(data_path)
    image_extensions = {'.png', '.jpg', '.jpeg', '.webp'}

    image_stems = set()
    duplicate_images = set()
    text_stems = set()
    image_count = 0
    text_count = 0

    for file in data_dir.iterdir():
        if file.suffix.lower() in image_extensions:
            image_count += 1
            # Two images with different extensions can share a stem; remember
            # those so they are reported instead of silently collapsed.
            if file.stem in image_stems:
                duplicate_images.add(file.stem)
            image_stems.add(file.stem)
        elif file.suffix == '.txt':
            text_count += 1
            text_stems.add(file.stem)

    print(f"Found {image_count} images and {text_count} text files")

    missing_texts = image_stems - text_stems
    missing_images = text_stems - image_stems

    if missing_texts:
        print(f"Missing text files for: {missing_texts}")
    if missing_images:
        print(f"Missing images for: {missing_images}")
    if duplicate_images:
        print(f"Multiple images share the same name: {duplicate_images}")

    # Duplicate stems make the image count exceed the text count even when
    # both set differences are empty, so they must also fail validation;
    # the success message is printed only when the result really is True.
    is_valid = not (missing_texts or missing_images or duplicate_images)
    if is_valid:
        print("Dataset structure is valid!")
    return is_valid
if __name__ == "__main__":
    # Command-line entry point: validate the directory given via --path.
    parser = argparse.ArgumentParser(description="Validate image-text dataset structure.")
    parser.add_argument(
        "--path",
        type=str,
        required=True,
        help="Path to the dataset directory to validate.",
    )
    args = parser.parse_args()
    # Surface the validation result as the process exit status (0 = valid,
    # 1 = invalid) so shell scripts and CI jobs can react to a bad dataset
    # instead of always seeing success.
    raise SystemExit(0 if validate_dataset(args.path) else 1)