File size: 1,566 Bytes
da76488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import argparse
import os
from pathlib import Path
from typing import Union


def validate_dataset(data_path: Union[str, Path]) -> bool:
    """
    Validate that each image in the dataset directory has a corresponding text file and vice versa.

    Only regular files directly inside ``data_path`` are inspected (no
    recursion). Files are paired by stem, i.e. the file name without its last
    suffix. Suffixes are compared case-insensitively for both images and text
    files. A summary and any problems found are printed to stdout.

    Args:
        data_path: Path to the dataset directory.

    Returns:
        True if every image stem has exactly one matching text file and every
        text stem has exactly one matching image, False otherwise.

    Raises:
        OSError: If ``data_path`` cannot be listed (for example it does not
            exist or is not a directory).
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    data_dir = Path(data_path)
    image_extensions = {'.png', '.jpg', '.jpeg', '.webp'}

    images = []
    texts = []

    for file in data_dir.iterdir():
        # A sub-directory named e.g. "foo.png" is not a dataset sample.
        if not file.is_file():
            continue
        # Lowercase once so "A.TXT" is treated like "a.JPG" already is.
        suffix = file.suffix.lower()
        if suffix in image_extensions:
            images.append(file.stem)
        elif suffix == '.txt':
            texts.append(file.stem)

    print(f"Found {len(images)} images and {len(texts)} text files")

    image_stems = set(images)
    text_stems = set(texts)

    # Sorted lists make the report deterministic across runs.
    missing_texts = sorted(image_stems - text_stems)
    missing_images = sorted(text_stems - image_stems)

    # Several files sharing one stem (a.png + a.jpg) make the pairing
    # ambiguous; report them instead of silently returning False.
    duplicate_images = sorted(stem for stem, count in Counter(images).items() if count > 1)
    duplicate_texts = sorted(stem for stem, count in Counter(texts).items() if count > 1)

    if missing_texts:
        print(f"Missing text files for: {missing_texts}")
    if missing_images:
        print(f"Missing images for: {missing_images}")
    if duplicate_images:
        print(f"Multiple images share the same name: {duplicate_images}")
    if duplicate_texts:
        print(f"Multiple text files share the same name: {duplicate_texts}")

    is_valid = not (missing_texts or missing_images or duplicate_images or duplicate_texts)

    # Only announce success when the return value agrees with it.
    if is_valid:
        print("Dataset structure is valid!")

    return is_valid

if __name__ == "__main__":
    # Command-line entry point: validate the directory given via --path.
    parser = argparse.ArgumentParser(description="Validate image-text dataset structure.")
    parser.add_argument(
        "--path",
        type=str,
        required=True,
        help="Path to the dataset directory to validate."
    )
    args = parser.parse_args()
    # Propagate the validation result as the process exit status (0 = valid,
    # 1 = invalid) so shells and CI pipelines can react to a failed check.
    raise SystemExit(0 if validate_dataset(args.path) else 1)