dreamlessx committed on
Commit 0d8a73f · verified · 1 Parent(s): 4fe4688

Upload landmarkdiff/data_version.py with huggingface_hub

Files changed (1)
  1. landmarkdiff/data_version.py +297 -0
landmarkdiff/data_version.py ADDED
@@ -0,0 +1,297 @@
"""Dataset versioning and provenance tracking.

Tracks dataset composition, checksums, and lineage for reproducible training.
Creates manifest files that record exactly which data was used for each
training run.

Usage:
    from landmarkdiff.data_version import DataManifest

    manifest = DataManifest.from_directory("data/training")
    manifest.save("data/training/manifest.json")

    # Later, verify the data hasn't changed
    manifest2 = DataManifest.from_directory("data/training")
    assert manifest.checksum == manifest2.checksum
"""

from __future__ import annotations

import hashlib
import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


@dataclass
class FileEntry:
    """Metadata for a single dataset file."""

    path: str
    size_bytes: int
    checksum: str  # md5 of first 64KB (fast approximate)
    procedure: str = ""

    @staticmethod
    def from_path(filepath: Path, base_dir: Path | None = None) -> FileEntry:
        """Create entry from a file path."""
        rel = str(filepath.relative_to(base_dir)) if base_dir else str(filepath)
        size = filepath.stat().st_size

        # Fast checksum: first 64KB
        h = hashlib.md5()
        with open(filepath, "rb") as f:
            h.update(f.read(65536))

        # Infer procedure from filename
        proc = ""
        for p in ["rhinoplasty", "blepharoplasty", "rhytidectomy", "orthognathic"]:
            if p in filepath.name or p in str(filepath.parent):
                proc = p
                break

        return FileEntry(path=rel, size_bytes=size, checksum=h.hexdigest(), procedure=proc)


@dataclass
class DataManifest:
    """Dataset manifest for versioning and reproducibility.

    Attributes:
        version: Manifest format version.
        created_at: Creation timestamp.
        root_dir: Root directory of the dataset.
        files: List of file entries.
        metadata: Additional dataset metadata.
    """

    version: str = "1.0"
    created_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    root_dir: str = ""
    files: list[FileEntry] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def total_files(self) -> int:
        return len(self.files)

    @property
    def total_size_bytes(self) -> int:
        return sum(f.size_bytes for f in self.files)

    @property
    def total_size_mb(self) -> float:
        return self.total_size_bytes / (1024 * 1024)

    @property
    def checksum(self) -> str:
        """Compute aggregate checksum from all file checksums."""
        h = hashlib.md5()
        for f in sorted(self.files, key=lambda x: x.path):
            h.update(f"{f.path}:{f.checksum}:{f.size_bytes}".encode())
        return h.hexdigest()

    @property
    def by_procedure(self) -> dict[str, int]:
        """Count files by procedure."""
        counts: dict[str, int] = {}
        for f in self.files:
            key = f.procedure or "unknown"
            counts[key] = counts.get(key, 0) + 1
        return counts

    @staticmethod
    def from_directory(
        directory: str | Path,
        extensions: set[str] | None = None,
        include_patterns: list[str] | None = None,
    ) -> DataManifest:
        """Create manifest from a directory of dataset files.

        Args:
            directory: Path to dataset directory.
            extensions: File extensions to include (default: image types).
            include_patterns: Glob patterns to include.

        Returns:
            DataManifest with entries for all matching files.
        """
        directory = Path(directory)
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        if extensions is None:
            extensions = {".png", ".jpg", ".jpeg", ".webp", ".npy", ".npz"}

        files: list[FileEntry] = []
        if include_patterns:
            for pattern in include_patterns:
                for fp in sorted(directory.glob(pattern)):
                    if fp.is_file():
                        files.append(FileEntry.from_path(fp, base_dir=directory))
        else:
            for fp in sorted(directory.rglob("*")):
                if fp.is_file() and fp.suffix.lower() in extensions:
                    files.append(FileEntry.from_path(fp, base_dir=directory))

        manifest = DataManifest(
            root_dir=str(directory),
            files=files,
            metadata={
                "extensions": sorted(extensions),
                "host": _get_hostname(),
            },
        )
        return manifest

    def save(self, path: str | Path) -> Path:
        """Save manifest to JSON.

        Args:
            path: Output file path.

        Returns:
            Path to saved manifest.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "version": self.version,
            "created_at": self.created_at,
            "root_dir": self.root_dir,
            "checksum": self.checksum,
            "total_files": self.total_files,
            "total_size_mb": round(self.total_size_mb, 2),
            "by_procedure": self.by_procedure,
            "metadata": self.metadata,
            "files": [
                {
                    "path": f.path,
                    "size_bytes": f.size_bytes,
                    "checksum": f.checksum,
                    "procedure": f.procedure,
                }
                for f in self.files
            ],
        }

        with open(path, "w") as f:
            json.dump(data, f, indent=2)

        return path

    @staticmethod
    def load(path: str | Path) -> DataManifest:
        """Load manifest from JSON.

        Args:
            path: Path to manifest file.

        Returns:
            DataManifest instance.
        """
        with open(path) as f:
            data = json.load(f)

        files = [
            FileEntry(
                path=fe["path"],
                size_bytes=fe["size_bytes"],
                checksum=fe["checksum"],
                procedure=fe.get("procedure", ""),
            )
            for fe in data.get("files", [])
        ]

        return DataManifest(
            version=data.get("version", "1.0"),
            created_at=data.get("created_at", ""),
            root_dir=data.get("root_dir", ""),
            files=files,
            metadata=data.get("metadata", {}),
        )

    def verify(self, directory: str | Path | None = None) -> tuple[bool, list[str]]:
        """Verify dataset matches this manifest.

        Args:
            directory: Directory to verify (default: original root_dir).

        Returns:
            (all_match, list_of_issues)
        """
        directory = Path(directory or self.root_dir)
        issues: list[str] = []

        for entry in self.files:
            fp = directory / entry.path
            if not fp.exists():
                issues.append(f"Missing: {entry.path}")
                continue

            actual_size = fp.stat().st_size
            if actual_size != entry.size_bytes:
                issues.append(
                    f"Size mismatch: {entry.path} "
                    f"(expected {entry.size_bytes}, got {actual_size})"
                )

            # Check checksum
            h = hashlib.md5()
            with open(fp, "rb") as f:
                h.update(f.read(65536))
            if h.hexdigest() != entry.checksum:
                issues.append(f"Checksum mismatch: {entry.path}")

        return len(issues) == 0, issues

    def diff(self, other: DataManifest) -> dict[str, list[str]]:
        """Compare two manifests.

        Returns:
            Dict with 'added', 'removed', 'modified' file lists.
        """
        self_files = {f.path: f for f in self.files}
        other_files = {f.path: f for f in other.files}

        self_paths = set(self_files.keys())
        other_paths = set(other_files.keys())

        added = sorted(other_paths - self_paths)
        removed = sorted(self_paths - other_paths)
        modified = sorted(
            p for p in self_paths & other_paths
            if self_files[p].checksum != other_files[p].checksum
        )

        return {"added": added, "removed": removed, "modified": modified}

    def summary(self) -> str:
        """Human-readable summary."""
        lines = [
            f"Dataset Manifest v{self.version}",
            f"  Root: {self.root_dir}",
            f"  Files: {self.total_files}",
            f"  Size: {self.total_size_mb:.1f} MB",
            f"  Checksum: {self.checksum}",
        ]
        procs = self.by_procedure
        if procs:
            lines.append("  By procedure:")
            for proc, count in sorted(procs.items()):
                lines.append(f"    {proc}: {count}")
        return "\n".join(lines)


def _get_hostname() -> str:
    """Get hostname safely."""
    try:
        import socket
        return socket.gethostname()
    except Exception:
        return "unknown"