Upload 18 files
Browse files
- dataset_evaluator.py +25 -13
- models/jobs.joblib +2 -2
dataset_evaluator.py
CHANGED
|
@@ -328,30 +328,33 @@ def _get_dataset_size(dataset_id, load_kwargs):
|
|
| 328 |
return 0
|
| 329 |
|
| 330 |
|
| 331 |
-
def _streaming_download_with_progress(
|
|
|
|
|
|
|
| 332 |
"""Download dataset using streaming with progress tracking."""
|
| 333 |
import pandas as pd
|
| 334 |
|
| 335 |
total_rows = _get_dataset_size(dataset_id, load_kwargs)
|
| 336 |
print(f"[PROGRESS] Dataset size: {total_rows} rows", flush=True)
|
| 337 |
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
|
|
|
| 341 |
|
| 342 |
try:
|
| 343 |
ds = load_dataset(dataset_id, split="train", streaming=True, **load_kwargs)
|
| 344 |
rows = []
|
| 345 |
for i, row in enumerate(tqdm(ds, desc="Downloading", unit="rows")):
|
| 346 |
rows.append(row)
|
| 347 |
-
if progress_callback
|
| 348 |
-
progress_callback(i + 1,
|
| 349 |
if i % 100 == 0:
|
| 350 |
-
print(
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
return rows, total_rows
|
| 355 |
except Exception as e:
|
| 356 |
print(f"[PROGRESS] Streaming failed: {e}", flush=True)
|
| 357 |
pass
|
|
@@ -359,6 +362,9 @@ def _streaming_download_with_progress(dataset_id, load_kwargs, progress_callback
|
|
| 359 |
try:
|
| 360 |
url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
|
| 361 |
df = pd.read_parquet(url)
|
|
|
|
|
|
|
|
|
|
| 362 |
total = len(df)
|
| 363 |
if progress_callback:
|
| 364 |
progress_callback(0, total, "downloading")
|
|
@@ -436,7 +442,7 @@ def load_dataset_texts(
|
|
| 436 |
if progress_callback:
|
| 437 |
try:
|
| 438 |
rows, total_rows = _streaming_download_with_progress(
|
| 439 |
-
dataset_id, load_kwargs, progress_callback
|
| 440 |
)
|
| 441 |
except Exception as e:
|
| 442 |
fallback_error = None
|
|
@@ -445,6 +451,8 @@ def load_dataset_texts(
|
|
| 445 |
|
| 446 |
url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
|
| 447 |
df = pd.read_parquet(url)
|
|
|
|
|
|
|
| 448 |
total_rows = len(df)
|
| 449 |
if progress_callback:
|
| 450 |
progress_callback(0, total_rows, "downloading")
|
|
@@ -467,7 +475,9 @@ def load_dataset_texts(
|
|
| 467 |
try:
|
| 468 |
ds = load_dataset(dataset_id, split="train", **load_kwargs)
|
| 469 |
total_rows = len(ds)
|
| 470 |
-
|
|
|
|
|
|
|
| 471 |
except Exception as e:
|
| 472 |
fallback_error = None
|
| 473 |
try:
|
|
@@ -475,6 +485,8 @@ def load_dataset_texts(
|
|
| 475 |
|
| 476 |
url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
|
| 477 |
df = pd.read_parquet(url)
|
|
|
|
|
|
|
| 478 |
total_rows = len(df)
|
| 479 |
rows = df.to_dict(orient="records")
|
| 480 |
except Exception as e2:
|
|
|
|
| 328 |
return 0
|
| 329 |
|
| 330 |
|
| 331 |
+
def _streaming_download_with_progress(
|
| 332 |
+
dataset_id, load_kwargs, progress_callback=None, max_rows=None
|
| 333 |
+
):
|
| 334 |
"""Download dataset using streaming with progress tracking."""
|
| 335 |
import pandas as pd
|
| 336 |
|
| 337 |
total_rows = _get_dataset_size(dataset_id, load_kwargs)
|
| 338 |
print(f"[PROGRESS] Dataset size: {total_rows} rows", flush=True)
|
| 339 |
|
| 340 |
+
download_limit = max_rows if max_rows and max_rows < total_rows else total_rows
|
| 341 |
+
if progress_callback:
|
| 342 |
+
progress_callback(0, download_limit, "fetching_info")
|
| 343 |
+
print(f"[PROGRESS] Initial callback: 0/{download_limit}", flush=True)
|
| 344 |
|
| 345 |
try:
|
| 346 |
ds = load_dataset(dataset_id, split="train", streaming=True, **load_kwargs)
|
| 347 |
rows = []
|
| 348 |
for i, row in enumerate(tqdm(ds, desc="Downloading", unit="rows")):
|
| 349 |
rows.append(row)
|
| 350 |
+
if progress_callback:
|
| 351 |
+
progress_callback(i + 1, download_limit, "downloading")
|
| 352 |
if i % 100 == 0:
|
| 353 |
+
print(f"[PROGRESS] Downloaded {i + 1}/{download_limit}", flush=True)
|
| 354 |
+
if max_rows and i + 1 >= max_rows:
|
| 355 |
+
print(f"[PROGRESS] Stopping at {i + 1} rows", flush=True)
|
| 356 |
+
break
|
| 357 |
+
return rows, min(len(rows), total_rows or len(rows))
|
| 358 |
except Exception as e:
|
| 359 |
print(f"[PROGRESS] Streaming failed: {e}", flush=True)
|
| 360 |
pass
|
|
|
|
| 362 |
try:
|
| 363 |
url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
|
| 364 |
df = pd.read_parquet(url)
|
| 365 |
+
if max_rows and max_rows < len(df):
|
| 366 |
+
df = df.head(max_rows)
|
| 367 |
+
print(f"[PROGRESS] Limited to first {max_rows} rows", flush=True)
|
| 368 |
total = len(df)
|
| 369 |
if progress_callback:
|
| 370 |
progress_callback(0, total, "downloading")
|
|
|
|
| 442 |
if progress_callback:
|
| 443 |
try:
|
| 444 |
rows, total_rows = _streaming_download_with_progress(
|
| 445 |
+
dataset_id, load_kwargs, progress_callback, max_samples
|
| 446 |
)
|
| 447 |
except Exception as e:
|
| 448 |
fallback_error = None
|
|
|
|
| 451 |
|
| 452 |
url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
|
| 453 |
df = pd.read_parquet(url)
|
| 454 |
+
if max_samples and max_samples < len(df):
|
| 455 |
+
df = df.head(max_samples)
|
| 456 |
total_rows = len(df)
|
| 457 |
if progress_callback:
|
| 458 |
progress_callback(0, total_rows, "downloading")
|
|
|
|
| 475 |
try:
|
| 476 |
ds = load_dataset(dataset_id, split="train", **load_kwargs)
|
| 477 |
total_rows = len(ds)
|
| 478 |
+
if max_samples and max_samples < total_rows:
|
| 479 |
+
total_rows = max_samples
|
| 480 |
+
rows = list(ds)[:max_samples] if max_samples else list(ds)
|
| 481 |
except Exception as e:
|
| 482 |
fallback_error = None
|
| 483 |
try:
|
|
|
|
| 485 |
|
| 486 |
url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
|
| 487 |
df = pd.read_parquet(url)
|
| 488 |
+
if max_samples and max_samples < len(df):
|
| 489 |
+
df = df.head(max_samples)
|
| 490 |
total_rows = len(df)
|
| 491 |
rows = df.to_dict(orient="records")
|
| 492 |
except Exception as e2:
|
models/jobs.joblib
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:508296d28eb9a3d5d1dcdf1f76ba87f5cf6d9daf6c26699ebc84464fc83d7976
|
| 3 |
+
size 8380
|