CompactAI committed
Commit f37b83c · verified · 1 Parent(s): bb0efe6

Upload 18 files

Files changed (2):
  1. dataset_evaluator.py +25 -13
  2. models/jobs.joblib +2 -2
dataset_evaluator.py CHANGED
@@ -328,30 +328,33 @@ def _get_dataset_size(dataset_id, load_kwargs):
     return 0
 
 
-def _streaming_download_with_progress(dataset_id, load_kwargs, progress_callback=None):
+def _streaming_download_with_progress(
+    dataset_id, load_kwargs, progress_callback=None, max_rows=None
+):
     """Download dataset using streaming with progress tracking."""
     import pandas as pd
 
     total_rows = _get_dataset_size(dataset_id, load_kwargs)
     print(f"[PROGRESS] Dataset size: {total_rows} rows", flush=True)
 
-    if total_rows > 0 and progress_callback:
-        progress_callback(0, total_rows, "fetching_info")
-        print(f"[PROGRESS] Initial callback: 0/{total_rows}", flush=True)
+    download_limit = max_rows if max_rows and max_rows < total_rows else total_rows
+    if progress_callback:
+        progress_callback(0, download_limit, "fetching_info")
+        print(f"[PROGRESS] Initial callback: 0/{download_limit}", flush=True)
 
     try:
         ds = load_dataset(dataset_id, split="train", streaming=True, **load_kwargs)
         rows = []
         for i, row in enumerate(tqdm(ds, desc="Downloading", unit="rows")):
             rows.append(row)
-            if progress_callback and total_rows > 0:
-                progress_callback(i + 1, total_rows, "downloading")
+            if progress_callback:
+                progress_callback(i + 1, download_limit, "downloading")
             if i % 100 == 0:
-                print(
-                    f"[PROGRESS] Downloaded {i + 1}/{total_rows} ({100 * (i + 1) / total_rows:.1f}%)",
-                    flush=True,
-                )
-        return rows, total_rows
+                print(f"[PROGRESS] Downloaded {i + 1}/{download_limit}", flush=True)
+            if max_rows and i + 1 >= max_rows:
+                print(f"[PROGRESS] Stopping at {i + 1} rows", flush=True)
+                break
+        return rows, min(len(rows), total_rows or len(rows))
     except Exception as e:
         print(f"[PROGRESS] Streaming failed: {e}", flush=True)
         pass
@@ -359,6 +362,9 @@ def _streaming_download_with_progress(dataset_id, load_kwargs, progress_callback
     try:
         url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
        df = pd.read_parquet(url)
+        if max_rows and max_rows < len(df):
+            df = df.head(max_rows)
+            print(f"[PROGRESS] Limited to first {max_rows} rows", flush=True)
         total = len(df)
         if progress_callback:
             progress_callback(0, total, "downloading")
@@ -436,7 +442,7 @@ def load_dataset_texts(
     if progress_callback:
         try:
             rows, total_rows = _streaming_download_with_progress(
-                dataset_id, load_kwargs, progress_callback
+                dataset_id, load_kwargs, progress_callback, max_samples
             )
         except Exception as e:
             fallback_error = None
@@ -445,6 +451,8 @@ def load_dataset_texts(
 
                 url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
                 df = pd.read_parquet(url)
+                if max_samples and max_samples < len(df):
+                    df = df.head(max_samples)
                 total_rows = len(df)
                 if progress_callback:
                     progress_callback(0, total_rows, "downloading")
@@ -467,7 +475,9 @@ def load_dataset_texts(
         try:
             ds = load_dataset(dataset_id, split="train", **load_kwargs)
             total_rows = len(ds)
-            rows = list(ds)
+            if max_samples and max_samples < total_rows:
+                total_rows = max_samples
+            rows = list(ds)[:max_samples] if max_samples else list(ds)
         except Exception as e:
             fallback_error = None
             try:
@@ -475,6 +485,8 @@ def load_dataset_texts(
 
                 url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
                 df = pd.read_parquet(url)
+                if max_samples and max_samples < len(df):
+                    df = df.head(max_samples)
                 total_rows = len(df)
                 rows = df.to_dict(orient="records")
             except Exception as e2:
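Below is a minimal usage sketch of the helper as it stands after this commit. It is illustrative only, not part of the commit: the dataset id, callback name, and row cap are made up, and the (done, total, stage) callback shape is inferred from the calls visible in the diff above.

from dataset_evaluator import _streaming_download_with_progress

def report(done, total, stage):
    # stage is "fetching_info" or "downloading", per the callbacks in the diff
    print(f"{stage}: {done}/{total}", flush=True)

# Hypothetical call: stream at most 500 rows of an example dataset
rows, total = _streaming_download_with_progress(
    "imdb", {}, progress_callback=report, max_rows=500
)
print(f"downloaded {len(rows)} rows (reported total: {total})")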
models/jobs.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:24c91be6078f7fd4303d7f28e8a7212cea5f2113e05ab1335dacb6382c62c21e
-size 7254
+oid sha256:508296d28eb9a3d5d1dcdf1f76ba87f5cf6d9daf6c26699ebc84464fc83d7976
+size 8380