CompactAI committed
Commit f37b83c · verified · 1 Parent(s): bb0efe6

Upload 18 files

Files changed (2):
  1. dataset_evaluator.py +25 -13
  2. models/jobs.joblib +2 -2
dataset_evaluator.py CHANGED
@@ -328,30 +328,33 @@ def _get_dataset_size(dataset_id, load_kwargs):
     return 0
 
 
-def _streaming_download_with_progress(dataset_id, load_kwargs, progress_callback=None):
+def _streaming_download_with_progress(
+    dataset_id, load_kwargs, progress_callback=None, max_rows=None
+):
     """Download dataset using streaming with progress tracking."""
     import pandas as pd
 
     total_rows = _get_dataset_size(dataset_id, load_kwargs)
     print(f"[PROGRESS] Dataset size: {total_rows} rows", flush=True)
 
-    if total_rows > 0 and progress_callback:
-        progress_callback(0, total_rows, "fetching_info")
-        print(f"[PROGRESS] Initial callback: 0/{total_rows}", flush=True)
+    download_limit = max_rows if max_rows and max_rows < total_rows else total_rows
+    if progress_callback:
+        progress_callback(0, download_limit, "fetching_info")
+        print(f"[PROGRESS] Initial callback: 0/{download_limit}", flush=True)
 
     try:
         ds = load_dataset(dataset_id, split="train", streaming=True, **load_kwargs)
         rows = []
         for i, row in enumerate(tqdm(ds, desc="Downloading", unit="rows")):
             rows.append(row)
-            if progress_callback and total_rows > 0:
-                progress_callback(i + 1, total_rows, "downloading")
+            if progress_callback:
+                progress_callback(i + 1, download_limit, "downloading")
             if i % 100 == 0:
-                print(
-                    f"[PROGRESS] Downloaded {i + 1}/{total_rows} ({100 * (i + 1) / total_rows:.1f}%)",
-                    flush=True,
-                )
-        return rows, total_rows
+                print(f"[PROGRESS] Downloaded {i + 1}/{download_limit}", flush=True)
+            if max_rows and i + 1 >= max_rows:
+                print(f"[PROGRESS] Stopping at {i + 1} rows", flush=True)
+                break
+        return rows, min(len(rows), total_rows or len(rows))
     except Exception as e:
         print(f"[PROGRESS] Streaming failed: {e}", flush=True)
         pass
@@ -359,6 +362,9 @@ def _streaming_download_with_progress(dataset_id, load_kwargs, progress_callback
     try:
         url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
        df = pd.read_parquet(url)
+        if max_rows and max_rows < len(df):
+            df = df.head(max_rows)
+            print(f"[PROGRESS] Limited to first {max_rows} rows", flush=True)
         total = len(df)
         if progress_callback:
             progress_callback(0, total, "downloading")
@@ -436,7 +442,7 @@ def load_dataset_texts(
     if progress_callback:
         try:
             rows, total_rows = _streaming_download_with_progress(
-                dataset_id, load_kwargs, progress_callback
+                dataset_id, load_kwargs, progress_callback, max_samples
             )
         except Exception as e:
             fallback_error = None
@@ -445,6 +451,8 @@ def load_dataset_texts(
 
                 url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
                 df = pd.read_parquet(url)
+                if max_samples and max_samples < len(df):
+                    df = df.head(max_samples)
                 total_rows = len(df)
                 if progress_callback:
                     progress_callback(0, total_rows, "downloading")
@@ -467,7 +475,9 @@ def load_dataset_texts(
         try:
             ds = load_dataset(dataset_id, split="train", **load_kwargs)
             total_rows = len(ds)
-            rows = list(ds)
+            if max_samples and max_samples < total_rows:
+                total_rows = max_samples
+            rows = list(ds)[:max_samples] if max_samples else list(ds)
         except Exception as e:
             fallback_error = None
             try:
@@ -475,6 +485,8 @@ def load_dataset_texts(
 
                 url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
                 df = pd.read_parquet(url)
+                if max_samples and max_samples < len(df):
+                    df = df.head(max_samples)
                 total_rows = len(df)
                 rows = df.to_dict(orient="records")
             except Exception as e2:
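Below is a minimal usage sketch of the helper as it stands after this commit. It is illustrative only, not part of the commit: the dataset id, callback name, and row cap are made up, and the (done, total, stage) callback shape is inferred from the calls visible in the diff above.

from dataset_evaluator import _streaming_download_with_progress

def report(done, total, stage):
    # stage is "fetching_info" or "downloading", per the callbacks in the diff
    print(f"{stage}: {done}/{total}", flush=True)

# Hypothetical call: stream at most 500 rows of an example dataset
rows, total = _streaming_download_with_progress(
    "imdb", {}, progress_callback=report, max_rows=500
)
print(f"downloaded {len(rows)} rows (reported total: {total})")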
models/jobs.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:24c91be6078f7fd4303d7f28e8a7212cea5f2113e05ab1335dacb6382c62c21e
-size 7254
+oid sha256:508296d28eb9a3d5d1dcdf1f76ba87f5cf6d9daf6c26699ebc84464fc83d7976
+size 8380