| |
| import os |
| from pathlib import Path |
|
|
| |
| import pandas as pd |
|
|
| |
| from huggingface_hub import snapshot_download |
|
|
# Local cache directory for the GAIA dataset, created next to this file.
DATA_DIR = Path(__file__).resolve().parent / "data"
|
|
|
|
def get_full_gaia_level1_data():
    """
    Download the full GAIA level 1 data and save it locally in data/.

    Uses ``HF_FINAL_ASSIGNMENT_DRAFT`` as the Hugging Face token when set,
    falling back to ``HF_TOKEN``; downloads anonymously when neither is set.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    # Prefer the assignment-specific token over the generic HF one.
    token = os.getenv("HF_FINAL_ASSIGNMENT_DRAFT") or os.getenv("HF_TOKEN")

    # Only pass `token` through when one is actually available.
    auth = {"token": token} if token else {}
    snapshot_download(
        repo_id="gaia-benchmark/GAIA",
        repo_type="dataset",
        local_dir=DATA_DIR,
        **auth,
    )
|
|
|
|
def get_file_from_gaia_level1_data(task_id: str) -> str | None:
    """
    Given a GAIA level 1 task ID, return the path of the matching local file.

    The files used for the HF test live under ``data/2023/validation/``.
    A file "matches" when its name contains ``task_id`` as a substring.

    Returns:
        The full path of the first matching file, or ``None`` when no
        file matches (including when the validation directory is absent —
        ``os.walk`` simply yields nothing in that case).
    """
    validation_dir = DATA_DIR / "2023" / "validation"
    for root, _, files in os.walk(validation_dir):
        for file in files:
            if task_id in file:
                return os.path.join(root, file)
    return None
|
|
|
|
def get_question(task_id: str) -> str:
    """
    Given a GAIA level 1 task ID, return the corresponding question.

    Reads the question from the local ``metadata.level1.parquet`` file.

    Returns:
        The question text, or "" when the metadata file is missing or the
        task_id does not appear in it (the original ``.values[0]`` crashed
        with IndexError on an unknown task_id).
    """
    metadata_file = os.path.join(
        DATA_DIR, "2023", "validation", "metadata.level1.parquet"
    )
    if not os.path.exists(metadata_file):
        print(f"Metadata file not found: {metadata_file}")
        return ""

    metadata_df = pd.read_parquet(metadata_file)
    matches = metadata_df.loc[metadata_df["task_id"] == task_id, "Question"]
    if matches.empty:
        # Unknown task_id: degrade gracefully instead of raising IndexError.
        print(f"Task ID not found in metadata: {task_id}")
        return ""
    return matches.values[0]
|
|
|
|
def ensure_validation_data(base_dir: Path):
    """
    Ensure GAIA 2023 level1 validation files are present.

    Checks ``base_dir/data/2023/validation`` for the level-1 metadata
    parquet and every file it lists; triggers a full download via
    ``get_full_gaia_level1_data()`` when anything is missing or unreadable.

    Returns (ok: bool, error_message: str | None).
    """
    validation_dir = base_dir / "data" / "2023" / "validation"
    metadata_file = validation_dir / "metadata.level1.parquet"
    need_download_reason = None

    if not validation_dir.exists() or not any(validation_dir.iterdir()):
        need_download_reason = f"Validation data not found in {validation_dir}"
    elif not metadata_file.exists():
        need_download_reason = (
            f"Metadata file not found: {metadata_file}. Cannot verify expected files"
        )
    else:
        try:
            metadata_df = pd.read_parquet(metadata_file, columns=["file_name"])
            # dropna() removes only NaN; tasks without an attachment can
            # carry an empty-string file_name, which would always count as
            # "missing" and force a pointless re-download — filter blanks too.
            expected_files = {
                str(name)
                for name in metadata_df["file_name"].dropna().unique()
                if str(name).strip()
            }

            present_files = {p.name for p in validation_dir.iterdir() if p.is_file()}
            missing_files = expected_files - present_files

            if missing_files:
                need_download_reason = (
                    f"Missing {len(missing_files)} expected validation files"
                )
        except Exception as e:
            # Unreadable metadata (corrupt file, missing parquet engine, ...):
            # fall back to re-downloading rather than trusting stale data.
            need_download_reason = (
                f"Error reading metadata ({metadata_file}): {e}. "
                "Cannot verify expected files"
            )

    if need_download_reason is not None:
        print(f"{need_download_reason}. Downloading full GAIA level 1 data...")
        try:
            get_full_gaia_level1_data()
            print("Data download completed.")
        except Exception as e:
            error_message = f"Error downloading GAIA level 1 data: {e}"
            print(error_message)
            return False, error_message
    else:
        print("All expected validation files are present. Skipping data download.")

    return True, None
|
|
|
|
| if __name__ == "__main__": |
| |
| print(get_file_from_gaia_level1_data("cca530fc-4052-43b2-b130-b30968d8aa44")) |
| |
|
|