# Author: polyMoe — commit cb20efa
# update (app.py, graph.py, tools.py): added retrieval of question files,
# graph and tool handling + first test run (3 correct answers)
# std lib
import os
from pathlib import Path
# 3rd party imports
import pandas as pd
# 3rd party imports (huggingface_hub is an external package, not a local module)
from huggingface_hub import snapshot_download
DATA_DIR = Path(__file__).resolve().parent / "data"
def get_full_gaia_level1_data():
    """
    Download the full GAIA level 1 data and save it locally in data/.

    Authenticates with the first available token from the
    HF_FINAL_ASSIGNMENT_DRAFT or HF_TOKEN environment variables; when
    neither is set, the download is attempted anonymously (which works
    for public datasets).
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    # Prefer the assignment-specific token, then fall back to the generic one.
    token = os.getenv("HF_FINAL_ASSIGNMENT_DRAFT") or os.getenv("HF_TOKEN")
    download_args = dict(
        repo_id="gaia-benchmark/GAIA",
        repo_type="dataset",
        local_dir=DATA_DIR,
    )
    if token:
        download_args["token"] = token
    snapshot_download(**download_args)
def get_file_from_gaia_level1_data(task_id: str):
    """
    Given a GAIA level 1 task ID, return the matching attachment's path.

    The files used by the HF assignment live under data/2023/validation/.
    GAIA attachments are named after their task ID, so the lookup matches
    on the file name.

    Args:
        task_id: GAIA task identifier (e.g. a UUID string).

    Returns:
        The file path as a string, or None when no file name contains
        the task ID.
    """
    validation_dir = DATA_DIR / "2023" / "validation"
    # Recursive search; sorted() makes the result deterministic if several
    # files ever matched (task IDs are expected to be unique in practice).
    for path in sorted(validation_dir.rglob("*")):
        if path.is_file() and task_id in path.name:
            return str(path)
    return None
def get_question(task_id: str) -> str:
    """
    Given a GAIA level 1 task ID, return the corresponding question.

    Reads the level-1 metadata parquet under data/2023/validation and
    looks the question up by task ID.

    Args:
        task_id: GAIA task identifier.

    Returns:
        The question text, or an empty string (with a printed diagnostic)
        when the metadata file is missing or the task ID is unknown.
    """
    metadata_file = DATA_DIR / "2023" / "validation" / "metadata.level1.parquet"
    if not metadata_file.exists():
        print(f"Metadata file not found: {metadata_file}")
        return ""
    metadata_df = pd.read_parquet(metadata_file)
    matches = metadata_df.loc[metadata_df["task_id"] == task_id, "Question"]
    if matches.empty:
        # Previously an unknown task ID raised IndexError on .values[0];
        # fail soft instead, consistent with the missing-file branch.
        print(f"Task ID not found in metadata: {task_id}")
        return ""
    return matches.values[0]
def ensure_validation_data(base_dir: Path):
    """
    Ensure GAIA 2023 level1 validation files are present.
    Returns (ok: bool, error_message: str | None).
    """
    validation_dir = base_dir / "data" / "2023" / "validation"
    metadata_file = validation_dir / "metadata.level1.parquet"

    reason = _download_needed_reason(validation_dir, metadata_file)
    if reason is None:
        print("All expected validation files are present. Skipping data download.")
        return True, None

    print(f"{reason}. Downloading full GAIA level 1 data...")
    try:
        get_full_gaia_level1_data()
    except Exception as e:
        error_message = f"Error downloading GAIA level 1 data: {e}"
        print(error_message)
        return False, error_message
    print("Data download completed.")
    return True, None


def _download_needed_reason(validation_dir: Path, metadata_file: Path):
    """Return why a download is required, or None when data looks complete."""
    # Fast fail: missing or empty validation directory.
    if not validation_dir.exists() or not any(validation_dir.iterdir()):
        return f"Validation data not found in {validation_dir}"
    # Metadata is required to validate expected files.
    if not metadata_file.exists():
        return f"Metadata file not found: {metadata_file}. Cannot verify expected files"
    try:
        # Read only the needed column for speed/memory.
        metadata_df = pd.read_parquet(metadata_file, columns=["file_name"])
        expected_files = {
            str(name) for name in metadata_df["file_name"].dropna().unique()
        }
        present_files = {p.name for p in validation_dir.iterdir() if p.is_file()}
        missing_files = expected_files - present_files
    except Exception as e:
        return (
            f"Error reading metadata ({metadata_file}): {e}. "
            "Cannot verify expected files"
        )
    if missing_files:
        return f"Missing {len(missing_files)} expected validation files"
    return None
if __name__ == "__main__":
    # Ad-hoc smoke test: resolve the attachment path for a known task ID.
    # Uncomment to (re)download the dataset or look up the question text:
    # get_full_gaia_level1_data()
    sample_task_id = "cca530fc-4052-43b2-b130-b30968d8aa44"
    print(get_file_from_gaia_level1_data(sample_task_id))
    # print(get_question(sample_task_id))