| |
| import os |
| from pathlib import Path |
|
|
| |
| import pandas as pd |
|
|
| |
| from huggingface_hub import snapshot_download |
|
|
# Local cache directory for the GAIA dataset, created next to this file.
DATA_DIR = Path(__file__).resolve().parent / "data"
|
|
|
|
def get_full_gaia_level1_data():
    """
    Download the full GAIA level 1 data and save it locally in data/.

    Uses ``HF_FINAL_ASSIGNMENT_DRAFT`` as the Hugging Face token when set,
    falling back to ``HF_TOKEN``; downloads anonymously when neither is set.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    # Prefer the assignment-specific token over the generic HF one.
    token = os.getenv("HF_FINAL_ASSIGNMENT_DRAFT") or os.getenv("HF_TOKEN")

    # Only pass `token` through when one is actually available.
    auth = {"token": token} if token else {}
    snapshot_download(
        repo_id="gaia-benchmark/GAIA",
        repo_type="dataset",
        local_dir=DATA_DIR,
        **auth,
    )
|
|
|
|
def get_file_from_gaia_level1_data(task_id: str) -> str | None:
    """
    Given a GAIA level 1 task ID, return the path of the matching local file.

    The files used for the HF test live under ``data/2023/validation/``.
    A file "matches" when its name contains ``task_id`` as a substring.

    Returns:
        The full path of the first matching file, or ``None`` when no
        file matches (including when the validation directory is absent —
        ``os.walk`` simply yields nothing in that case).
    """
    validation_dir = DATA_DIR / "2023" / "validation"
    for root, _, files in os.walk(validation_dir):
        for file in files:
            if task_id in file:
                return os.path.join(root, file)
    return None
|
|
|
|
def get_question(task_id: str) -> str:
    """
    Given a GAIA level 1 task ID, return the corresponding question.

    Reads the question from the local ``metadata.level1.parquet`` file.

    Returns:
        The question text, or "" when the metadata file is missing or the
        task_id does not appear in it (the original ``.values[0]`` crashed
        with IndexError on an unknown task_id).
    """
    metadata_file = os.path.join(
        DATA_DIR, "2023", "validation", "metadata.level1.parquet"
    )
    if not os.path.exists(metadata_file):
        print(f"Metadata file not found: {metadata_file}")
        return ""

    metadata_df = pd.read_parquet(metadata_file)
    matches = metadata_df.loc[metadata_df["task_id"] == task_id, "Question"]
    if matches.empty:
        # Unknown task_id: degrade gracefully instead of raising IndexError.
        print(f"Task ID not found in metadata: {task_id}")
        return ""
    return matches.values[0]
|
|
|
|
def ensure_validation_data(base_dir: Path):
    """
    Ensure GAIA 2023 level1 validation files are present.

    Checks ``base_dir/data/2023/validation`` for the level-1 metadata
    parquet and every file it lists; triggers a full download via
    ``get_full_gaia_level1_data()`` when anything is missing or unreadable.

    Returns (ok: bool, error_message: str | None).
    """
    validation_dir = base_dir / "data" / "2023" / "validation"
    metadata_file = validation_dir / "metadata.level1.parquet"
    need_download_reason = None

    if not validation_dir.exists() or not any(validation_dir.iterdir()):
        need_download_reason = f"Validation data not found in {validation_dir}"
    elif not metadata_file.exists():
        need_download_reason = (
            f"Metadata file not found: {metadata_file}. Cannot verify expected files"
        )
    else:
        try:
            metadata_df = pd.read_parquet(metadata_file, columns=["file_name"])
            # dropna() removes only NaN; tasks without an attachment can
            # carry an empty-string file_name, which would always count as
            # "missing" and force a pointless re-download — filter blanks too.
            expected_files = {
                str(name)
                for name in metadata_df["file_name"].dropna().unique()
                if str(name).strip()
            }

            present_files = {p.name for p in validation_dir.iterdir() if p.is_file()}
            missing_files = expected_files - present_files

            if missing_files:
                need_download_reason = (
                    f"Missing {len(missing_files)} expected validation files"
                )
        except Exception as e:
            # Unreadable metadata (corrupt file, missing parquet engine, ...):
            # fall back to re-downloading rather than trusting stale data.
            need_download_reason = (
                f"Error reading metadata ({metadata_file}): {e}. "
                "Cannot verify expected files"
            )

    if need_download_reason is not None:
        print(f"{need_download_reason}. Downloading full GAIA level 1 data...")
        try:
            get_full_gaia_level1_data()
            print("Data download completed.")
        except Exception as e:
            error_message = f"Error downloading GAIA level 1 data: {e}"
            print(error_message)
            return False, error_message
    else:
        print("All expected validation files are present. Skipping data download.")

    return True, None
|
|
|
|
| if __name__ == "__main__": |
| |
| print(get_file_from_gaia_level1_data("cca530fc-4052-43b2-b130-b30968d8aa44")) |
| |
|
|