import os

from typing import Union, Optional, Any, List, Callable

from ..core.logging import logger
from .benchmark import CodingBenchmark
from ..core.module_utils import extract_code_blocks
from .lcb_utils.code_generation import (
    CodeGenerationProblem,
    load_code_generation_dataset
)
from .lcb_utils.test_output_prediction import (
    TestOutputPredictionProblem,
    load_test_prediction_dataset
)
from .lcb_utils.code_execution import (
    CodeExecutionProblem,
    load_code_execution_dataset
)
from .lcb_utils.evaluation import (
    codegen_metrics,
    test_output_metrics,
    code_execution_metrics,
    reliability_guard
)
from .lcb_utils.utils import extract_test_output_code, extract_execution_code


VALID_SCENARIO = ["code_generation", "test_output_prediction", "code_execution"]


class LiveCodeBench(CodingBenchmark):
    """Benchmark class for evaluating LLM capabilities on real-world programming tasks.

    LiveCodeBench provides a framework for evaluating several scenarios of code-related tasks:

    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: generating code that executes correctly

    The benchmark supports different evaluation modes and metrics, and can be customized
    with parameters such as timeouts, sample date ranges, and processing options.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute.
        version: Release version of the dataset to use.
        num_process: Number of processes to use for evaluation.
        start_date: Only include problems released on or after this date.
        end_date: Only include problems released on or before this date.
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution").
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution.
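
    Example:
        A minimal usage sketch (assumes the base class loads the dataset during
        construction and that ``graph`` is an async workflow callable)::

            bench = LiveCodeBench(scenario="code_generation", k=[1, 5], mode="test")
            example = bench._test_data[0]
            score = await bench.async_evaluate(graph, example)  # pass@1 for this example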
| | """ |
| |
|
| | def __init__( |
| | self, |
| | path: str = None, |
| | mode: str = "all", |
| | timeout: int = 60, |
| | k: Union[int, list] = 1, |
| | num_process: int = 6, |
| | scenario: str = "code_generation", |
| | version: str = "release_latest", |
| | start_date: str = None, |
| | end_date: str = None, |
| | use_cot_for_execution: bool = False, |
| | **kwargs |
| | ): |
| | path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench") |
| | self.k = k |
| | self.version = version |
| | self.num_process = num_process |
| | self.start_date = start_date |
| | self.end_date = end_date |
| | self.scenario = scenario |
| | self.name = 'livecodebench' |
| | self.use_cot_for_execution = use_cot_for_execution |
| | assert scenario in VALID_SCENARIO, f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}." |
| | super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs) |

    def _load_data(self):
        if self.mode == "train" or self.mode == "all":
            self._train_data = None
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")
        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem, CodeExecutionProblem]) -> str:
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem, CodeExecutionProblem]) -> dict:
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Run the workflow ``graph`` on ``example`` and return its pass@1 score."""
        prompt, entry_point = example.question_content, example.starter_code
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).
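
        Example:
            A minimal sketch of the expected input/output shapes, where ``problem``
            is a loaded ``CodeGenerationProblem`` and the solution string is purely
            illustrative::

                bench = LiveCodeBench(scenario="code_generation", k=1)
                result = bench.evaluate(
                    prediction="```python\ndef add(a, b):\n    return a + b\n```",
                    label=problem.get_evaluation_sample(),
                )
                # result -> {"pass@1": 1.0} if the solution passes all tests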
| | """ |
| | |
| | |
| | |
| | |
| | prediction, label = self._check_evaluation_inputs(prediction, label) |
| | k_list = [self.k] if isinstance(self.k, int) else self.k |
| |
|
        if self.scenario == "code_generation":
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,
                generations_list=[solutions],
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
            # Cache the most recent raw evaluation outputs for downstream inspection.
            self.met = metrics
            self.res = results
            self.metadatas = metadatas
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

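        # pass@k estimates the probability that at least one of k sampled
        # generations passes every test case for a problem.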
        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k


class AFlowLiveCodeBench(CodingBenchmark):
    """AFlow-compatible variant of the LiveCodeBench benchmark.

    Like ``LiveCodeBench``, it provides a framework for evaluating several scenarios of
    code-related tasks:

    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: generating code that executes correctly

    The benchmark supports different evaluation modes and metrics, and can be customized
    with parameters such as timeouts, sample date ranges, and processing options.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute.
        version: Release version of the dataset to use.
        num_process: Number of processes to use for evaluation.
        start_date: Only include problems released on or after this date.
        end_date: Only include problems released on or before this date.
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution").
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution.
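
    Example:
        A minimal usage sketch (``my_graph`` is a hypothetical async workflow
        callable taking a prompt and an entry point)::

            bench = AFlowLiveCodeBench(scenario="code_generation", mode="test")
            example = bench._test_data[0]
            score = await bench.async_evaluate(my_graph, example)  # pass@1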
| | """ |
| |
|
| | def __init__( |
| | self, |
| | path: str = None, |
| | mode: str = "all", |
| | timeout: int = 60, |
| | k: Union[int, list] = 1, |
| | num_process: int = 6, |
| | scenario: str = "code_generation", |
| | version: str = "release_latest", |
| | start_date: str = None, |
| | end_date: str = None, |
| | use_cot_for_execution: bool = False, |
| | **kwargs |
| | ): |
| | path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench") |
| | self.k = k |
| | self.version = version |
| | self.num_process = num_process |
| | self.start_date = start_date |
| | self.end_date = end_date |
| | self.scenario = scenario |
| | self.use_cot_for_execution = use_cot_for_execution |
| | assert scenario in VALID_SCENARIO, f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}." |
| | super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs) |

    def _load_data(self):
        if self.mode == "train" or self.mode == "all":
            self._train_data = None
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")
        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem, CodeExecutionProblem]) -> str:
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem, CodeExecutionProblem]) -> dict:
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Run the workflow ``graph`` on ``example`` and return its pass@1 score."""
        prompt, entry_point = example.question_content, example.question_title
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Return the private test cases of the problem whose title matches ``entry_point``.

        A few special-cased entry points map to empty test strings; returns None
        when no matching problem is found.
        """
        hardcoded_cases = {
            "remove_odd": "",
            "replace_spaces": "",
            "snake_to_camel": "",
            "Split": "",
            "swap_List": "",
            "square_Sum": "",
            "sort_sublists": "",
            "unique_sublists": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]

        # The dev/test splits may be None depending on the loading mode, so fall
        # back to empty lists before concatenating.
        for case in (self._dev_data or []) + (self._test_data or []):
            if case.question_title == entry_point:
                return case.private_test_cases

        return None

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k
        if self.scenario == "code_generation":
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,
                generations_list=[solutions],
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k