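"""Evaluation helpers for AFlow-style workflow optimization.

This module wraps AFlowEvaluator so an optimizer can score its current workflow
graph on validation data during the search and on test data afterwards.
"""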
from ...evaluators.aflow_evaluator import AFlowEvaluator
from ...core.callbacks import suppress_logger_info
from ...core.logging import logger


class EvaluationUtils:
    """Utilities for evaluating an optimizer's workflow graph on validation and test data."""

    def __init__(self, root_path: str):
        self.root_path = root_path

    async def evaluate_graph_async(self, optimizer, validation_n, data, initial=False):
        """Run `validation_n` validation passes over the optimizer's current graph and
        return the mean score. Each pass's result is appended to `data` and saved to disk."""
        evaluator = AFlowEvaluator(llm=optimizer.executor_llm)
        sum_score = 0

        for _ in range(validation_n):
            # Evaluate once on the validation split, suppressing per-case log output.
            with suppress_logger_info():
                score, avg_cost, total_cost, all_failed = await evaluator.graph_evaluate_async(
                    optimizer.benchmark, optimizer.graph, is_test=False
                )
            cur_round = optimizer.round if initial else optimizer.round + 1
            new_data = optimizer.data_utils.create_result_data(cur_round, score, avg_cost, total_cost)
            data.append(new_data)

            # Persist the accumulated results after every pass.
            result_path = optimizer.data_utils.get_results_file_path(self.root_path)
            optimizer.data_utils.save_results(result_path, data)

            sum_score += score

            # Stop early if every benchmark case failed in this pass.
            if all_failed:
                logger.warning(f"All test cases failed in round {cur_round}. Stopping evaluation for this round.")
                break

        return sum_score / validation_n

    async def evaluate_graph_test_async(self, optimizer):
        """Evaluate the optimizer's current graph on the test split and return (score, avg_cost, total_cost)."""
        evaluator = AFlowEvaluator(llm=optimizer.executor_llm)
        with suppress_logger_info():
            score, avg_cost, total_cost, _all_failed = await evaluator.graph_evaluate_async(
                optimizer.benchmark, optimizer.graph, is_test=True
            )
        return score, avg_cost, total_cost
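
# Illustrative usage sketch (not part of the module's API): assumes an `optimizer`
# object exposing `executor_llm`, `benchmark`, `graph`, `round`, and `data_utils`,
# as the methods above require, and that these calls are awaited from async code.
#
#     utils = EvaluationUtils(root_path="path/to/workspace")
#     avg_score = await utils.evaluate_graph_async(optimizer, validation_n=3, data=[])
#     test_score, avg_cost, total_cost = await utils.evaluate_graph_test_async(optimizer)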