| import json |
| import pandas as pd |
| import streamlit as st |
|
|
|
|
| def clean_git_patch(git_patch): |
| if 'diff' in git_patch: |
| git_patch = git_patch[git_patch.index('diff'):] |
| return git_patch |
|
|
| def reformat_history(history): |
| new_history = [] |
| cur_turn = [] |
| for i, (action, observation) in enumerate(history): |
| |
| |
| if 'source' not in action: |
| return history |
|
|
| if i == 0: |
| assert action['action'] == 'message' |
| assert action['source'] == 'user' |
| |
| continue |
|
|
| if action['source'] == 'agent': |
| |
| if len(cur_turn) == 1: |
| new_history.append(cur_turn[0]) |
| elif len(cur_turn) == 2: |
| |
| agent_msg_action, agent_msg_obs = cur_turn[0] |
| assert agent_msg_obs['observation'] == 'null' |
| user_msg_action, user_msg_obs = cur_turn[1] |
| assert user_msg_obs['observation'] == 'null' |
| |
| user_msg_action_as_obs = { |
| 'observation': 'message', |
| 'source': 'user', |
| 'content': user_msg_action['args']['content'], |
| } |
| new_history.append((agent_msg_action, user_msg_action_as_obs)) |
| elif len(cur_turn) == 0: |
| pass |
| else: |
| st.write(f'Unsupported #interactions per iteration: {len(cur_turn)}') |
| st.json(cur_turn) |
| raise ValueError(f'Unsupported #interactions per iteration: {len(cur_turn)}') |
|
|
| |
| cur_turn = [] |
| cur_turn.append((action, observation)) |
| return new_history |
|
|
| def load_df_from_selected_filepaths(select_filepaths): |
| data = [] |
| if isinstance(select_filepaths, str): |
| select_filepaths = [select_filepaths] |
| for filepath in select_filepaths: |
| with open(filepath, 'r') as f: |
| for line in f.readlines(): |
| d = json.loads(line) |
| |
| if 'git_patch' in d: |
| d['git_patch'] = clean_git_patch(d['git_patch']) |
| d['history'] = reformat_history(d['history']) |
| data.append(d) |
| df = pd.DataFrame(data) |
| return df |
|
|
|
|
| def agg_stats(df): |
| stats = [] |
| for idx, entry in df.iterrows(): |
| history = entry['history'] |
| test_result = entry['test_result']['result'] |
| error = entry.get('error', None) |
| if error is not None: |
| agent_stuck_in_loop = "Agent got stuck in a loop" in error |
| contains_error = bool(error) and not agent_stuck_in_loop |
| else: |
| agent_stuck_in_loop = False |
| contains_error = False |
|
|
| |
| apply_test_patch_success = entry['test_result']['metadata'][ |
| '3_apply_test_patch_success' |
| ] |
| empty_generation = bool(entry['git_patch'].strip() == '') |
| test_cmd_exit_error = bool( |
| not entry['test_result']['metadata']['4_run_test_command_success'] |
| ) |
|
|
| |
| if 'fine_grained_report' in entry: |
| resolved_value = entry['fine_grained_report']['resolved'] |
| test_result['resolved'] = resolved_value if resolved_value is not None else False |
| test_result['test_timeout'] = entry['fine_grained_report']['test_timeout'] |
| test_result['test_errored'] = entry['fine_grained_report']['test_errored'] |
| test_result['patch_applied'] = entry['fine_grained_report']['applied'] |
| else: |
| test_result['resolved'] = ( |
| bool(test_result.get('resolved', False)) and not empty_generation |
| ) |
|
|
| |
| obs_lengths = [] |
| for _, (_, obs) in enumerate(history): |
| if 'content' in obs: |
| obs_lengths.append(len(obs['content'])) |
| obs_lengths = pd.Series(obs_lengths) |
|
|
| metrics = entry.get('metrics', {}) |
| cost = metrics.get('accumulated_cost', None) |
|
|
| d = { |
| 'idx': idx, |
| 'instance_id': entry['instance_id'], |
| 'agent_class': entry['metadata']['agent_class'], |
| 'model_name': entry['metadata']['model_name'], |
| 'n_turns': len(history), |
| **test_result, |
| 'agent_stuck_in_loop': agent_stuck_in_loop, |
| 'contains_error': contains_error, |
| 'cost': cost, |
| 'empty_generation': empty_generation, |
| 'apply_test_patch_success': apply_test_patch_success, |
| 'test_cmd_exit_error': test_cmd_exit_error, |
| 'obs_len_avg': round(obs_lengths.mean(), 0), |
| 'obs_len_std': round(obs_lengths.std(), 0), |
| 'obs_len_max': round(obs_lengths.max(), 0), |
| } |
| if 'swe_instance' in entry: |
| d.update( |
| { |
| 'repo': entry['swe_instance']['repo'], |
| } |
| ) |
| stats.append(d) |
| return pd.DataFrame(stats) |
|
|
| @st.cache_data |
| def get_resolved_stats_from_filepath(filepath): |
| df = load_df_from_selected_filepaths(filepath) |
| stats = agg_stats(df) |
| if not len(stats): |
| return { |
| 'success_rate': None, |
| 'n_solved': None, |
| 'n_error': None, |
| 'total': None, |
| 'total_cost': None, |
| } |
| tot_cost = stats['cost'].sum() |
| resolved = stats['resolved'].sum() / len(stats) |
| num_contains_error = stats['contains_error'].sum() |
| num_agent_stuck_in_loop = stats['agent_stuck_in_loop'].sum() |
| tot_instances = len(stats) |
| return { |
| 'success_rate': resolved, |
| 'n_solved': stats['resolved'].sum(), |
| 'n_error': num_contains_error, |
| 'n_stuck_in_loop': num_agent_stuck_in_loop, |
| 'total': tot_instances, |
| 'total_cost': tot_cost, |
| } |
|
|