Spaces:
Running
Running
| """ | |
| Baseline inference script for the API Integration Debugging Environment. | |
| This script demonstrates an LLM-powered agent interacting with the environment | |
| using the OpenAI API. It runs all 3 tasks (easy, medium, hard) and reports | |
| baseline scores. | |
| Usage: | |
| # Set your OpenAI API key | |
| export OPENAI_API_KEY=your-key-here | |
| # Run baseline | |
| python scripts/baseline_inference.py | |
# Or run the LLM-powered baseline
python scripts/baseline_inference.py --mode llm
| """ | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| from typing import Any, Dict, List, Optional | |
| # Add parent directory to path | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from models import ApiDebugAction, ApiDebugObservation | |
| from scenarios import get_all_task_ids, get_scenario | |
| from server.api_debug_env_environment import ApiDebugEnvironment | |
def run_rule_based_baseline(task_id: str) -> Dict[str, Any]:
    """
    Run a simple rule-based baseline agent (no LLM needed).
    Strategy:
    1. Inspect all logs
    2. Inspect all configs
    3. Test all endpoints
    (Does not attempt fixes — tests reward signal for exploration-only behavior)
    """
    env = ApiDebugEnvironment(task_id=task_id)
    obs = env.reset()
    total_reward = 0.0
    # Sweep each inspection action over every available target, one phase at
    # a time. The target list is re-read at the start of each phase so each
    # sweep reflects the most recent observation, and the episode stops as
    # soon as the environment reports done.
    for phase_action in ("inspect_logs", "inspect_config", "inspect_endpoint"):
        for target in obs.available_targets:
            if obs.done:
                break
            obs = env.step(ApiDebugAction(action_type=phase_action, target=target))
            total_reward += obs.reward
    score = env.grade()
    return {
        "task_id": task_id,
        "score": score,
        "total_reward": round(total_reward, 4),
        "steps_used": env._state.step_count,
        "issues_found": len(env._issues_found),
        "issues_fixed": len(env._issues_fixed),
        "issues_total": len(env._scenario.issues) if env._scenario else 0,
    }
def _format_observation(env: "ApiDebugEnvironment", obs: "ApiDebugObservation") -> str:
    """Render one environment observation as the user-turn text for the LLM.

    Always shows step/issue counters; appends logs, config snapshot, API
    response, and hints only when the last action surfaced them.
    """
    obs_text = f"""Step {env._state.step_count}/{env._scenario.max_steps if env._scenario else '?'}
Remaining steps: {obs.remaining_steps}
Issues found: {obs.issues_found}/{obs.issues_total}
Issues fixed: {obs.issues_fixed}/{obs.issues_total}
Last action result: {obs.action_result}"""
    if obs.logs:
        # Plain string literal — the original used an f-string with no placeholder.
        obs_text += "\nLogs:\n" + "\n".join(obs.logs)
    if obs.config_snapshot:
        obs_text += f"\nConfig: {json.dumps(obs.config_snapshot, indent=2)}"
    if obs.api_response:
        obs_text += f"\nAPI Response: {json.dumps(obs.api_response, indent=2)}"
    if obs.hints:
        obs_text += f"\nHints: {'; '.join(obs.hints)}"
    return obs_text


def _parse_action(action_json: Dict[str, Any], obs: "ApiDebugObservation") -> ApiDebugAction:
    """Convert the LLM's JSON reply into an ApiDebugAction with safe defaults.

    Missing fields degrade gracefully: unknown action -> inspect_logs,
    missing target -> first available target (or empty string).
    """
    return ApiDebugAction(
        action_type=action_json.get("action_type", "inspect_logs"),
        target=action_json.get("target", obs.available_targets[0] if obs.available_targets else ""),
        fix_payload=action_json.get("fix_payload"),
    )


def run_llm_baseline(task_id: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Run an LLM-powered baseline agent using OpenAI API.
    The LLM reads observations and decides what to do next.

    Falls back to the rule-based baseline when the `openai` package or an
    API key is unavailable, so the script always produces a result.

    Args:
        task_id: Scenario identifier to run (e.g. easy/medium/hard).
        api_key: OpenAI key; defaults to the OPENAI_API_KEY env var.

    Returns:
        Summary dict: score, total_reward, steps_used, and issue counters.
    """
    try:
        from openai import OpenAI
    except ImportError:
        print("OpenAI package not installed. Running rule-based baseline instead.")
        return run_rule_based_baseline(task_id)
    key = api_key or os.environ.get("OPENAI_API_KEY")
    if not key:
        print("No OPENAI_API_KEY set. Running rule-based baseline instead.")
        return run_rule_based_baseline(task_id)
    client = OpenAI(api_key=key)
    env = ApiDebugEnvironment(task_id=task_id)
    obs = env.reset()
    total_reward = 0.0
    system_prompt = f"""You are an API debugging agent. Your task: {obs.task_description}
Available actions:
- inspect_logs: Read error logs for a service
- inspect_config: See the configuration of a service
- inspect_endpoint: Test-call an endpoint
- submit_fix: Submit a config fix (requires fix_payload dict)
Available targets: {obs.available_targets}
Total issues to fix: {obs.issues_total}
Respond with JSON: {{"action_type": "...", "target": "...", "fix_payload": {{...}} }}
Only include fix_payload when action_type is "submit_fix"."""
    messages: List[Dict[str, str]] = [{"role": "system", "content": system_prompt}]
    while not obs.done:
        messages.append({"role": "user", "content": _format_observation(env, obs)})
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                temperature=0.2,
                max_tokens=500,
                response_format={"type": "json_object"},
            )
            action_json = json.loads(response.choices[0].message.content)
            messages.append({"role": "assistant", "content": json.dumps(action_json)})
            action = _parse_action(action_json, obs)
        except Exception as e:
            # Deliberate best-effort: any API or parse failure degrades to a
            # harmless exploratory action rather than aborting the episode.
            print(f" LLM error: {e}. Falling back to inspect_logs.")
            action = ApiDebugAction(
                action_type="inspect_logs",
                target=obs.available_targets[0] if obs.available_targets else "",
            )
        obs = env.step(action)
        total_reward += obs.reward
    score = env.grade()
    return {
        "task_id": task_id,
        "score": score,
        "total_reward": round(total_reward, 4),
        "steps_used": env._state.step_count,
        "issues_found": len(env._issues_found),
        "issues_fixed": len(env._issues_fixed),
        "issues_total": len(env._scenario.issues) if env._scenario else 0,
    }
def main():
    """Parse CLI arguments, run the chosen baseline on each task, and print a summary."""
    parser = argparse.ArgumentParser(description="Baseline inference for API Debug Env")
    parser.add_argument("--mode", choices=["rule", "llm"], default="rule",
                        help="Baseline mode: 'rule' for rule-based, 'llm' for LLM-powered")
    parser.add_argument("--api-key", type=str, default=None,
                        help="OpenAI API key (or set OPENAI_API_KEY env var)")
    parser.add_argument("--task", type=str, default=None,
                        help="Run specific task only (easy/medium/hard)")
    args = parser.parse_args()
    print("=" * 60)
    print("API Integration Debugging — Baseline Inference")
    print("=" * 60)
    # Either a single user-selected task or the full registered set.
    selected_tasks = [args.task] if args.task else get_all_task_ids()
    all_results = {}
    for current_task in selected_tasks:
        print(f"\n{'─' * 40}")
        print(f"Task: {current_task}")
        print(f"{'─' * 40}")
        if args.mode == "llm":
            outcome = run_llm_baseline(current_task, args.api_key)
        else:
            outcome = run_rule_based_baseline(current_task)
        all_results[current_task] = outcome
        print(f" Score: {outcome['score']}")
        print(f" Reward: {outcome['total_reward']}")
        print(f" Steps: {outcome['steps_used']}")
        print(f" Issues found: {outcome['issues_found']}/{outcome['issues_total']}")
        print(f" Issues fixed: {outcome['issues_fixed']}/{outcome['issues_total']}")
    print(f"\n{'=' * 60}")
    print("Summary")
    print(f"{'=' * 60}")
    for tid, res in all_results.items():
        print(f" {tid:8s} score={res['score']:.4f} fixed={res['issues_fixed']}/{res['issues_total']}")
    avg_score = sum(r["score"] for r in all_results.values()) / len(all_results)
    print(f"\n Average score: {avg_score:.4f}")
    return all_results


if __name__ == "__main__":
    main()