# api-debug-env / scripts / baseline_inference.py
# Uploaded with huggingface_hub by yadnyeshkolte (revision 36dac03, verified)
"""
Baseline inference script for the API Integration Debugging Environment.
This script demonstrates an LLM-powered agent interacting with the environment
using the OpenAI API. It runs all 3 tasks (easy, medium, hard) and reports
baseline scores.
Usage:
# Set your OpenAI API key
export OPENAI_API_KEY=your-key-here
# Run baseline
python scripts/baseline_inference.py
# Or specify a server URL
python scripts/baseline_inference.py --server-url http://localhost:8000
"""
import argparse
import json
import os
import sys
from typing import Any, Dict, List, Optional
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from models import ApiDebugAction, ApiDebugObservation
from scenarios import get_all_task_ids, get_scenario
from server.api_debug_env_environment import ApiDebugEnvironment
def run_rule_based_baseline(task_id: str) -> Dict[str, Any]:
    """
    Run a deterministic, LLM-free baseline agent.

    The agent sweeps three inspection actions over every available target,
    in order: logs first, then configs, then endpoints. It never submits a
    fix, so the result measures the reward signal for exploration-only
    behavior.

    Returns a summary dict with the final grade, accumulated reward,
    step count, and issue found/fixed tallies.
    """
    env = ApiDebugEnvironment(task_id=task_id)
    obs = env.reset()
    cumulative_reward = 0.0
    # One pass per inspection kind; each pass visits every target that is
    # available at the start of that pass, stopping early if the episode ends.
    for probe in ("inspect_logs", "inspect_config", "inspect_endpoint"):
        for target in obs.available_targets:
            if obs.done:
                break
            obs = env.step(ApiDebugAction(action_type=probe, target=target))
            cumulative_reward += obs.reward
    score = env.grade()
    return {
        "task_id": task_id,
        "score": score,
        "total_reward": round(cumulative_reward, 4),
        "steps_used": env._state.step_count,
        "issues_found": len(env._issues_found),
        "issues_fixed": len(env._issues_fixed),
        "issues_total": len(env._scenario.issues) if env._scenario else 0,
    }
def _format_observation(env: "ApiDebugEnvironment", obs: "ApiDebugObservation") -> str:
    """Render the current observation as the user-turn text sent to the LLM."""
    text = f"""Step {env._state.step_count}/{env._scenario.max_steps if env._scenario else '?'}
Remaining steps: {obs.remaining_steps}
Issues found: {obs.issues_found}/{obs.issues_total}
Issues fixed: {obs.issues_fixed}/{obs.issues_total}
Last action result: {obs.action_result}"""
    if obs.logs:
        # Plain string (was an f-string with no placeholders).
        text += "\nLogs:\n" + "\n".join(obs.logs)
    if obs.config_snapshot:
        text += f"\nConfig: {json.dumps(obs.config_snapshot, indent=2)}"
    if obs.api_response:
        text += f"\nAPI Response: {json.dumps(obs.api_response, indent=2)}"
    if obs.hints:
        text += f"\nHints: {'; '.join(obs.hints)}"
    return text
def run_llm_baseline(task_id: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Run an LLM-powered baseline agent using the OpenAI API.

    Each step, the environment observation is rendered as text, appended to
    the chat history, and the model is asked for a JSON action. Any failure
    (missing package/key, API error, malformed JSON) degrades gracefully:
    missing package or key falls back to the rule-based baseline; a per-step
    error falls back to a safe ``inspect_logs`` action.

    Args:
        task_id: Scenario identifier (easy/medium/hard).
        api_key: OpenAI key; defaults to the OPENAI_API_KEY env var.

    Returns:
        Summary dict with grade, accumulated reward, steps, and issue tallies.
    """
    try:
        from openai import OpenAI
    except ImportError:
        print("OpenAI package not installed. Running rule-based baseline instead.")
        return run_rule_based_baseline(task_id)
    key = api_key or os.environ.get("OPENAI_API_KEY")
    if not key:
        print("No OPENAI_API_KEY set. Running rule-based baseline instead.")
        return run_rule_based_baseline(task_id)
    client = OpenAI(api_key=key)
    env = ApiDebugEnvironment(task_id=task_id)
    obs = env.reset()
    total_reward = 0.0
    system_prompt = f"""You are an API debugging agent. Your task: {obs.task_description}
Available actions:
- inspect_logs: Read error logs for a service
- inspect_config: See the configuration of a service
- inspect_endpoint: Test-call an endpoint
- submit_fix: Submit a config fix (requires fix_payload dict)
Available targets: {obs.available_targets}
Total issues to fix: {obs.issues_total}
Respond with JSON: {{"action_type": "...", "target": "...", "fix_payload": {{...}} }}
Only include fix_payload when action_type is "submit_fix"."""
    messages = [{"role": "system", "content": system_prompt}]
    while not obs.done:
        messages.append({"role": "user", "content": _format_observation(env, obs)})
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                temperature=0.2,
                max_tokens=500,
                response_format={"type": "json_object"},
            )
            action_json = json.loads(response.choices[0].message.content)
            # json_object mode should yield a dict, but guard anyway: a
            # non-dict (e.g. a bare list) would otherwise raise an opaque
            # AttributeError on .get below.
            if not isinstance(action_json, dict):
                raise ValueError(f"expected a JSON object, got {type(action_json).__name__}")
            messages.append({"role": "assistant", "content": json.dumps(action_json)})
            action = ApiDebugAction(
                action_type=action_json.get("action_type", "inspect_logs"),
                target=action_json.get("target", obs.available_targets[0] if obs.available_targets else ""),
                fix_payload=action_json.get("fix_payload"),
            )
        except Exception as e:
            # Any per-step failure degrades to a harmless exploration action
            # so the episode can still finish and be graded.
            print(f" LLM error: {e}. Falling back to inspect_logs.")
            action = ApiDebugAction(
                action_type="inspect_logs",
                target=obs.available_targets[0] if obs.available_targets else "",
            )
        obs = env.step(action)
        total_reward += obs.reward
    score = env.grade()
    return {
        "task_id": task_id,
        "score": score,
        "total_reward": round(total_reward, 4),
        "steps_used": env._state.step_count,
        "issues_found": len(env._issues_found),
        "issues_fixed": len(env._issues_fixed),
        "issues_total": len(env._scenario.issues) if env._scenario else 0,
    }
def main():
    """Parse CLI arguments, run the chosen baseline on each task, print a report."""
    parser = argparse.ArgumentParser(description="Baseline inference for API Debug Env")
    parser.add_argument("--mode", choices=["rule", "llm"], default="rule",
                        help="Baseline mode: 'rule' for rule-based, 'llm' for LLM-powered")
    parser.add_argument("--api-key", type=str, default=None,
                        help="OpenAI API key (or set OPENAI_API_KEY env var)")
    parser.add_argument("--task", type=str, default=None,
                        help="Run specific task only (easy/medium/hard)")
    args = parser.parse_args()
    # Visual separators, hoisted so every banner renders identically.
    heavy_rule = "=" * 60
    light_rule = "─" * 40
    print(heavy_rule)
    print("API Integration Debugging — Baseline Inference")
    print(heavy_rule)
    task_ids = [args.task] if args.task else get_all_task_ids()
    all_results = {}
    for task_id in task_ids:
        print(f"\n{light_rule}")
        print(f"Task: {task_id}")
        print(light_rule)
        runner = run_llm_baseline if args.mode == "llm" else run_rule_based_baseline
        result = runner(task_id, args.api_key) if args.mode == "llm" else runner(task_id)
        all_results[task_id] = result
        print(f" Score: {result['score']}")
        print(f" Reward: {result['total_reward']}")
        print(f" Steps: {result['steps_used']}")
        print(f" Issues found: {result['issues_found']}/{result['issues_total']}")
        print(f" Issues fixed: {result['issues_fixed']}/{result['issues_total']}")
    print(f"\n{heavy_rule}")
    print("Summary")
    print(heavy_rule)
    for tid, res in all_results.items():
        print(f" {tid:8s} score={res['score']:.4f} fixed={res['issues_fixed']}/{res['issues_total']}")
    avg_score = sum(r["score"] for r in all_results.values()) / len(all_results)
    print(f"\n Average score: {avg_score:.4f}")
    return all_results
# Allow direct execution: `python scripts/baseline_inference.py [options]`.
if __name__ == "__main__":
    main()