# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Scenario definitions for the API Integration Debugging Environment.

Each scenario models a realistic multi-service API ecosystem with:
- Service dependency graphs (upstream/downstream relationships)
- Cascading failures (upstream bugs propagate downstream)
- Dynamic logs that update when issues are fixed
- Expanded issue pools for seed-based random subset selection
"""

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple
import random


@dataclass
class Issue:
    """A single issue in an API integration scenario."""

    issue_id: str
    service: str
    description: str
    expected_fix: Dict[str, Any]
    fix_key: str  # The key in the config that needs fixing
    log_hint: str  # Log line that hints at this issue
    # --- Fields for cascading failures ---
    # Issues that must be fixed before this one can be diagnosed
    depends_on: List[str] = field(default_factory=list)
    # service -> error message caused by this issue being unfixed
    cascade_effects: Dict[str, str] = field(default_factory=dict)
    # Issue category: configuration, authentication, networking, protocol
    category: str = "configuration"
    # Severity: error, warning, critical
    severity: str = "error"
    # Detailed explanation of why this issue occurs (for grading diagnosis quality)
    root_cause_explanation: str = ""


@dataclass
class ServiceNode:
    """A node in the service dependency graph."""

    name: str
    # Services this one calls (upstream dependencies)
    depends_on: List[str] = field(default_factory=list)
    # healthy, degraded, error, unreachable
    health_status: str = "degraded"


@dataclass
class Scenario:
    """A complete API debugging scenario with dependency graph."""

    task_id: str
    difficulty: str
    description: str
    max_steps: int
    services: List[str]
    configs: Dict[str, Dict[str, Any]]
    logs: Dict[str, List[str]]
    issues: List[Issue]
    # Service dependency graph, keyed by service name
    service_graph: Dict[str, ServiceNode] = field(default_factory=dict)
    # issue_id -> {service: [log lines emitted once that issue is fixed]}
    dynamic_logs: Dict[str, Dict[str, List[str]]] = field(default_factory=dict)
    # Optimal order to fix issues (for strategy scoring)
    optimal_fix_order: List[str] = field(default_factory=list)
    # Additional scenario context for the agent
    context: str = ""


def _scenario_builders() -> Dict[str, Any]:
    """Return the task_id -> builder mapping shared by the public lookups."""
    return {
        "easy": _easy_scenario,
        "medium": _medium_scenario,
        "hard": _hard_scenario,
    }


def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
    """
    Load a scenario by task ID with optional randomization.

    Args:
        task_id: One of 'easy', 'medium', 'hard'
        seed: Optional seed for deterministic but varied scenarios.
            When provided, selects a random subset of issues from the pool
            and randomizes log order. When None, returns the canonical scenario.

    Raises:
        ValueError: If task_id is not a known scenario.
    """
    scenario_builders = _scenario_builders()
    if task_id not in scenario_builders:
        raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(scenario_builders.keys())}")
    return scenario_builders[task_id](seed=seed)


def get_all_task_ids() -> List[str]:
    """Return all available task IDs."""
    return list(_scenario_builders())


def _pending_dependencies(
    issue: Issue, by_id: Dict[str, Issue], chosen_ids: set
) -> List[Issue]:
    """Return `issue` preceded by its not-yet-chosen (transitive) dependencies."""
    ordered: List[Issue] = []
    seen = set(chosen_ids)

    def visit(node: Issue) -> None:
        if node.issue_id in seen:
            return
        # Mark before recursing so a dependency cycle cannot loop forever.
        seen.add(node.issue_id)
        for dep_id in node.depends_on:
            dep = by_id.get(dep_id)
            if dep is not None:  # dependencies outside the pool are ignored
                visit(dep)
        ordered.append(node)

    visit(issue)
    return ordered


def _select_issues(pool: List[Issue], count: int, rng: random.Random) -> List[Issue]:
    """
    Select a random subset of issues from a pool, respecting dependencies.

    An issue is only selected together with every dependency it has inside
    the pool, so the returned subset never contains an issue whose
    prerequisite was left out. At most `count` issues are returned, in
    shuffled order.
    """
    if count >= len(pool):
        selected = list(pool)
    else:
        by_id = {issue.issue_id: issue for issue in pool}
        available = list(pool)
        selected = []
        chosen_ids: set = set()
        while len(selected) < count and available:
            issue = rng.choice(available)
            available.remove(issue)
            batch = _pending_dependencies(issue, by_id, chosen_ids)
            # Skip candidates whose dependency chain does not fit in the
            # remaining slots; adding them and truncating afterwards could
            # drop the dependency while keeping the dependent issue.
            if len(selected) + len(batch) > count:
                continue
            for member in batch:
                selected.append(member)
                chosen_ids.add(member.issue_id)
                if member in available:
                    available.remove(member)

    # Shuffle the order of the selected issues
    rng.shuffle(selected)
    return selected


def _randomize_scenario(scenario: Scenario, seed: int) -> Scenario:
    """
    Apply seed-based randomization to a scenario (mutates and returns it).

    Shuffles each service's log lines and shifts the calendar date used in
    the logs. One date is drawn per scenario so that every service's logs
    stay on the same day; the time-of-day part of each timestamp is kept.
    """
    rng = random.Random(seed)

    # Shuffle log entries for each service
    for service_logs in scenario.logs.values():
        rng.shuffle(service_logs)

    day = rng.randint(20, 28)
    today = f"2026-03-{day:02d}"
    yesterday = f"2026-03-{day - 1:02d}"

    for service, log_list in scenario.logs.items():
        # Single-pass substitution: split on the "today" marker, rewrite the
        # "yesterday" marker inside each piece, then re-join. Chaining two
        # str.replace calls would re-replace freshly written dates whenever
        # the new date equals one of the markers.
        scenario.logs[service] = [
            today.join(
                piece.replace("2026-03-24", yesterday)
                for piece in log_line.split("2026-03-25")
            )
            for log_line in log_list
        ]

    return scenario


# ─── Easy Scenario ───────────────────────────────────────────────────────────


def _easy_scenario(seed: Optional[int] = None) -> Scenario:
    """
    Easy: Payment API integration failures.

    Agent must diagnose auth + content-type issues with clear log signals.
    Issue pool has 4 possible issues; canonical scenario uses 2.
    """
    # Full issue pool (4 issues, canonical uses 2)
    issue_pool = [
        Issue(
            issue_id="easy_auth",
            service="payment_client",
            description="Missing Authorization header — payment gateway requires Bearer token authentication",
            expected_fix={"headers.Authorization": "Bearer "},
            fix_key="headers.Authorization",
            log_hint="Missing or invalid Authorization header",
            category="authentication",
            severity="critical",
            root_cause_explanation=(
                "The payment_client is missing the Authorization header entirely. "
                "The payment_gateway requires Bearer token auth on all /process requests. "
                "This results in HTTP 401 on every payment attempt."
            ),
            cascade_effects={
                "payment_gateway": "All requests from payment_client rejected with 401"
            },
        ),
        Issue(
            issue_id="easy_content_type",
            service="payment_client",
            description="Wrong Content-Type header (text/plain instead of application/json)",
            expected_fix={"headers.Content-Type": "application/json"},
            fix_key="headers.Content-Type",
            log_hint="Content-Type must be application/json",
            category="protocol",
            severity="error",
            root_cause_explanation=(
                "The payment_client sends Content-Type: text/plain, but the gateway "
                "only accepts application/json. This causes HTTP 415 Unsupported Media Type. "
                "The gateway cannot parse the request body."
            ),
            cascade_effects={
                "payment_gateway": "Request body parsing fails for payment_client requests"
            },
        ),
        Issue(
            issue_id="easy_timeout",
            service="payment_client",
            description="Timeout set too low (5s) for payment processing that takes 8-12s",
            expected_fix={"timeout": 30},
            fix_key="timeout",
            log_hint="Request timed out after 5s",
            category="networking",
            severity="error",
            root_cause_explanation=(
                "The payment_client has timeout=5s, but payment processing at the gateway "
                "takes 8-12s for fraud checks. Legitimate payments are timing out."
            ),
        ),
        Issue(
            issue_id="easy_base_url",
            service="payment_client",
            description="Base URL pointing to deprecated v1 endpoint instead of v2",
            expected_fix={"base_url": "https://api.paymentgateway.com/v2"},
            fix_key="base_url",
            log_hint="API v1 is deprecated",
            category="configuration",
            severity="warning",
            root_cause_explanation=(
                "The payment_client uses /v1 which is deprecated and returning 301 redirects. "
                "The gateway v2 endpoint has different request schemas, causing deserialization errors."
            ),
        ),
    ]

    # Select issues based on seed
    if seed is not None:
        rng = random.Random(seed)
        issues = _select_issues(issue_pool, 2, rng)
    else:
        issues = issue_pool[:2]  # Canonical: auth + content_type

    # Healthy baseline config; each selected issue then breaks one part of it.
    configs = {
        "payment_client": {
            "base_url": "https://api.paymentgateway.com/v2",
            "headers": {
                "Authorization": "Bearer pay_token_abc123",
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
            "timeout": 30,
            "retry_count": 3,
        },
        "payment_gateway": {
            "endpoint": "/process",
            "method": "POST",
            "required_headers": ["Authorization", "Content-Type"],
            "accepted_content_types": ["application/json"],
            "auth_scheme": "Bearer",
            "processing_time_ms": "8000-12000",
        },
    }

    # Apply broken config for each selected issue
    for issue in issues:
        if issue.issue_id == "easy_auth":
            # The broken state is a client with no Authorization header at all
            configs["payment_client"]["headers"].pop("Authorization", None)
        elif issue.issue_id == "easy_content_type":
            configs["payment_client"]["headers"]["Content-Type"] = "text/plain"
        elif issue.issue_id == "easy_timeout":
            configs["payment_client"]["timeout"] = 5
        elif issue.issue_id == "easy_base_url":
            configs["payment_client"]["base_url"] = "https://api.paymentgateway.com/v1"

    # Build logs based on selected issues. Values echoed in the logs are read
    # from the effective (possibly broken) config so logs and config agree.
    client_cfg = configs["payment_client"]
    client_headers = client_cfg["headers"]
    client_logs = [
        f"[INFO] 2026-03-25T10:15:20Z Payment client initialized with base_url={client_cfg['base_url']}",
    ]
    gateway_logs = [
        "[INFO] 2026-03-25T10:15:20Z Gateway ready, accepting application/json with Bearer auth",
    ]

    for issue in issues:
        if issue.issue_id == "easy_auth":
            client_logs.extend([
                "[ERROR] 2026-03-25T10:15:23Z POST /process -> 401 Unauthorized",
                "[ERROR] 2026-03-25T10:15:23Z Response: {'error': 'Missing or invalid Authorization header'}",
                "[WARN] 2026-03-25T10:15:22Z Request headers: "
                f"Content-Type={client_headers['Content-Type']}, Accept={client_headers['Accept']}",
            ])
            gateway_logs.append(
                "[WARN] 2026-03-25T10:15:23Z Rejected request: no Authorization header present"
            )
        elif issue.issue_id == "easy_content_type":
            client_logs.extend([
                "[ERROR] 2026-03-25T10:15:24Z POST /process -> 415 Unsupported Media Type",
                "[ERROR] 2026-03-25T10:15:24Z Response: {'error': 'Content-Type must be application/json'}",
            ])
            gateway_logs.append(
                "[WARN] 2026-03-25T10:15:24Z Rejected request: unsupported Content-Type 'text/plain'"
            )
        elif issue.issue_id == "easy_timeout":
            client_logs.extend([
                "[ERROR] 2026-03-25T10:15:30Z POST /process -> Request timed out after 5s",
                "[WARN] 2026-03-25T10:15:30Z Payment processing takes 8-12s for fraud verification",
            ])
            gateway_logs.append(
                "[INFO] 2026-03-25T10:15:30Z Processing payment... estimated time: 10s"
            )
        elif issue.issue_id == "easy_base_url":
            client_logs.extend([
                "[ERROR] 2026-03-25T10:15:21Z GET /v1/status -> 301 Moved Permanently",
                "[WARN] 2026-03-25T10:15:21Z API v1 is deprecated, migrate to /v2",
            ])
            gateway_logs.append(
                "[WARN] 2026-03-25T10:15:21Z Deprecated v1 endpoint accessed"
            )

    # Dynamic logs: what changes after fixing each issue
    dynamic_logs = {}
    for issue in issues:
        if issue.issue_id == "easy_auth":
            dynamic_logs["easy_auth"] = {
                "payment_client": ["[INFO] Authorization header set. Retrying request..."],
                "payment_gateway": ["[INFO] Authentication successful for payment_client"],
            }
        elif issue.issue_id == "easy_content_type":
            dynamic_logs["easy_content_type"] = {
                "payment_client": ["[INFO] Content-Type set to application/json. Request body parsed."],
                "payment_gateway": ["[INFO] Request body parsed successfully as JSON"],
            }
        elif issue.issue_id == "easy_timeout":
            dynamic_logs["easy_timeout"] = {
                "payment_client": ["[INFO] Timeout increased to 30s. Payment processing completing normally."],
            }
        elif issue.issue_id == "easy_base_url":
            dynamic_logs["easy_base_url"] = {
                "payment_client": ["[INFO] Migrated to v2 API endpoint. Requests routing correctly."],
            }

    # Service dependency graph
    service_graph = {
        "payment_client": ServiceNode(
            name="payment_client",
            depends_on=["payment_gateway"],
            health_status="error",
        ),
        "payment_gateway": ServiceNode(
            name="payment_gateway",
            depends_on=[],
            health_status="healthy",
        ),
    }

    scenario = Scenario(
        task_id="easy",
        difficulty="easy",
        description=(
            "A payment processing API integration is failing. "
            "The client is sending requests to the payment gateway but getting error responses. "
            "Diagnose the root causes by inspecting error logs and service configurations, "
            "then submit the correct configuration fixes."
        ),
        max_steps=15,
        services=["payment_client", "payment_gateway"],
        configs=configs,
        logs={"payment_client": client_logs, "payment_gateway": gateway_logs},
        issues=issues,
        service_graph=service_graph,
        dynamic_logs=dynamic_logs,
        optimal_fix_order=[i.issue_id for i in issues],
        context=(
            "The payment_client sends HTTP requests to payment_gateway. "
            "payment_gateway requires Bearer authentication and JSON content type."
        ),
    )

    if seed is not None:
        scenario = _randomize_scenario(scenario, seed)

    return scenario


# ─── Medium Scenario ─────────────────────────────────────────────────────────


def _medium_scenario(seed: Optional[int] = None) -> Scenario:
    """
    Medium: Webhook chain with cascading failures.

    Service A -> Service B -> Service C, with rate limiting, retry, and auth issues.
    Issue pool has 5 possible issues; canonical scenario uses 3.
    Issues have dependencies — fixing rate_limit reveals the real retry issue.
    """
    issue_pool = [
        Issue(
            issue_id="medium_rate_limit",
            service="webhook_sender",
            description="Rate limit too high (100/s vs receiver's 10/s limit) causing 429 responses",
            expected_fix={"rate_limit.requests_per_second": 10},
            fix_key="rate_limit.requests_per_second",
            log_hint="Rate limit exceeded: 100 req/s > 10 req/s allowed",
            category="networking",
            severity="error",
            root_cause_explanation=(
                "webhook_sender fires at 100 req/s but webhook_receiver only accepts 10 req/s. "
                "The excess requests get 429 Too Many Requests, and with only 1 retry, most events are dropped."
            ),
            cascade_effects={
                "webhook_receiver": "Overwhelmed with requests, dropping 90% of events",
                "notification_service": "No events arriving downstream",
            },
        ),
        Issue(
            issue_id="medium_retry",
            service="webhook_sender",
            description="Insufficient retry config: only 1 retry, no backoff, missing 429 in retry_on_status",
            expected_fix={
                "retry.max_retries": 3,
                "retry.backoff_factor": 2,
                "retry.retry_on_status": [429, 500],
            },
            fix_key="retry",
            log_hint="Retry attempt 1/1 failed. No more retries.",
            depends_on=["medium_rate_limit"],
            # The retry issue is masked by the rate limit issue — even with retries,
            # 100 req/s would still overwhelm the receiver
            category="configuration",
            severity="error",
            root_cause_explanation=(
                "Even after fixing the rate limit, the sender only retries once with no backoff. "
                "Transient 429s during bursts aren't retried because 429 isn't in retry_on_status. "
                "This causes event loss on any temporary load spike."
            ),
        ),
        Issue(
            issue_id="medium_signature",
            service="webhook_sender",
            description="Webhook signature header is empty — receiver rejects unsigned events",
            expected_fix={"headers.X-Webhook-Signature": "sha256="},
            fix_key="headers.X-Webhook-Signature",
            log_hint="Signature validation FAILED: received empty signature",
            category="authentication",
            severity="critical",
            root_cause_explanation=(
                "webhook_sender has signing_secret configured but the X-Webhook-Signature header "
                "is empty string. webhook_receiver validates signatures and drops all unsigned "
                "events as potential spoofing attempts."
            ),
            cascade_effects={
                "webhook_receiver": "Dropping all events as unsigned/spoofed",
                "notification_service": "Zero events forwarded from receiver",
            },
        ),
        Issue(
            issue_id="medium_target_url",
            service="webhook_sender",
            description="Target URL pointing to wrong receiver endpoint (/webhook vs /hooks/incoming)",
            expected_fix={"target_url": "https://receiver.internal/hooks/incoming"},
            fix_key="target_url",
            log_hint="404 Not Found on /webhook endpoint",
            category="configuration",
            severity="error",
            root_cause_explanation=(
                "webhook_sender posts to /webhook but the receiver listens on /hooks/incoming. "
                "All requests get 404 Not Found."
            ),
        ),
        Issue(
            issue_id="medium_content_encoding",
            service="webhook_sender",
            description="Payload compression enabled but receiver doesn't support gzip",
            expected_fix={"compression": "none"},
            fix_key="compression",
            log_hint="Unsupported Content-Encoding: gzip",
            category="protocol",
            severity="warning",
            root_cause_explanation=(
                "webhook_sender compresses payloads with gzip but webhook_receiver "
                "doesn't have a decompression middleware. Requests fail with 415."
            ),
        ),
    ]

    if seed is not None:
        rng = random.Random(seed)
        issues = _select_issues(issue_pool, 3, rng)
    else:
        issues = issue_pool[:3]  # Canonical: rate_limit, retry, signature

    # Healthy baseline config; each selected issue then breaks one part of it.
    configs = {
        "webhook_sender": {
            "target_url": "https://receiver.internal/hooks/incoming",
            "headers": {
                "Content-Type": "application/json",
                "X-Webhook-Signature": "sha256=computed_hmac",
            },
            "rate_limit": {
                "requests_per_second": 10,
                "burst_size": 20,
            },
            "retry": {
                "max_retries": 3,
                "backoff_factor": 2,
                "retry_on_status": [429, 500],
            },
            "signing_secret": "whsec_abc123secret",
            "compression": "none",
        },
        "webhook_receiver": {
            "endpoint": "/hooks/incoming",
            "rate_limit": {
                "requests_per_second": 10,
                "burst_size": 20,
            },
            "signature_validation": True,
            "expected_signature_header": "X-Webhook-Signature",
            "signing_secret": "whsec_abc123secret",
            "forward_to": "https://notifications.internal/notify",
            "supported_encodings": ["identity"],
        },
        "notification_service": {
            "endpoint": "/notify",
            "accepts_from": ["webhook_receiver"],
            "status": "healthy",
        },
    }

    # Apply broken config for each selected issue
    for issue in issues:
        if issue.issue_id == "medium_rate_limit":
            configs["webhook_sender"]["rate_limit"]["requests_per_second"] = 100
            configs["webhook_sender"]["rate_limit"]["burst_size"] = 200
        elif issue.issue_id == "medium_retry":
            configs["webhook_sender"]["retry"] = {
                "max_retries": 1,
                "backoff_factor": 0,
                "retry_on_status": [500],
            }
        elif issue.issue_id == "medium_signature":
            configs["webhook_sender"]["headers"]["X-Webhook-Signature"] = ""
        elif issue.issue_id == "medium_target_url":
            configs["webhook_sender"]["target_url"] = "https://receiver.internal/webhook"
        elif issue.issue_id == "medium_content_encoding":
            configs["webhook_sender"]["compression"] = "gzip"

    # Build logs based on selected issues
    sender_logs = [
        "[INFO] 2026-03-25T10:59:59Z Webhook sender started. Signature header: X-Webhook-Signature",
    ]
    receiver_logs = [
        "[INFO] 2026-03-25T10:59:59Z Receiver ready. Rate limit: 10 req/s. Signature validation: ON",
    ]
    notif_logs = [
        "[INFO] 2026-03-25T10:59:59Z Notification service healthy. Waiting for events.",
    ]

    for issue in issues:
        if issue.issue_id == "medium_rate_limit":
            sender_logs.extend([
                "[ERROR] 2026-03-25T11:00:01Z POST /hooks/incoming -> 429 Too Many Requests",
                "[ERROR] 2026-03-25T11:00:01Z Rate limited. Retry-After: 5s",
                "[WARN] 2026-03-25T11:00:00Z Sending at 100 req/s (burst=200)",
            ])
            receiver_logs.append(
                "[WARN] 2026-03-25T11:00:01Z Rate limit exceeded: 100 req/s > 10 req/s allowed"
            )
        elif issue.issue_id == "medium_retry":
            sender_logs.extend([
                "[WARN] 2026-03-25T11:00:02Z Retry attempt 1/1 failed. No more retries.",
                "[ERROR] 2026-03-25T11:00:03Z Event evt_12345 dropped after retry exhaustion",
            ])
        elif issue.issue_id == "medium_signature":
            receiver_logs.extend([
                "[ERROR] 2026-03-25T11:00:02Z Signature validation FAILED: received empty signature",
                "[WARN] 2026-03-25T11:00:02Z Dropping event: invalid signature from webhook_sender",
            ])
        elif issue.issue_id == "medium_target_url":
            sender_logs.extend([
                "[ERROR] 2026-03-25T11:00:01Z POST /webhook -> 404 Not Found on /webhook endpoint",
                "[WARN] 2026-03-25T11:00:01Z Receiver endpoint may have changed",
            ])
        elif issue.issue_id == "medium_content_encoding":
            receiver_logs.extend([
                "[ERROR] 2026-03-25T11:00:02Z Unsupported Content-Encoding: gzip",
                "[WARN] 2026-03-25T11:00:02Z Cannot decompress payload from webhook_sender",
            ])

    notif_logs.append("[WARN] 2026-03-25T11:00:05Z No events received in last 60s")

    # Dynamic logs (keyed by issue_id, covering the whole pool)
    dynamic_logs = {
        "medium_rate_limit": {
            "webhook_sender": ["[INFO] Rate limit adjusted to 10 req/s. 429 errors resolved."],
            "webhook_receiver": ["[INFO] Incoming request rate normalized. Processing events."],
        },
        "medium_retry": {
            "webhook_sender": ["[INFO] Retry config updated: 3 retries with backoff. 429 now retried."],
        },
        "medium_signature": {
            "webhook_sender": ["[INFO] Webhook signature computed and attached to requests."],
            "webhook_receiver": ["[INFO] Signature validation passed for incoming events."],
        },
        "medium_target_url": {
            "webhook_sender": ["[INFO] Target URL corrected to /hooks/incoming. Requests routing OK."],
        },
        "medium_content_encoding": {
            "webhook_sender": ["[INFO] Compression disabled. Receiver parsing payloads correctly."],
        },
    }

    service_graph = {
        "webhook_sender": ServiceNode(
            name="webhook_sender",
            depends_on=["webhook_receiver"],
            health_status="error",
        ),
        "webhook_receiver": ServiceNode(
            name="webhook_receiver",
            depends_on=["notification_service"],
            health_status="degraded",
        ),
        "notification_service": ServiceNode(
            name="notification_service",
            depends_on=[],
            health_status="healthy",
        ),
    }

    # Determine optimal fix order (respect dependencies)
    issue_ids = [i.issue_id for i in issues]
    optimal_order = []
    # Rate limit should be fixed before retry (dependency)
    if "medium_rate_limit" in issue_ids:
        optimal_order.append("medium_rate_limit")
    if "medium_retry" in issue_ids:
        optimal_order.append("medium_retry")
    for iid in issue_ids:
        if iid not in optimal_order:
            optimal_order.append(iid)

    scenario = Scenario(
        task_id="medium",
        difficulty="medium",
        description=(
            "A webhook-based notification system is dropping events. "
            "webhook_sender sends webhooks to webhook_receiver, which forwards to notification_service. "
            "Events are being lost due to multiple cascading failures in the webhook chain. "
            "Fix the webhook_sender configuration to restore event delivery."
        ),
        max_steps=25,
        services=["webhook_sender", "webhook_receiver", "notification_service"],
        configs=configs,
        logs={
            "webhook_sender": sender_logs,
            "webhook_receiver": receiver_logs,
            "notification_service": notif_logs,
        },
        issues=issues,
        service_graph=service_graph,
        dynamic_logs=dynamic_logs,
        optimal_fix_order=optimal_order,
        context=(
            "Event flow: webhook_sender -> webhook_receiver -> notification_service. "
            "webhook_receiver validates signatures and enforces rate limits. "
            "Fixing upstream issues may reveal additional downstream problems."
        ),
    )

    if seed is not None:
        scenario = _randomize_scenario(scenario, seed)

    return scenario


# ─── Hard Scenario ────────────────────────────────────────────────────────────


def _hard_scenario(seed: Optional[int] = None) -> Scenario:
    """
    Hard: E-commerce order processing pipeline with cascading failures.

    order_service -> inventory_service -> shipping_service
    Plus api_gateway and auth_service.
    Issue pool has 7 possible issues; canonical scenario uses 5.
    Multiple dependency chains make this genuinely challenging.
    """
    issue_pool = [
        Issue(
            issue_id="hard_wrong_url",
            service="order_service",
            description="Order service calling deprecated /v1/check instead of /v2/reserve",
            expected_fix={"inventory_url": "https://inventory.internal/v2/reserve"},
            fix_key="inventory_url",
            log_hint="Endpoint deprecated. Use /v2/reserve",
            category="configuration",
            severity="error",
            root_cause_explanation=(
                "order_service calls /v1/check which was deprecated. The API gateway returns "
                "301 Moved Permanently. The redirect goes to /v2/check (read-only) instead of "
                "/v2/reserve (write). Inventory is never actually reserved."
            ),
            cascade_effects={
                "inventory_service": "Receiving read-only check requests instead of reservation requests",
                "api_gateway": "Generating 301 redirect responses for deprecated endpoints",
            },
        ),
        Issue(
            issue_id="hard_timeout",
            service="order_service",
            description="Timeout too short (2s) for inventory service that takes ~4s to process",
            expected_fix={"timeout": 10},
            fix_key="timeout",
            log_hint="Timeout after 2s waiting for inventory response",
            depends_on=["hard_wrong_url"],
            # Timeout issue is masked by wrong URL — fix URL first to see real timeout
            category="networking",
            severity="error",
            root_cause_explanation=(
                "order_service has timeout=2s but inventory_service takes ~4s for reservation "
                "(including DB lock + stock validation). After fixing the URL, requests now reach "
                "inventory but time out before completion."
            ),
            cascade_effects={
                "inventory_service": "Connections killed mid-processing, leaving orphaned DB locks",
            },
        ),
        Issue(
            issue_id="hard_async",
            service="order_service",
            description="Synchronous mode causes race conditions between concurrent orders",
            expected_fix={"async_mode": True},
            fix_key="async_mode",
            log_hint="Race condition: order ord_998 processed before ord_997 completed",
            category="configuration",
            severity="critical",
            root_cause_explanation=(
                "order_service runs in sync mode, blocking the main thread on each inventory call. "
                "Concurrent orders queue up and when timeouts occur, orders are processed out of "
                "order, causing double-reservation and stock inconsistencies."
            ),
        ),
        Issue(
            issue_id="hard_expired_token",
            service="inventory_service",
            description="Expired auth token used for shipping service requests",
            expected_fix={"headers.Authorization": "Bearer valid_token_789"},
            fix_key="headers.Authorization",
            log_hint="Auth token expired_token_456 is no longer valid",
            category="authentication",
            severity="critical",
            root_cause_explanation=(
                "inventory_service uses Bearer expired_token_456 to authenticate with "
                "shipping_service. This token expired on 2026-03-24. All shipment creation "
                "requests fail with 401, so reserved inventory is never shipped."
            ),
            cascade_effects={
                "shipping_service": "Rejecting all requests from inventory_service",
                "auth_service": "Logging repeated failed token validations",
            },
        ),
        Issue(
            issue_id="hard_token_refresh",
            service="inventory_service",
            description="No automatic token refresh mechanism configured",
            expected_fix={"token_refresh_url": "https://auth.internal/refresh", "auto_refresh": True},
            fix_key="token_refresh_url",
            log_hint="Token validation failed: expired_token_456 expired",
            depends_on=["hard_expired_token"],
            # Token refresh is only relevant after fixing the expired token
            category="configuration",
            severity="error",
            root_cause_explanation=(
                "Even after replacing the expired token, there's no auto-refresh mechanism. "
                "Tokens expire every 24h, so without auto_refresh=True and a refresh URL, "
                "the same issue will recur tomorrow."
            ),
        ),
        Issue(
            issue_id="hard_circuit_breaker",
            service="order_service",
            description="No circuit breaker — failed requests keep hammering inventory_service",
            expected_fix={"circuit_breaker.enabled": True, "circuit_breaker.failure_threshold": 5},
            fix_key="circuit_breaker",
            log_hint="Circuit breaker not configured",
            category="configuration",
            severity="warning",
            root_cause_explanation=(
                "Without a circuit breaker, order_service keeps sending requests to "
                "inventory_service even when it's consistently failing. This wastes resources "
                "and can cause a cascading overload."
            ),
        ),
        Issue(
            issue_id="hard_idempotency",
            service="order_service",
            description="Missing idempotency key — retried requests create duplicate orders",
            expected_fix={"headers.Idempotency-Key": "order-{order_id}"},
            fix_key="headers.Idempotency-Key",
            log_hint="Duplicate order detected: ord_997 submitted twice",
            depends_on=["hard_async"],
            category="protocol",
            severity="error",
            root_cause_explanation=(
                "When async retries fire, there's no Idempotency-Key header to deduplicate "
                "requests. inventory_service creates duplicate reservations for the same order."
            ),
        ),
    ]

    if seed is not None:
        rng = random.Random(seed)
        issues = _select_issues(issue_pool, 5, rng)
    else:
        issues = issue_pool[:5]  # Canonical: first 5

    # Healthy baseline config; each selected issue then breaks one part of it.
    configs = {
        "order_service": {
            "name": "order_service",
            "inventory_url": "https://inventory.internal/v2/reserve",
            "headers": {
                "Content-Type": "application/json",
                "Authorization": "Bearer valid_token_123",
                "Idempotency-Key": "order-{order_id}",
            },
            "timeout": 10,
            "async_mode": True,
            "callback_url": "https://orders.internal/callback",
            "circuit_breaker": {
                "enabled": True,
                "failure_threshold": 5,
            },
        },
        "inventory_service": {
            "name": "inventory_service",
            "endpoint_version": "v2",
            "reserve_path": "/v2/reserve",
            "check_path": "/v2/check",
            "shipping_url": "https://shipping.internal/v1/create",
            "headers": {
                "Content-Type": "application/json",
                "Authorization": "Bearer valid_token_789",
            },
            "timeout": 10,
            "processing_time_avg": 4,
            "token_refresh_url": "https://auth.internal/refresh",
            "auto_refresh": True,
        },
        "shipping_service": {
            "name": "shipping_service",
            "create_path": "/v1/create",
            "requires_auth": True,
            "accepted_auth": ["Bearer"],
            "token_validation_url": "https://auth.internal/validate",
            "status": "healthy",
        },
        "api_gateway": {
            "routes": {
                "/v1/check": "DEPRECATED — use /v2/check",
                "/v2/reserve": "inventory_service",
                "/v2/check": "inventory_service",
                "/v1/create": "shipping_service",
            },
            "timeout": 30,
        },
        "auth_service": {
            "valid_tokens": ["valid_token_123", "valid_token_789"],
            "expired_tokens": ["expired_token_456"],
            "token_refresh_endpoint": "/refresh",
            "token_ttl_hours": 24,
        },
    }

    # Apply broken config for each selected issue
    for issue in issues:
        if issue.issue_id == "hard_wrong_url":
            configs["order_service"]["inventory_url"] = "https://inventory.internal/v1/check"
        elif issue.issue_id == "hard_timeout":
            configs["order_service"]["timeout"] = 2
        elif issue.issue_id == "hard_async":
            configs["order_service"]["async_mode"] = False
        elif issue.issue_id == "hard_expired_token":
            configs["inventory_service"]["headers"]["Authorization"] = "Bearer expired_token_456"
        elif issue.issue_id == "hard_token_refresh":
            configs["inventory_service"].pop("token_refresh_url", None)
            configs["inventory_service"]["auto_refresh"] = False
        elif issue.issue_id == "hard_circuit_breaker":
            configs["order_service"]["circuit_breaker"] = {"enabled": False}
        elif issue.issue_id == "hard_idempotency":
            # The broken state is a client that sends no Idempotency-Key header
            configs["order_service"]["headers"].pop("Idempotency-Key", None)

    # Build logs
    order_logs = []
    inventory_logs = []
    shipping_logs = []
    gateway_logs = []
    auth_logs = [
        "[INFO] 2026-03-25T12:00:00Z Auth service ready. Valid tokens: 2, Expired: 1",
    ]

    for issue in issues:
        if issue.issue_id == "hard_wrong_url":
            order_logs.extend([
                "[ERROR] 2026-03-25T12:00:05Z POST inventory.internal/v1/check -> 301 Moved Permanently",
                "[ERROR] 2026-03-25T12:00:05Z Response: {'error': 'Endpoint deprecated. Use /v2/reserve'}",
            ])
            inventory_logs.append(
                "[INFO] 2026-03-25T12:00:05Z Received request on /v1/check -> redirecting to /v2/check"
            )
            gateway_logs.extend([
                "[WARN] 2026-03-25T12:00:05Z Deprecated endpoint /v1/check accessed by order_service",
                "[INFO] 2026-03-25T12:00:05Z Redirecting /v1/check -> /v2/check (301)",
            ])
        elif issue.issue_id == "hard_timeout":
            order_logs.extend([
                "[ERROR] 2026-03-25T12:00:07Z Timeout after 2s waiting for inventory response",
                "[ERROR] 2026-03-25T12:00:07Z Order ord_999 failed: inventory check timed out",
            ])
            inventory_logs.append(
                "[WARN] 2026-03-25T12:00:06Z Processing reservation... avg time: 4s"
            )
        elif issue.issue_id == "hard_async":
            order_logs.extend([
                "[WARN] 2026-03-25T12:00:08Z Synchronous mode: blocking on inventory response",
                "[ERROR] 2026-03-25T12:00:09Z Race condition: order ord_998 processed before ord_997 completed",
            ])
        elif issue.issue_id == "hard_expired_token":
            inventory_logs.extend([
                "[ERROR] 2026-03-25T12:00:10Z POST shipping.internal/v1/create -> 401 Unauthorized",
                "[ERROR] 2026-03-25T12:00:10Z Auth token expired_token_456 is no longer valid",
                "[ERROR] 2026-03-25T12:00:10Z Cannot create shipment: authentication failed",
            ])
            shipping_logs.append(
                "[WARN] 2026-03-25T12:00:10Z Rejected request: token 'expired_token_456' is expired"
            )
            auth_logs.append(
                "[WARN] 2026-03-25T12:00:10Z Token validation failed: expired_token_456 expired at 2026-03-24T00:00:00Z"
            )
        elif issue.issue_id == "hard_token_refresh":
            auth_logs.append(
                "[WARN] 2026-03-25T12:00:11Z Token validation failed: expired_token_456 expired. No refresh configured."
            )
        elif issue.issue_id == "hard_circuit_breaker":
            order_logs.extend([
                "[WARN] 2026-03-25T12:00:12Z Circuit breaker not configured, continuing to send requests after 10 failures",
                "[ERROR] 2026-03-25T12:00:12Z System overload: 50 pending requests to inventory_service",
            ])
        elif issue.issue_id == "hard_idempotency":
            order_logs.append(
                "[ERROR] 2026-03-25T12:00:13Z Duplicate order detected: ord_997 submitted twice"
            )
            inventory_logs.append(
                "[WARN] 2026-03-25T12:00:13Z Duplicate reservation request for order ord_997"
            )

    if not shipping_logs:
        shipping_logs.append(
            "[INFO] 2026-03-25T12:00:00Z Shipping service healthy, awaiting authenticated requests"
        )

    # Dynamic logs (keyed by issue_id, covering the whole pool)
    dynamic_logs = {
        "hard_wrong_url": {
            "order_service": ["[INFO] URL corrected to /v2/reserve. Inventory requests routing correctly."],
            "api_gateway": ["[INFO] order_service now using correct /v2/reserve endpoint."],
        },
        "hard_timeout": {
            "order_service": ["[INFO] Timeout increased to 10s. Inventory responses completing."],
            "inventory_service": ["[INFO] Reservations completing successfully within timeout."],
        },
        "hard_async": {
            "order_service": ["[INFO] Async mode enabled. Orders processing concurrently without blocking."],
        },
        "hard_expired_token": {
            "inventory_service": ["[INFO] Auth token refreshed. Shipping service requests authenticated."],
            "shipping_service": ["[INFO] Authentication successful for inventory_service."],
        },
        "hard_token_refresh": {
            "inventory_service": ["[INFO] Auto token refresh configured. Tokens will be refreshed before expiry."],
        },
        "hard_circuit_breaker": {
            "order_service": ["[INFO] Circuit breaker enabled. Will stop sending after 5 consecutive failures."],
        },
        "hard_idempotency": {
            "order_service": ["[INFO] Idempotency keys set. Duplicate requests will be safely deduplicated."],
        },
    }

    service_graph = {
        "order_service": ServiceNode(
            name="order_service",
            depends_on=["inventory_service", "api_gateway"],
            health_status="error",
        ),
        "inventory_service": ServiceNode(
            name="inventory_service",
            depends_on=["shipping_service", "auth_service"],
            health_status="degraded",
        ),
        "shipping_service": ServiceNode(
            name="shipping_service",
            depends_on=[],
            health_status="healthy",
        ),
        "api_gateway": ServiceNode(
            name="api_gateway",
            depends_on=[],
            health_status="healthy",
        ),
        "auth_service": ServiceNode(
            name="auth_service",
            depends_on=[],
            health_status="healthy",
        ),
    }

    # Build optimal fix order respecting dependencies
    issue_ids = [i.issue_id for i in issues]
    optimal_order = []
    ordered_preference = [
        "hard_wrong_url",
        "hard_timeout",
        "hard_async",
        "hard_expired_token",
        "hard_token_refresh",
        "hard_circuit_breaker",
        "hard_idempotency",
    ]
    for iid in ordered_preference:
        if iid in issue_ids:
            optimal_order.append(iid)
    for iid in issue_ids:
        if iid not in optimal_order:
            optimal_order.append(iid)

    scenario = Scenario(
        task_id="hard",
        difficulty="hard",
        description=(
            "An e-commerce order processing pipeline is failing with cascading errors. "
            "Order Service calls Inventory Service, which calls Shipping Service. "
            "Multiple issues span the pipeline: wrong endpoints, timeouts, race conditions, "
            "expired authentication tokens, and missing resilience patterns. "
            "Some issues are masked by upstream failures — you must fix issues in the right "
            "order to diagnose the full chain."
        ),
        max_steps=40,
        services=["order_service", "inventory_service", "shipping_service", "api_gateway", "auth_service"],
        configs=configs,
        logs={
            "order_service": order_logs,
            "inventory_service": inventory_logs,
            "shipping_service": shipping_logs,
            "api_gateway": gateway_logs,
            "auth_service": auth_logs,
        },
        issues=issues,
        service_graph=service_graph,
        dynamic_logs=dynamic_logs,
        optimal_fix_order=optimal_order,
        context=(
            "Request flow: order_service -> api_gateway -> inventory_service -> shipping_service. "
            "auth_service provides token validation for all inter-service calls. "
            "Some issues are masked by upstream failures — fixing upstream issues may reveal "
            "new errors downstream. Pay attention to service dependencies."
        ),
    )

    if seed is not None:
        scenario = _randomize_scenario(scenario, seed)

    return scenario