"""
SQLEnv Pydantic models — the data contracts between client and server.
These models define the typed interface for the SQLEnv RL environment,
following the OpenEnv pattern (see OpenEnv Tutorial for reference):
    Action      — what the agent sends each step
    Observation — what the agent receives back
    State       — episode metadata (exposed via the state endpoint)
RL terminology — state vs observation
─────────────────────────────────────
In RL theory:
State (s) A COMPLETE description of the world. Nothing is hidden.
Observation (o) A PARTIAL description of a state, which may omit info.
In SQLEnv these map to:
    EpisodeContext  The full RL state (s). Lives on the server only.
                    Contains gold answers, reward accumulators, DB
                    connection, full query history — everything needed
                    to advance the simulation and compute rewards.
    SQLObservation  The observation (o). Sent to the agent over the wire.
                    Contains the question, truncated results, revealed
                    schema, budget, and action history. The agent NEVER
                    sees the gold answer, progress scores, or full DB.
    SQLState        OpenEnv's "State" base class — lightweight episode
                    metadata (episode_id, step_count). This is NOT the
                    RL state; it is a convenience for logging/debugging.
This separation is what makes SQLEnv a POMDP: the agent must act under
uncertainty, which is what makes exploration necessary and learnable.
"""
import sqlite3
from dataclasses import dataclass, field as dataclass_field
from openenv.core.env_server.interfaces import Message
from openenv.core.env_server.types import Action, Observation, State
from pydantic import Field
import torch
# ---------------------------------------------------------------------------
# Wire types: these cross the HTTP boundary between client and server
# ---------------------------------------------------------------------------
class SQLAction(Action):
    """Single command issued by the agent on each environment step.

    The action space is deliberately small and structured so that agents
    drive the environment loop explicitly: every action is an action kind
    plus one string argument whose meaning depends on that kind.
    """

    # Kind of action requested; the description lists the expected values.
    action_type: str = Field(
        ...,
        description="One of: DESCRIBE, SAMPLE, QUERY, ANSWER",
    )

    # Payload for the action, interpreted according to ``action_type``.
    argument: str = Field(
        ...,
        description=(
            "Table name (DESCRIBE/SAMPLE), "
            "SQL string (QUERY), "
            "or answer value (ANSWER)."
        ),
    )
class SQLObservation(Observation):
    """Observation returned to the agent after every step.

    This is the agent's PARTIAL view of the world. Design choices:

    - ``schema_info`` starts with table names only; column details are
      revealed incrementally as the agent DESCRIBEs tables.
    - ``result`` is always a truncated string, never raw data. The agent
      sees what a human analyst would see in a terminal — at most N rows
      of formatted text. This keeps the observation bounded and forces
      the agent to reason about what it sees rather than brute-force
      scanning.
    - ``action_history`` gives the agent memory of its own trajectory
      without the server re-sending full results from earlier steps.
    """

    # Inherited from Observation: done (bool), reward (float | None)

    # Required fields: every observation must carry these.
    question: str = Field(
        ...,
        description="The NL question to answer",
    )
    schema_info: str = Field(
        ...,
        description="Known schema information",
    )

    # Outcome of the most recent action; both default to the empty string.
    result: str = Field("", description="Result of the last action")
    error: str = Field("", description="Error message if action failed")

    # Step accounting.
    step_count: int = Field(0, description="Current step number")
    budget_remaining: int = Field(0, description="Steps remaining")

    # A fresh list per instance (default_factory), never a shared default.
    action_history: list[str] = Field(
        default_factory=list,
        description="Summary of previous actions",
    )
class SQLState(State):
    """Episode metadata exposed via GET /state.

    This is the minimal public state — enough for logging and debugging.
    It is NOT the full internal bookkeeping the server keeps per episode
    (that lives in EpisodeContext).
    """

    # Inherited from State: episode_id (str | None), step_count (int)

    # Currently disabled field, kept for reference:
    # game_name: str = Field(
    #     "sql_env", description="Name of the game/environment"
    # )

    history_messages: list[Message] = Field(default_factory=list)

    # Same len as messages
    history_tokens: list[torch.Tensor] = Field(default_factory=list)

    current_action_type: str = Field(
        "QUERY",
        description="Current action type: DESCRIBE, SAMPLE, QUERY, or ANSWER",
    )
@dataclass
class QuestionRecord:
    """A single Spider-dataset question together with its reference solution.

    Field notes follow the design outline kept in this module.
    """

    question_id: str  # e.g. "spider_dev_042"
    question_text: str  # the natural language question
    database_name: str  # which SQLite database to load
    gold_sql: str  # reference SQL (hidden from agent)
    gold_answer: str  # expected answer (hidden from agent)
    answer_type: str  # one of: integer, float, string, list, table
    difficulty: str  # one of: easy, medium, hard
    tables_involved: list[str]  # which tables the gold query touches
@dataclass
class EpisodeContext:
    """Server-side bookkeeping for one episode; never sent to the agent.

    Only the first three fields are required at construction. Every other
    field has a default, and the mutable ones use ``default_factory`` so
    that each episode gets its own container.
    """

    # --- Required at construction ---
    episode_id: str
    db_connection: sqlite3.Connection
    question_record: QuestionRecord

    # --- Step accounting ---
    step_count: int = 0
    budget: int = 15

    # --- Trajectory tracking ---
    described_tables: set[str] = dataclass_field(default_factory=set)
    action_log: list[str] = dataclass_field(default_factory=list)
    done: bool = False

    # --- Gold reference ---
    gold_answer: str | None = None
    gold_rows: list[tuple] = dataclass_field(default_factory=list)

    # --- Reward tracking ---
    query_hashes: set[str] = dataclass_field(default_factory=set)
    best_progress: float = 0.0
    cumulative_step_reward: float = 0.0
    cumulative_new_info_reward: float = 0.0
# ---------------------------------------------------------------------------
# Conceptual internal state: what the server tracks per episode
# ---------------------------------------------------------------------------
#
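# The notes below are a DESIGN OUTLINE, not runnable implementation. The
# EpisodeContext and QuestionRecord dataclasses defined above are the
# concrete versions and differ from this outline in places (for example,
# described_tables is a set[str] there, not a dict of column info).
# The outline describes the information the server needs to maintain during
# an episode so that it can: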
#
# 1. Execute actions against the database
# 2. Compute the 3-layer reward signal
# 3. Enforce budget limits and anti-gaming measures
# 4. Build the next observation for the agent
#
# These are SERVER-ONLY β they never cross the HTTP boundary.
# Implementation will follow in server/environment.py during Phase 2.
#
#
# EpisodeContext — Per-episode server state
# ─────────────────────────────────────────
# Conceptual fields:
#
# episode_id: str
# Unique identifier for this episode (UUID).
#
# question_record: QuestionRecord
# The selected question and its metadata:
# - question_id, question_text, database_name
# - gold_sql, gold_answer, answer_type, difficulty
# Loaded from the question set JSON at reset().
#
# db_connection: sqlite3.Connection
# Read-only connection to the episode's SQLite database.
# Opened at reset(), closed when the episode ends.
# Enforces: read-only mode, statement timeout (5s), SELECT-only.
#
# step_count: int
# Current step number (0 at reset, incremented each step()).
#
# budget: int
# Steps remaining. Starts at max_steps (default 15).
# Decremented on each non-ANSWER action. Episode terminates
# when budget hits 0 without an ANSWER.
#
# --- Schema tracking (for observation building) ---
#
# known_tables: set[str]
# Table names revealed to the agent. Starts with ALL table names
# (agent sees table names at reset), but column details are hidden.
#
# described_tables: dict[str, list[ColumnInfo]]
# Tables the agent has DESCRIBEd — their column info.
# Used to build the incrementally-revealed schema_info string.
#
# --- Reward tracking (Layer 1: Operational) ---
#
# query_hashes: set[str]
# Hashes of all SQL queries executed this episode.
# Used for repeat detection (r_repeat penalty).
#
# explored_entities: set[str]
# Set of "table.column" strings the agent has discovered.
# Used for r_new_info reward. Capped at 0.10 total per episode.
#
# cumulative_new_info_reward: float
# Running total of r_new_info awarded. Once this reaches the cap
# (0.10), no more r_new_info is given.
#
# --- Reward tracking (Layer 2: Progress) ---
#
# gold_result: Any
# The result of running gold_sql on the database, computed once
# at reset(). This is the reference for progress comparison.
#
# best_progress: float
# Best binned progress score achieved so far (one of
# {0, 0.25, 0.5, 0.75, 1.0}). Reward is given only when
# a QUERY result IMPROVES over this value.
#
# --- Reward tracking (aggregates) ---
#
# cumulative_step_reward: float
# Running sum of all per-step rewards (Layers 1 + 2).
# Clamped to [-0.2, +0.5] at episode end.
#
# --- Action history (for observation) ---
#
# action_log: list[str]
# Human-readable summaries of each action taken, e.g.:
# "DESCRIBE employees → 5 columns"
# "QUERY: SELECT COUNT(*) FROM orders → 42"
# "ANSWER: 42 → correct"
# Sent to the agent in SQLObservation.action_history so it has
# memory of its own trajectory.
#
#
# QuestionRecord — Metadata for a single question
# ───────────────────────────────────────────────
# Conceptual fields:
#
# question_id: str e.g. "spider_dev_042"
# question_text: str The natural language question
# database_name: str Which SQLite database to load
# gold_sql: str Reference SQL (hidden from agent)
# gold_answer: str Expected answer (hidden from agent)
# answer_type: str One of: integer, float, string, list, table
# difficulty: str One of: easy, medium, hard
# tables_involved: list[str] Which tables the gold query touches
#
#
# ColumnInfo — Schema detail for a single column
# ──────────────────────────────────────────────
# Conceptual fields:
#
# name: str Column name
# dtype: str SQLite type (TEXT, INTEGER, REAL, etc.)
# is_primary_key: bool Whether this is a PK
# is_foreign_key: bool Whether this is a FK
# references: str | None "table.column" if FK, else None
#