# text2sql-demo/src/text2sql_engine.py
import sqlite3
import torch
import re
import time
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
from src.sql_validator import SQLValidator
from src.schema_encoder import SchemaEncoder
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# ================================
# DATABASE PATH AUTO DETECTION
# ================================
# Prefer the development layout (data/database); fall back to the
# flattened deployment layout (final_databases).
_dev_db_dir = PROJECT_ROOT / "data/database"
DB_ROOT = _dev_db_dir if _dev_db_dir.exists() else PROJECT_ROOT / "final_databases"
def normalize_question(q: str):
    """Canonicalize a user question: lowercase, trim, move a trailing count
    after 'distinct' in front of it ('distinct 5' -> '5 distinct'), and
    collapse runs of whitespace to single spaces."""
    lowered = q.lower().strip()
    reordered = re.sub(r"distinct\s+(\d+)", r"\1 distinct", lowered)
    return re.sub(r"\s+", " ", reordered)
def semantic_fix(question, sql):
    """Append 'LIMIT N' when the question asks for N rows ('show 5 ...',
    'top 3 ...') but the generated SQL has no LIMIT and is not an aggregate
    count query. Returns the SQL unchanged otherwise."""
    wanted = re.search(
        r'\b(?:show|list|top|limit|get|first|last)\s+(\d+)\b',
        question.lower().strip(),
    )
    sql_lower = sql.lower()
    if not wanted or "limit" in sql_lower or "count(" in sql_lower:
        return sql
    return f"{sql.rstrip(';').strip()} LIMIT {wanted.group(1)}"
class Text2SQLEngine:
    """Natural-language-to-SQL engine backed by a (LoRA-tuned) CodeT5 model.

    Builds a schema-aware prompt, generates a SQLite query with beam search,
    blocks obvious DML/DDL statements, and executes the query against the
    matching ``<db_id>.sqlite`` file under ``DB_ROOT``.
    """

    def __init__(self,
                 adapter_path=None,
                 base_model_name="Salesforce/codet5-base",
                 use_lora=True):
        """Load the base seq2seq model and, optionally, a LoRA adapter.

        Args:
            adapter_path: Optional path to a LoRA adapter directory. When
                None (or nonexistent), falls back to the auto-detected
                checkpoint locations.
            base_model_name: Hugging Face hub id of the base model.
            use_lora: When False, run the plain base model with its own
                tokenizer and skip adapter loading entirely.
        """
        # Prefer Apple MPS, then CUDA, then CPU.
        self.device = "mps" if torch.backends.mps.is_available() else (
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.validator = SQLValidator(DB_ROOT)
        self.schema_encoder = SchemaEncoder(DB_ROOT)
        # NOTE(security): a keyword blocklist is not a complete defense —
        # e.g. ATTACH/PRAGMA/REPLACE are not covered. Queries are still
        # executed, so the database files must be treated as expendable.
        self.dml_keywords = r'\b(delete|update|insert|drop|alter|truncate)\b'
        print("Loading base model...")
        base = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
        if not use_lora:
            self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
            self.model = base.to(self.device)
            self.model.eval()
            return
        # FIX: honor an explicitly supplied adapter_path; previously the
        # parameter was always clobbered by the auto-detection below.
        if adapter_path is None or not Path(adapter_path).exists():
            if (PROJECT_ROOT / "checkpoints/best_rlhf_model").exists():
                adapter_path = PROJECT_ROOT / "checkpoints/best_rlhf_model"
            else:
                adapter_path = PROJECT_ROOT / "best_rlhf_model"
        adapter_path = Path(adapter_path).resolve()
        print("Loading tokenizer and LoRA adapter...")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                str(adapter_path),
                local_files_only=True
            )
        except Exception:
            # Adapter directory ships no tokenizer files; use the base one.
            self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        self.model = PeftModel.from_pretrained(base, str(adapter_path)).to(self.device)
        self.model.eval()
        print("✅ RLHF model ready\n")

    def build_prompt(self, question, schema):
        """Return the generation prompt embedding the schema and question."""
        return f"""You are an expert SQL generator.
Database schema:
{schema}
Generate a valid SQLite query for the question.
Question:
{question}
SQL:
"""

    def get_schema(self, db_id):
        """Return the structured schema text for *db_id* via the encoder."""
        return self.schema_encoder.structured_schema(db_id)

    def extract_sql(self, text: str):
        """Pull the SQL statement out of raw model output.

        Keeps everything from the first SELECT/WITH keyword up to the first
        semicolon. FIX: also anchor on WITH so CTE queries are no longer
        truncated at their inner SELECT.
        """
        text = text.strip()
        if "SQL:" in text:
            text = text.split("SQL:")[-1]
        match = re.search(r"(select|with)[\s\S]*", text, re.IGNORECASE)
        if match:
            text = match.group(0)
        return text.split(";")[0].strip()

    def clean_sql(self, sql: str):
        """Normalize quoting to single quotes and collapse whitespace."""
        sql = sql.replace('"', "'")
        sql = re.sub(r"\s+", " ", sql)
        return sql.strip()

    def generate_sql(self, prompt):
        """Generate SQL from *prompt* with beam search and extract/clean it."""
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=128,
                num_beams=5,
                early_stopping=True
            )
        decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return self.clean_sql(self.extract_sql(decoded))

    def execute_sql(self, question, sql, db_id):
        """Execute *sql* against ``DB_ROOT/<db_id>.sqlite``.

        Returns ``(final_sql, columns, rows, error)`` where *error* is None
        on success and a message string otherwise. DML/DDL statements are
        refused up front.
        """
        if re.search(self.dml_keywords, sql, re.IGNORECASE):
            return sql, [], [], "❌ Security Alert"
        db_path = DB_ROOT / f"{db_id}.sqlite"
        sql = self.clean_sql(sql)
        sql = semantic_fix(question, sql)
        try:
            conn = sqlite3.connect(db_path)
            try:
                cursor = conn.cursor()
                cursor.execute(sql)
                rows = cursor.fetchall()
                columns = [d[0] for d in cursor.description] if cursor.description else []
            finally:
                # FIX: previously the connection leaked when execute() raised.
                conn.close()
            return sql, columns, rows, None
        except Exception as e:
            return sql, [], [], str(e)

    def ask(self, question, db_id):
        """End-to-end pipeline: normalize, screen, generate, execute.

        Returns a dict with keys question/sql/columns/rows/error; a DML
        keyword in the question short-circuits with a blocked response.
        """
        question = normalize_question(question)
        if re.search(self.dml_keywords, question, re.IGNORECASE):
            return {
                "question": question,
                "sql": "-- BLOCKED",
                "columns": [],
                "rows": [],
                "error": "Malicious prompt"
            }
        schema = self.get_schema(db_id)
        prompt = self.build_prompt(question, schema)
        raw_sql = self.generate_sql(prompt)
        final_sql, cols, rows, error = self.execute_sql(question, raw_sql, db_id)
        return {
            "question": question,
            "sql": final_sql,
            "columns": cols,
            "rows": rows,
            "error": error
        }
# Module-level cache for the singleton engine instance.
_engine = None


def get_engine():
    """Lazily construct and return the process-wide Text2SQLEngine."""
    global _engine
    engine = _engine
    if engine is None:
        engine = Text2SQLEngine()
        _engine = engine
    return engine
# import sqlite3
# import torch
# import re
# import os
# from pathlib import Path
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# from peft import PeftModel
# from src.sql_validator import SQLValidator
# from src.schema_encoder import SchemaEncoder # Removed build_schema_graph import
# PROJECT_ROOT = Path(__file__).resolve().parents[1]
# # ================================
# # DATABASE PATH AUTO DETECTION
# # ================================
# if (PROJECT_ROOT / "data/database").exists():
# DB_ROOT = PROJECT_ROOT / "data/database"
# else:
# DB_ROOT = PROJECT_ROOT / "final_databases"
# # ==========================================
# # SCHEMA PARSING
# # ==========================================
# def build_schema_graph(schema_text):
# """
# Parses a structured schema text string into a dictionary graph.
# Matches formats like: table_name(col1, col2, col3)
# """
# tables = {}
# for match in re.findall(r'(\w+)\s*\((.*?)\)', schema_text):
# table = match[0]
# cols = [c.strip().split()[0] for c in match[1].split(",")]
# tables[table] = cols
# return tables
# # ==========================================
# # INPUT VALIDATION & RELEVANCE
# # ==========================================
# def is_valid_question(q: str):
# q = q.strip().lower()
# if len(q) < 3:
# return False
# words = re.findall(r"[a-zA-Z]+", q)
# if len(words) < 1:
# return False
# return True
# def is_relevant_to_db(question: str, schema_graph: dict):
# q_words = set(re.findall(r'\b[a-z]{3,}\b', question.lower()))
# stop_words = {"show", "list", "all", "and", "the", "get", "find", "how", "many", "what", "where", "which", "who", "give", "display", "count", "from", "for", "with", "that", "have", "has", "are", "there"}
# q_words = q_words - stop_words
# if not q_words:
# return True
# schema_words = set()
# for table, cols in schema_graph.items():
# schema_words.update(re.findall(r'\b[a-z]{3,}\b', table.lower()))
# for col in cols:
# schema_words.update(re.findall(r'\b[a-z]{3,}\b', col.lower()))
# synonyms = {
# "customer": ["client", "buyer", "shopper", "person", "people", "user"],
# "employee": ["staff", "worker", "boss", "manager", "person", "people"],
# "track": ["song", "music", "audio", "tune"],
# "album": ["record", "cd", "music"],
# "artist": ["singer", "band", "musician", "creator"],
# "invoice": ["bill", "receipt", "purchase", "sale", "order", "buy", "bought", "cost"],
# "city": ["town", "location", "place"],
# "country": ["nation", "location", "place"],
# "flight": ["plane", "airline", "trip", "fly", "airport"],
# "student": ["pupil", "learner", "kid", "child"],
# "club": ["group", "organization", "team"],
# "course": ["class", "subject"],
# "cinema": ["movie", "film", "theater", "screen"]
# }
# extended_schema_words = set(schema_words)
# for db_word in schema_words:
# if db_word in synonyms:
# extended_schema_words.update(synonyms[db_word])
# extended_schema_words.update({"id", "name", "total", "sum", "average", "avg", "min", "max", "number", "amount", "record", "data", "info", "information", "detail", "first", "last", "most", "least", "cheapest", "expensive", "best"})
# for qw in q_words:
# qw_singular = qw[:-1] if qw.endswith('s') else qw
# if qw in extended_schema_words or qw_singular in extended_schema_words:
# return True
# return False
# def normalize_question(q: str):
# return re.sub(r"\s+", " ", q.lower().strip())
# def semantic_fix(question, sql):
# q = question.lower()
# num_match = re.search(r'\b(?:show|list|top|get)\s+(\d+)\b', q)
# if num_match and "limit" not in sql.lower():
# sql = f"{sql} LIMIT {num_match.group(1)}"
# return sql
# # ==========================================
# # SCHEMA CONSTRAINTS (SIMULATED LOGIT BLOCKING)
# # ==========================================
# def apply_schema_constraints(sql, schema_graph):
# sql = sql.lower()
# used_tables = [t[1] for t in re.findall(r'(from|join)\s+(\w+)', sql)]
# for t in used_tables:
# if t not in schema_graph:
# return None
# valid_columns = set()
# for cols in schema_graph.values():
# valid_columns.update(cols)
# col_blocks = re.findall(r'select\s+(.*?)\s+from', sql)
# for block in col_blocks:
# for c in block.split(","):
# c = c.strip().split()[-1]
# if "." in c:
# c = c.split(".")[-1]
# if c != "*" and "(" not in c and c != "":
# if c not in valid_columns:
# return None
# return sql
# # ==========================================
# # ENGINE
# # ==========================================
# class Text2SQLEngine:
# def __init__(self,
# adapter_path="checkpoints/best_rlhf_model_2",
# base_model_name="Salesforce/codet5-base",
# use_lora=True,
# use_constrained_decoding=False):
# self.device = "mps" if torch.backends.mps.is_available() else (
# "cuda" if torch.cuda.is_available() else "cpu"
# )
# self.validator = SQLValidator(DB_ROOT)
# self.schema_encoder = SchemaEncoder(DB_ROOT)
# self.use_constrained_decoding = use_constrained_decoding
# self.dml_keywords = r'\b(delete|update|insert|drop|alter|truncate|create)\b'
# print(f"\n📦 Loading model on {self.device}...")
# base = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
# # Override the redundant special tokens to prevent the tokenizer crash
# self.tokenizer = AutoTokenizer.from_pretrained(
# base_model_name,
# use_fast=False,
# additional_special_tokens=[]
# )
# # 🔥 FIXED LOADA ADAPTER PATH LOGIC
# if use_lora:
# if adapter_path and (PROJECT_ROOT / adapter_path).exists():
# adapter_path = PROJECT_ROOT / adapter_path
# elif (PROJECT_ROOT / "checkpoints/best_rlhf_model_2").exists():
# adapter_path = PROJECT_ROOT / "checkpoints/best_rlhf_model_2"
# else:
# adapter_path = PROJECT_ROOT / "best_rlhf_model_2"
# adapter_path = adapter_path.resolve()
# if adapter_path.exists():
# try:
# self.model = PeftModel.from_pretrained(
# base,
# str(adapter_path),
# local_files_only=True
# ).to(self.device)
# print(f"✅ LoRA loaded from {adapter_path}")
# except Exception as e:
# print(f"⚠️ LoRA load failed: {e}")
# self.model = base.to(self.device)
# else:
# print(f"⚠️ Adapter not found at {adapter_path}, using base model")
# self.model = base.to(self.device)
# else:
# self.model = base.to(self.device)
# self.model.eval()
# def build_prompt(self, question, schema):
# return f"""
# You are an expert SQL generator.
# IMPORTANT:
# - Use correct tables and columns
# - Use JOINs when needed
# Schema:
# {schema}
# Question:
# {question}
# SQL:
# """
# def get_schema(self, db_id):
# return self.schema_encoder.structured_schema(db_id)
# def extract_sql(self, text):
# match = re.search(r"(select|with)[\s\S]*", text, re.IGNORECASE)
# return match.group(0).split(";")[0].strip() if match else ""
# def clean_sql(self, sql):
# return re.sub(r"\s+", " ", sql.replace('"', "'")).strip()
# def generate_sql(self, prompt):
# inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
# with torch.no_grad():
# outputs = self.model.generate(
# **inputs,
# max_new_tokens=128,
# num_beams=8,
# length_penalty=0.8,
# early_stopping=True
# )
# decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# return self.clean_sql(self.extract_sql(decoded))
# def execute_sql(self, question, sql, db_id):
# if re.search(self.dml_keywords, sql, re.IGNORECASE):
# return "", [], [], "❌ Blocked malicious SQL (Contains INSERT/UPDATE/DELETE/DROP)"
# # 🔥 FIXED DATABASE PATH
# db_path = DB_ROOT / f"{db_id}.sqlite"
# sql = semantic_fix(question, sql)
# try:
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()
# cursor.execute(sql)
# rows = cursor.fetchall()
# columns = [d[0] for d in cursor.description] if cursor.description else []
# conn.close()
# return sql, columns, rows, None
# except Exception as e:
# return sql, [], [], str(e)
# def ask(self, question, db_id):
# question = normalize_question(question)
# question_context = f"Database question: {question}"
# if re.search(self.dml_keywords, question_context, re.IGNORECASE):
# return {"sql": "", "error": "❌ Blocked dangerous query from input text."}
# if not is_valid_question(question_context):
# return {"sql": "", "error": "❌ Invalid input. Please type words."}
# schema = self.get_schema(db_id)
# schema_graph = build_schema_graph(schema)
# if not is_relevant_to_db(question, schema_graph):
# return {"sql": "", "error": "❌ Question is completely out of domain for the selected database."}
# sql = self.generate_sql(self.build_prompt(question_context, schema))
# if self.use_constrained_decoding:
# filtered_sql = apply_schema_constraints(sql, schema_graph)
# if filtered_sql is None:
# constraint_prompt = f"""
# Use ONLY valid schema.
# Schema:
# {schema}
# Question:
# {question_context}
# SQL:
# """
# sql_retry = self.generate_sql(constraint_prompt)
# filtered_sql = apply_schema_constraints(sql_retry, schema_graph)
# if filtered_sql:
# sql = filtered_sql
# else:
# sql = sql_retry
# final_sql, cols, rows, error = self.execute_sql(question_context, sql, db_id)
# return {
# "question": question_context,
# "sql": final_sql,
# "columns": cols,
# "rows": rows,
# "error": error
# }
# def get_engine(
# adapter_path="checkpoints/best_rlhf_model_2",
# base_model_name="Salesforce/codet5-base",
# use_lora=True,
# use_constrained=True
# ):
# return Text2SQLEngine(
# adapter_path=adapter_path,
# base_model_name=base_model_name,
# use_lora=use_lora,
# use_constrained_decoding=use_constrained
# )