"""`evaluate` metric module exposing a mean SQL BLEU score as ``sql_bleu``."""

import evaluate
import datasets

from text2sql_eval.metrics.bleu import bleu_score

_DESCRIPTION = "SQL token BLEU (0–1). Returns mean score in [0, 1]."


def _to_str(x) -> str:
    """Coerce *x* to a plain string.

    - ``None`` becomes ``""``.
    - A list/tuple is reduced to its first element (``""`` when empty);
      that element is itself normalized, so the result is always a ``str``
      even for inputs such as ``[None]`` or ``[42]``.
    - Anything else goes through ``str()``.
    """
    if isinstance(x, (list, tuple)):
        # Only the first entry is used; any further entries are ignored.
        return _to_str(x[0]) if x else ""
    return "" if x is None else str(x)


class SQLBLEU(evaluate.Metric):
    """Metric that averages ``bleu_score`` over prediction/reference pairs."""

    def _info(self):
        """Return the metric metadata: description, citation and input schema.

        Both ``predictions`` and ``references`` are declared as string values.
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation="Uses sacrebleu via text2sql-eval implementation.",
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
        )

    def _compute(self, predictions, references):
        """Score each (prediction, reference) pair and return the mean.

        Pairs are formed with ``zip``, so if the two inputs differ in length
        the surplus items of the longer one are not scored.

        Returns:
            ``{"sql_bleu": mean}`` where ``mean`` is the arithmetic mean of
            the per-pair ``bleu_score`` values, or ``0.0`` when there are no
            pairs.
        """
        scores = [
            float(bleu_score(_to_str(p), _to_str(r)))
            for p, r in zip(predictions, references)
        ]
        # Guard the division so empty input yields 0.0 instead of raising.
        mean = sum(scores) / len(scores) if scores else 0.0
        return {"sql_bleu": mean}