@inproceedings{zhan-etal-2025-towards,
title = "Towards Database-Free Text-to-{SQL} Evaluation: A Graph-Based Metric for Functional Correctness",
author = "Zhan, Yi and
Cui, Longjie and
Weng, Han and
Wang, Guifeng and
Tian, Yu and
Liu, Boyi and
Yang, Yingxiang and
Yin, Xiaoming and
Xie, Jiajun and
Sun, Yang",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.308/",
pages = "4586--4610",
abstract = "Execution Accuracy and Exact Set Match are two predominant metrics for evaluating the functional correctness of SQL queries in modern Text-to-SQL tasks. However, both metrics have notable limitations: Exact Set Match fails when queries are functionally equivalent but syntactically different, while Execution Accuracy is prone to false positives due to inadequately prepared test databases, which can be costly to create, particularly in large-scale industrial applications. To overcome these challenges, we propose a novel graph-based metric, FuncEvalGMN, that effectively overcomes the deficiencies of the aforementioned metric designs. Our method utilizes a relational operator tree (ROT), referred to as RelNode, to extract rich semantic information from the logical execution plan of SQL queries, and embed it into a graph. We then train a graph neural network (GNN) to perform graph matching on pairs of SQL queries through graph contrastive learning. FuncEvalGMN offers two highly desired advantages: (i) it requires only the database schema to derive logical execution plans, eliminating the need for extensive test database preparation, and (ii) it demonstrates strong generalization capabilities on unseen datasets. These properties highlight FuncEvalGMN's robustness as a reliable metric for assessing functional correctness across a wide range of Text-to-SQL applications."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhan-etal-2025-towards">
<titleInfo>
<title>Towards Database-Free Text-to-SQL Evaluation: A Graph-Based Metric for Functional Correctness</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Zhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Longjie</namePart>
<namePart type="family">Cui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Han</namePart>
<namePart type="family">Weng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guifeng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Boyi</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yingxiang</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoming</namePart>
<namePart type="family">Yin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Execution Accuracy and Exact Set Match are two predominant metrics for evaluating the functional correctness of SQL queries in modern Text-to-SQL tasks. However, both metrics have notable limitations: Exact Set Match fails when queries are functionally equivalent but syntactically different, while Execution Accuracy is prone to false positives due to inadequately prepared test databases, which can be costly to create, particularly in large-scale industrial applications. To overcome these challenges, we propose a novel graph-based metric, FuncEvalGMN, that effectively overcomes the deficiencies of the aforementioned metric designs. Our method utilizes a relational operator tree (ROT), referred to as RelNode, to extract rich semantic information from the logical execution plan of SQL queries, and embed it into a graph. We then train a graph neural network (GNN) to perform graph matching on pairs of SQL queries through graph contrastive learning. FuncEvalGMN offers two highly desired advantages: (i) it requires only the database schema to derive logical execution plans, eliminating the need for extensive test database preparation, and (ii) it demonstrates strong generalization capabilities on unseen datasets. These properties highlight FuncEvalGMN’s robustness as a reliable metric for assessing functional correctness across a wide range of Text-to-SQL applications.</abstract>
<identifier type="citekey">zhan-etal-2025-towards</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.308/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>4586</start>
<end>4610</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Database-Free Text-to-SQL Evaluation: A Graph-Based Metric for Functional Correctness
%A Zhan, Yi
%A Cui, Longjie
%A Weng, Han
%A Wang, Guifeng
%A Tian, Yu
%A Liu, Boyi
%A Yang, Yingxiang
%A Yin, Xiaoming
%A Xie, Jiajun
%A Sun, Yang
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhan-etal-2025-towards
%X Execution Accuracy and Exact Set Match are two predominant metrics for evaluating the functional correctness of SQL queries in modern Text-to-SQL tasks. However, both metrics have notable limitations: Exact Set Match fails when queries are functionally equivalent but syntactically different, while Execution Accuracy is prone to false positives due to inadequately prepared test databases, which can be costly to create, particularly in large-scale industrial applications. To overcome these challenges, we propose a novel graph-based metric, FuncEvalGMN, that effectively overcomes the deficiencies of the aforementioned metric designs. Our method utilizes a relational operator tree (ROT), referred to as RelNode, to extract rich semantic information from the logical execution plan of SQL queries, and embed it into a graph. We then train a graph neural network (GNN) to perform graph matching on pairs of SQL queries through graph contrastive learning. FuncEvalGMN offers two highly desired advantages: (i) it requires only the database schema to derive logical execution plans, eliminating the need for extensive test database preparation, and (ii) it demonstrates strong generalization capabilities on unseen datasets. These properties highlight FuncEvalGMN’s robustness as a reliable metric for assessing functional correctness across a wide range of Text-to-SQL applications.
%U https://aclanthology.org/2025.coling-main.308/
%P 4586-4610
Markdown (Informal)
[Towards Database-Free Text-to-SQL Evaluation: A Graph-Based Metric for Functional Correctness](https://aclanthology.org/2025.coling-main.308/) (Zhan et al., COLING 2025)
ACL
- Yi Zhan, Longjie Cui, Han Weng, Guifeng Wang, Yu Tian, Boyi Liu, Yingxiang Yang, Xiaoming Yin, Jiajun Xie, and Yang Sun. 2025. Towards Database-Free Text-to-SQL Evaluation: A Graph-Based Metric for Functional Correctness. In Proceedings of the 31st International Conference on Computational Linguistics, pages 4586–4610, Abu Dhabi, UAE. Association for Computational Linguistics.