@inproceedings{toteja-etal-2025-context,
title = "In-Context Reinforcement Learning with Retrieval-Augmented Generation for Text-to-{SQL}",
author = "Toteja, Rishit and
Sarkar, Arindam and
Comar, Prakash Mandayam",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.692/",
pages = "10390--10397",
abstract = "Text-to-SQL simplifies database interactions by enabling non-experts to convert their natural language (NL) questions to Structured Query Language (SQL) queries. With advancements in Large Language Models (LLM), in-context learning (ICL) has emerged as a popular choice for building Text-to-SQL systems. Real world, industry-scale databases, often comprise thousands of tables and hundreds of columns, and makes passing the entire schema as context to an LLM infeasibly expensive. This requisites access to the correct database and the set of tables. Recently Retrieval Augmented Generation (RAG) based methods have been proposed for retrieving relevant subset of databases and tables for a given query. However, we observe that the existing methods of synthetic query generation can generate predominantly simple queries which might not be sufficiently representative of complex, real world queries, thus, negatively affecting the quality of the generated SQL. To address this, we propose an innovative in-context reinforcement learning (ICRL) based framework which refines the question generation process by enhancing the model's ability to produce intricate queries that practitioners may pose during inference. In contrast to the existing approaches, our framework ensures the generation of synthetic SQL queries which are diverse and complex. We demonstrate the effectiveness of our approach via multiple experiments comparing against the representative state-of-the-art models on public benchmark datasets and observe substantial improvements in performance and scalability. Our method achieves 15-20{\%} higher recall in database/table retrieval task compared to the existing state-of-the-art models for schema identification and upto 2{\%} higher execution accuracy for SQL generation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="toteja-etal-2025-context">
<titleInfo>
<title>In-Context Reinforcement Learning with Retrieval-Augmented Generation for Text-to-SQL</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rishit</namePart>
<namePart type="family">Toteja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arindam</namePart>
<namePart type="family">Sarkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prakash</namePart>
<namePart type="given">Mandayam</namePart>
<namePart type="family">Comar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text-to-SQL simplifies database interactions by enabling non-experts to convert their natural language (NL) questions to Structured Query Language (SQL) queries. With advancements in Large Language Models (LLM), in-context learning (ICL) has emerged as a popular choice for building Text-to-SQL systems. Real world, industry-scale databases, often comprise thousands of tables and hundreds of columns, and makes passing the entire schema as context to an LLM infeasibly expensive. This requisites access to the correct database and the set of tables. Recently Retrieval Augmented Generation (RAG) based methods have been proposed for retrieving relevant subset of databases and tables for a given query. However, we observe that the existing methods of synthetic query generation can generate predominantly simple queries which might not be sufficiently representative of complex, real world queries, thus, negatively affecting the quality of the generated SQL. To address this, we propose an innovative in-context reinforcement learning (ICRL) based framework which refines the question generation process by enhancing the model’s ability to produce intricate queries that practitioners may pose during inference. In contrast to the existing approaches, our framework ensures the generation of synthetic SQL queries which are diverse and complex. We demonstrate the effectiveness of our approach via multiple experiments comparing against the representative state-of-the-art models on public benchmark datasets and observe substantial improvements in performance and scalability. Our method achieves 15-20% higher recall in database/table retrieval task compared to the existing state-of-the-art models for schema identification and upto 2% higher execution accuracy for SQL generation.</abstract>
<identifier type="citekey">toteja-etal-2025-context</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.692/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>10390</start>
<end>10397</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T In-Context Reinforcement Learning with Retrieval-Augmented Generation for Text-to-SQL
%A Toteja, Rishit
%A Sarkar, Arindam
%A Comar, Prakash Mandayam
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F toteja-etal-2025-context
%X Text-to-SQL simplifies database interactions by enabling non-experts to convert their natural language (NL) questions to Structured Query Language (SQL) queries. With advancements in Large Language Models (LLM), in-context learning (ICL) has emerged as a popular choice for building Text-to-SQL systems. Real world, industry-scale databases, often comprise thousands of tables and hundreds of columns, and makes passing the entire schema as context to an LLM infeasibly expensive. This requisites access to the correct database and the set of tables. Recently Retrieval Augmented Generation (RAG) based methods have been proposed for retrieving relevant subset of databases and tables for a given query. However, we observe that the existing methods of synthetic query generation can generate predominantly simple queries which might not be sufficiently representative of complex, real world queries, thus, negatively affecting the quality of the generated SQL. To address this, we propose an innovative in-context reinforcement learning (ICRL) based framework which refines the question generation process by enhancing the model’s ability to produce intricate queries that practitioners may pose during inference. In contrast to the existing approaches, our framework ensures the generation of synthetic SQL queries which are diverse and complex. We demonstrate the effectiveness of our approach via multiple experiments comparing against the representative state-of-the-art models on public benchmark datasets and observe substantial improvements in performance and scalability. Our method achieves 15-20% higher recall in database/table retrieval task compared to the existing state-of-the-art models for schema identification and upto 2% higher execution accuracy for SQL generation.
%U https://aclanthology.org/2025.coling-main.692/
%P 10390-10397
Markdown (Informal)
[In-Context Reinforcement Learning with Retrieval-Augmented Generation for Text-to-SQL](https://aclanthology.org/2025.coling-main.692/) (Toteja et al., COLING 2025)
ACL