@inproceedings{sterbentz-etal-2026-mixed,
title = "Mixed-Policy {GRPO} for Text-to-{SQL} with Off-Policy Data Generation",
author = "Sterbentz, Marko and
Glass, Michael and
Pham, Nhan H and
Subramanian, Dharmashankar and
Hammond, Kristian J",
editor = "Gupta, Vivek and
Ding, Kaize and
Kokel, Harsha and
Zhao, Yue and
Agarwal, Amit and
Wang, Yu and
Glass, Michael and
Zhang, Yu and
Srinivas, Kavitha and
Chen, Xiusi and
Hassanzadeh, Oktie and
Zhu, Qi and
Chang, Shuaichen and
Luo, Yuan",
booktitle = "Proceedings of the First Workshop on Structured Understanding, Retrieval, and Generation in the {LLM} Era ({SURG}e{LLM} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.surgellm-1.20/",
pages = "313--325",
ISBN = "979-8-89176-406-4",
abstract = "Recent advances in text-to-SQL have shown that methods such as Group Relative Policy Optimization (GRPO) can substantially improve reasoning performance, but these approaches remain inherently on-policy, limiting their ability to incorporate novel reasoning patterns. In this work, we address this limitation by leveraging existing datasets to generate high-quality off-policy rollouts, enabling mixed-policy training that exposes models to diverse and informative reasoning trajectories. We present the first application of mixed-policy GRPO to the text-to-SQL domain and introduce a systematic study of off-policy data generation strategies for this setting, including a novel method, Iterative Error Correction (IEC), which iteratively refines model outputs through targeted feedback. Our experiments show that mixed-policy GRPO outperforms both base models and on-policy GRPO, yielding average improvements of +4.7{\%} over base models and +4.1{\%} over on-policy GRPO across the Spider and BIRD benchmarks. Gains are particularly strong on BIRD, reaching up to +7.3{\%} over base models and +4.5{\%} over on-policy GRPO."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sterbentz-etal-2026-mixed">
<titleInfo>
<title>Mixed-Policy GRPO for Text-to-SQL with Off-Policy Data Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Sterbentz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Glass</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nhan</namePart>
<namePart type="given">H</namePart>
<namePart type="family">Pham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dharmashankar</namePart>
<namePart type="family">Subramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kristian</namePart>
<namePart type="given">J</namePart>
<namePart type="family">Hammond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Structured Understanding, Retrieval, and Generation in the LLM Era (SURGeLLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaize</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harsha</namePart>
<namePart type="family">Kokel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amit</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Glass</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kavitha</namePart>
<namePart type="family">Srinivas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiusi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oktie</namePart>
<namePart type="family">Hassanzadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuaichen</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuan</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-406-4</identifier>
</relatedItem>
<abstract>Recent advances in text-to-SQL have shown that methods such as Group Relative Policy Optimization (GRPO) can substantially improve reasoning performance, but these approaches remain inherently on-policy, limiting their ability to incorporate novel reasoning patterns. In this work, we address this limitation by leveraging existing datasets to generate high-quality off-policy rollouts, enabling mixed-policy training that exposes models to diverse and informative reasoning trajectories. We present the first application of mixed-policy GRPO to the text-to-SQL domain and introduce a systematic study of off-policy data generation strategies for this setting, including a novel method, Iterative Error Correction (IEC), which iteratively refines model outputs through targeted feedback. Our experiments show that mixed-policy GRPO outperforms both base models and on-policy GRPO, yielding average improvements of +4.7% over base models and +4.1% over on-policy GRPO across the Spider and BIRD benchmarks. Gains are particularly strong on BIRD, reaching up to +7.3% over base models and +4.5% over on-policy GRPO.</abstract>
<identifier type="citekey">sterbentz-etal-2026-mixed</identifier>
<location>
<url>https://aclanthology.org/2026.surgellm-1.20/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>313</start>
<end>325</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mixed-Policy GRPO for Text-to-SQL with Off-Policy Data Generation
%A Sterbentz, Marko
%A Glass, Michael
%A Pham, Nhan H.
%A Subramanian, Dharmashankar
%A Hammond, Kristian J.
%Y Gupta, Vivek
%Y Ding, Kaize
%Y Kokel, Harsha
%Y Zhao, Yue
%Y Agarwal, Amit
%Y Wang, Yu
%Y Glass, Michael
%Y Zhang, Yu
%Y Srinivas, Kavitha
%Y Chen, Xiusi
%Y Hassanzadeh, Oktie
%Y Zhu, Qi
%Y Chang, Shuaichen
%Y Luo, Yuan
%S Proceedings of the First Workshop on Structured Understanding, Retrieval, and Generation in the LLM Era (SURGeLLM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-406-4
%F sterbentz-etal-2026-mixed
%X Recent advances in text-to-SQL have shown that methods such as Group Relative Policy Optimization (GRPO) can substantially improve reasoning performance, but these approaches remain inherently on-policy, limiting their ability to incorporate novel reasoning patterns. In this work, we address this limitation by leveraging existing datasets to generate high-quality off-policy rollouts, enabling mixed-policy training that exposes models to diverse and informative reasoning trajectories. We present the first application of mixed-policy GRPO to the text-to-SQL domain and introduce a systematic study of off-policy data generation strategies for this setting, including a novel method, Iterative Error Correction (IEC), which iteratively refines model outputs through targeted feedback. Our experiments show that mixed-policy GRPO outperforms both base models and on-policy GRPO, yielding average improvements of +4.7% over base models and +4.1% over on-policy GRPO across the Spider and BIRD benchmarks. Gains are particularly strong on BIRD, reaching up to +7.3% over base models and +4.5% over on-policy GRPO.
%U https://aclanthology.org/2026.surgellm-1.20/
%P 313-325
Markdown (Informal)
[Mixed-Policy GRPO for Text-to-SQL with Off-Policy Data Generation](https://aclanthology.org/2026.surgellm-1.20/) (Sterbentz et al., SURGeLLM 2026)
ACL
- Marko Sterbentz, Michael Glass, Nhan H Pham, Dharmashankar Subramanian, and Kristian J Hammond. 2026. Mixed-Policy GRPO for Text-to-SQL with Off-Policy Data Generation. In Proceedings of the First Workshop on Structured Understanding, Retrieval, and Generation in the LLM Era (SURGeLLM 2026), pages 313–325, San Diego, California, United States. Association for Computational Linguistics.