@inproceedings{zhang-etal-2026-plug,
title = "Plug-and-Play Data Module for Code {RL}: Adaptive Ambiguity Replay",
author = "Zhang, Jianqing and
Xia, Wei and
Hao, Zhezheng and
Wang, Hong and
Dong, Hande and
Lin, Qiang and
Liu, Yang and
Cao, Jian and
Yang, Qiang",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.886/",
pages = "17865--17875",
ISBN = "979-8-89176-395-1",
abstract = "Reinforcement learning (RL) is effective for improving code generation but suffers from data scarcity. While experience replay mitigates this, existing approaches rely on static, in-epoch metrics that overlook training dynamics, often introducing low-utility or outdated data. Analyzing RL dynamics via dataset cartography, we observe that ``ambiguous'' samples, which are vital for model generalization, rapidly migrate to ``easy-to-learn'' regions, diminishing their training value. To address this, we propose Adaptive Ambiguity Replay (A2R) for RL, a plug-and-play module that prioritizes cross-epoch ambiguous samples. To neutralize the noise from stale experiences, A2R incorporates an adaptive importance mechanism based on policy divergence to weigh replayed rollouts. Extensive experiments on nine LLMs (3B{--}14B) demonstrate that A2R outperforms state-of-the-art baselines on real-world code editing tasks across both unseen and learned domains. Our results highlight cross-epoch ambiguity as a key factor for effective replay in RL. Code: https://github.com/TsingZ0/verl-A2R"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2026-plug">
<titleInfo>
<title>Plug-and-Play Data Module for Code RL: Adaptive Ambiguity Replay</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jianqing</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhezheng</namePart>
<namePart type="family">Hao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hande</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiang</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiang</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Reinforcement learning (RL) is effective for improving code generation but suffers from data scarcity. While experience replay mitigates this, existing approaches rely on static, in-epoch metrics that overlook training dynamics, often introducing low-utility or outdated data. Analyzing RL dynamics via dataset cartography, we observe that “ambiguous” samples, which are vital for model generalization, rapidly migrate to “easy-to-learn” regions, diminishing their training value. To address this, we propose Adaptive Ambiguity Replay (A2R) for RL, a plug-and-play module that prioritizes cross-epoch ambiguous samples. To neutralize the noise from stale experiences, A2R incorporates an adaptive importance mechanism based on policy divergence to weigh replayed rollouts. Extensive experiments on nine LLMs (3B–14B) demonstrate that A2R outperforms state-of-the-art baselines on real-world code editing tasks across both unseen and learned domains. Our results highlight cross-epoch ambiguity as a key factor for effective replay in RL. Code: https://github.com/TsingZ0/verl-A2R</abstract>
<identifier type="citekey">zhang-etal-2026-plug</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.886/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>17865</start>
<end>17875</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Plug-and-Play Data Module for Code RL: Adaptive Ambiguity Replay
%A Zhang, Jianqing
%A Xia, Wei
%A Hao, Zhezheng
%A Wang, Hong
%A Dong, Hande
%A Lin, Qiang
%A Liu, Yang
%A Cao, Jian
%A Yang, Qiang
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhang-etal-2026-plug
%X Reinforcement learning (RL) is effective for improving code generation but suffers from data scarcity. While experience replay mitigates this, existing approaches rely on static, in-epoch metrics that overlook training dynamics, often introducing low-utility or outdated data. Analyzing RL dynamics via dataset cartography, we observe that “ambiguous” samples, which are vital for model generalization, rapidly migrate to “easy-to-learn” regions, diminishing their training value. To address this, we propose Adaptive Ambiguity Replay (A2R) for RL, a plug-and-play module that prioritizes cross-epoch ambiguous samples. To neutralize the noise from stale experiences, A2R incorporates an adaptive importance mechanism based on policy divergence to weigh replayed rollouts. Extensive experiments on nine LLMs (3B–14B) demonstrate that A2R outperforms state-of-the-art baselines on real-world code editing tasks across both unseen and learned domains. Our results highlight cross-epoch ambiguity as a key factor for effective replay in RL. Code: https://github.com/TsingZ0/verl-A2R
%U https://aclanthology.org/2026.findings-acl.886/
%P 17865-17875
Markdown (Informal)
[Plug-and-Play Data Module for Code RL: Adaptive Ambiguity Replay](https://aclanthology.org/2026.findings-acl.886/) (Zhang et al., Findings 2026)
ACL
- Jianqing Zhang, Wei Xia, Zhezheng Hao, Hong Wang, Hande Dong, Qiang Lin, Yang Liu, Jian Cao, and Qiang Yang. 2026. Plug-and-Play Data Module for Code RL: Adaptive Ambiguity Replay. In Findings of the Association for Computational Linguistics: ACL 2026, pages 17865–17875, San Diego, California, United States. Association for Computational Linguistics.