% Conference-paper record for the LearnAlign paper, published in
% Findings of the Association for Computational Linguistics: ACL 2026.
% Words whose capitalisation must survive a style's sentence-casing are
% brace-protected as whole words in title and booktitle.
% The month field uses the predefined jul macro and must stay unquoted.
% The percent signs in the abstract are escaped because the text is LaTeX.
@inproceedings{li-etal-2026-learnalign,
  title     = {{LearnAlign}: Data Selection for {LLM} Reinforcement Learning with Improved Gradient Alignment},
  author    = {Li, Shipeng and
               Yang, Zhiqin and
               Li, Shikun and
               Xia, Xiaobo and
               Liu, Hengyu and
               Zhang, Xinghua and
               Chen, Gaode and
               Fang, Dong and
               Tai, Ying and
               Peng, Zhe},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.2009/},
  pages     = {40417--40434},
  isbn      = {979-8-89176-395-1},
  abstract  = {Reinforcement learning with verifiable rewards (RLVR) has become a key technique for enhancing LLMs' reasoning abilities, yet its data inefficiency remains a major bottleneck. To address this critical yet challenging issue, we present a novel gradient-alignment-based method, named LearnAlign, which intelligently selects the learnable and representative training reasoning data for RLVR post-training. To overcome the well-known response-length bias in gradient norms, we introduce the data learnability based on the success rate, which indicates the learning potential of each data point. Experiments across five reasoning benchmarks show that our method significantly reduces training data requirements while achieving minor performance degradation or even improving performance compared to full-data training. Specifically, it reduces data requirements by up to 1,000 data points with better performance (77.5{\%}) than that on the full dataset on the GSM8K benchmark (77.0{\%}). Furthermore, its efficiency is demonstrated on both mathematical and code benchmarks by using much less data from the DAPO-MATH-17K dataset.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey li-etal-2026-learnalign: ten authors, the host
     proceedings with its four editors, abstract, URL and page extent. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="li-etal-2026-learnalign">
    <titleInfo>
      <title>LearnAlign: Data Selection for LLM Reinforcement Learning with Improved Gradient Alignment</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shipeng</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhiqin</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shikun</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiaobo</namePart>
      <namePart type="family">Xia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hengyu</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xinghua</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gaode</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dong</namePart>
      <namePart type="family">Fang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ying</namePart>
      <namePart type="family">Tai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhe</namePart>
      <namePart type="family">Peng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Reinforcement learning with verifiable rewards (RLVR) has become a key technique for enhancing LLMs’ reasoning abilities, yet its data inefficiency remains a major bottleneck. To address this critical yet challenging issue, we present a novel gradient-alignment-based method, named LearnAlign, which intelligently selects the learnable and representative training reasoning data for RLVR post-training. To overcome the well-known response-length bias in gradient norms, we introduce the data learnability based on the success rate, which indicates the learning potential of each data point. Experiments across five reasoning benchmarks show that our method significantly reduces training data requirements while achieving minor performance degradation or even improving performance compared to full-data training. Specifically, it reduces data requirements by up to 1,000 data points with better performance (77.5%) than that on the full dataset on the GSM8K benchmark (77.0%). Furthermore, its efficiency is demonstrated on both mathematical and code benchmarks by using much less data from the DAPO-MATH-17K dataset.</abstract>
    <identifier type="citekey">li-etal-2026-learnalign</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.2009/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>40417</start>
        <end>40434</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T LearnAlign: Data Selection for LLM Reinforcement Learning with Improved Gradient Alignment
%A Li, Shipeng
%A Yang, Zhiqin
%A Li, Shikun
%A Xia, Xiaobo
%A Liu, Hengyu
%A Zhang, Xinghua
%A Chen, Gaode
%A Fang, Dong
%A Tai, Ying
%A Peng, Zhe
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F li-etal-2026-learnalign
%X Reinforcement learning with verifiable rewards (RLVR) has become a key technique for enhancing LLMs’ reasoning abilities, yet its data inefficiency remains a major bottleneck. To address this critical yet challenging issue, we present a novel gradient-alignment-based method, named LearnAlign, which intelligently selects the learnable and representative training reasoning data for RLVR post-training. To overcome the well-known response-length bias in gradient norms, we introduce the data learnability based on the success rate, which indicates the learning potential of each data point. Experiments across five reasoning benchmarks show that our method significantly reduces training data requirements while achieving minor performance degradation or even improving performance compared to full-data training. Specifically, it reduces data requirements by up to 1,000 data points with better performance (77.5%) than that on the full dataset on the GSM8K benchmark (77.0%). Furthermore, its efficiency is demonstrated on both mathematical and code benchmarks by using much less data from the DAPO-MATH-17K dataset.
%U https://aclanthology.org/2026.findings-acl.2009/
%P 40417-40434
Markdown (Informal)
[LearnAlign: Data Selection for LLM Reinforcement Learning with Improved Gradient Alignment](https://aclanthology.org/2026.findings-acl.2009/) (Li et al., Findings 2026)
ACL
- Shipeng Li, Zhiqin Yang, Shikun Li, Xiaobo Xia, Hengyu Liu, Xinghua Zhang, Gaode Chen, Dong Fang, Ying Tai, and Zhe Peng. 2026. LearnAlign: Data Selection for LLM Reinforcement Learning with Improved Gradient Alignment. In Findings of the Association for Computational Linguistics: ACL 2026, pages 40417–40434, San Diego, California, United States. Association for Computational Linguistics.