@comment{Conference-paper record. Field values are brace-delimited; the abstract keeps its TeX escapes for en dashes, apostrophes and percent signs.}
@inproceedings{chen-etal-2026-aipo,
  title     = {{AIPO}: Adaptive Information Guided Token-Level Reinforcement Learning for Large Language Model Reasoning},
  author    = {Chen, Bin and
               Ye, Hongfei and
               Wang, Huiyang and
               Liu, Wenxi and
               Zhang, Yu and
               Liu, Furui},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.2057/},
  pages     = {44441--44450},
  isbn      = {979-8-89176-390-6},
  abstract  = {Reinforcement Learning with Verifiable Rewards (RLVR) improves the reasoning capability of Large Language Models (LLMs). Current RLVR trains LLMs on all generated tokens, rather than exploring which tokens actually contribute to reasoning. We propose AIPO (Adaptive{--}Information Policy Optimization), which focuses updates on those decisive tokens discovered on the fly. AIPO estimates each hidden state{'}s mutual information to score tokens. Policy gradients are then computed only on these critical tokens, using an advantage that blends information gain and verifiable correctness. To improve the efficiency of mutual-information estimation, AIPO adopts a Random{--}Fourier approximation of the Hilbert{--}Schmidt Independence Criterion. Across five math and science benchmarks, AIPO yields up to +20{\%} accuracy over strong RLVR baselines while updating merely 10{\%} of tokens, demonstrating superior efficiency and effectiveness. Our findings highlight the importance of information-driven token selection for efficient and effective reinforcement learning of LLM reasoning.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2026-aipo">
<titleInfo>
<title>AIPO: Adaptive Information Guided Token-Level Reinforcement Learning for Large Language Model Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongfei</namePart>
<namePart type="family">Ye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huiyang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenxi</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Furui</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Reinforcement Learning with Verifiable Rewards (RLVR) improves the reasoning capability of Large Language Models (LLMs). Current RLVR trains LLMs on all generated tokens, rather than exploring which tokens actually contribute to reasoning. We propose AIPO (Adaptive–Information Policy Optimization), which focuses updates on those decisive tokens discovered on the fly. AIPO estimates each hidden state’s mutual information to score tokens. Policy gradients are then computed only on these critical tokens, using an advantage that blends information gain and verifiable correctness. To improve the efficiency of mutual-information estimation, AIPO adopts a Random–Fourier approximation of the Hilbert–Schmidt Independence Criterion. Across five math and science benchmarks, AIPO yields up to +20% accuracy over strong RLVR baselines while updating merely 10% of tokens, demonstrating superior efficiency and effectiveness. Our findings highlight the importance of information-driven token selection for efficient and effective reinforcement learning of LLM reasoning.</abstract>
<identifier type="citekey">chen-etal-2026-aipo</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.2057/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>44441</start>
<end>44450</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AIPO: Adaptive Information Guided Token-Level Reinforcement Learning for Large Language Model Reasoning
%A Chen, Bin
%A Ye, Hongfei
%A Wang, Huiyang
%A Liu, Wenxi
%A Zhang, Yu
%A Liu, Furui
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F chen-etal-2026-aipo
%X Reinforcement Learning with Verifiable Rewards (RLVR) improves the reasoning capability of Large Language Models (LLMs). Current RLVR trains LLMs on all generated tokens, rather than exploring which tokens actually contribute to reasoning. We propose AIPO (Adaptive–Information Policy Optimization), which focuses updates on those decisive tokens discovered on the fly. AIPO estimates each hidden state’s mutual information to score tokens. Policy gradients are then computed only on these critical tokens, using an advantage that blends information gain and verifiable correctness. To improve the efficiency of mutual-information estimation, AIPO adopts a Random–Fourier approximation of the Hilbert–Schmidt Independence Criterion. Across five math and science benchmarks, AIPO yields up to +20% accuracy over strong RLVR baselines while updating merely 10% of tokens, demonstrating superior efficiency and effectiveness. Our findings highlight the importance of information-driven token selection for efficient and effective reinforcement learning of LLM reasoning.
%U https://aclanthology.org/2026.acl-long.2057/
%P 44441-44450
Markdown (Informal)
[AIPO: Adaptive Information Guided Token-Level Reinforcement Learning for Large Language Model Reasoning](https://aclanthology.org/2026.acl-long.2057/) (Chen et al., ACL 2026)
ACL