@inproceedings{huang-etal-2026-tackling,
title = "Tackling Distractor Documents in Multi-Hop {QA} with Reinforcement and Curriculum Learning",
author = "Huang, Jerry and
Madala, Siddarth and
Sidhu, Risham and
Niu, Cheng and
Peng, Hao and
Hockenmaier, Julia and
Zhang, Tong",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.294/",
pages = "5548--5561",
ISBN = "979-8-89176-386-9",
abstract = "Retrieval-augmented generation (RAG) systems rely on retrieval models for identifying relevant contexts and answer generation models for utilizing those contexts. However, retrievers exhibit imperfect recall and precision, limiting downstream performance. We introduce RAG-RL, an answer generation model trained for multi-hop question answering (MHQA) to not only generate answers but also to identify and cite relevant information from larger sets of retrieved contexts, shifting some of the burden of identifying relevant documents from the retriever to the answer generator. Our approach uses curriculum learning, where models are trained across retrieval settings with varying levels of noise. Our experiments show that training samples with fewer distractor documents enable models to acquire citation and reasoning skills with greater sample efficiency and generalizability, demonstrating strong model performance even as the number of irrelevant passages increases. We benchmark our methods on three open-domain MHQA datasets and report significant gains in answer and citation accuracy. Furthermore, our experiments provide empirical insights into how simpler training samples can give models stronger signals for learning specific skills (e.g., citation generation) and how different components of post-training (e.g., training set construction, rule-based rewards, training sample ordering, etc.) impact final model performance."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2026-tackling">
<titleInfo>
<title>Tackling Distractor Documents in Multi-Hop QA with Reinforcement and Curriculum Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jerry</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siddarth</namePart>
<namePart type="family">Madala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Risham</namePart>
<namePart type="family">Sidhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cheng</namePart>
<namePart type="family">Niu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Hockenmaier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Retrieval-augmented generation (RAG) systems rely on retrieval models for identifying relevant contexts and answer generation models for utilizing those contexts. However, retrievers exhibit imperfect recall and precision, limiting downstream performance. We introduce RAG-RL, an answer generation model trained for multi-hop question answering (MHQA) to not only generate answers but also to identify and cite relevant information from larger sets of retrieved contexts, shifting some of the burden of identifying relevant documents from the retriever to the answer generator. Our approach uses curriculum learning, where models are trained across retrieval settings with varying levels of noise. Our experiments show that training samples with fewer distractor documents enable models to acquire citation and reasoning skills with greater sample efficiency and generalizability, demonstrating strong model performance even as the number of irrelevant passages increases. We benchmark our methods on three open-domain MHQA datasets and report significant gains in answer and citation accuracy. Furthermore, our experiments provide empirical insights into how simpler training samples can give models stronger signals for learning specific skills (e.g., citation generation) and how different components of post-training (e.g., training set construction, rule-based rewards, training sample ordering, etc.) impact final model performance.</abstract>
<identifier type="citekey">huang-etal-2026-tackling</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.294/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>5548</start>
<end>5561</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tackling Distractor Documents in Multi-Hop QA with Reinforcement and Curriculum Learning
%A Huang, Jerry
%A Madala, Siddarth
%A Sidhu, Risham
%A Niu, Cheng
%A Peng, Hao
%A Hockenmaier, Julia
%A Zhang, Tong
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F huang-etal-2026-tackling
%X Retrieval-augmented generation (RAG) systems rely on retrieval models for identifying relevant contexts and answer generation models for utilizing those contexts. However, retrievers exhibit imperfect recall and precision, limiting downstream performance. We introduce RAG-RL, an answer generation model trained for multi-hop question answering (MHQA) to not only generate answers but also to identify and cite relevant information from larger sets of retrieved contexts, shifting some of the burden of identifying relevant documents from the retriever to the answer generator. Our approach uses curriculum learning, where models are trained across retrieval settings with varying levels of noise. Our experiments show that training samples with fewer distractor documents enable models to acquire citation and reasoning skills with greater sample efficiency and generalizability, demonstrating strong model performance even as the number of irrelevant passages increases. We benchmark our methods on three open-domain MHQA datasets and report significant gains in answer and citation accuracy. Furthermore, our experiments provide empirical insights into how simpler training samples can give models stronger signals for learning specific skills (e.g., citation generation) and how different components of post-training (e.g., training set construction, rule-based rewards, training sample ordering, etc.) impact final model performance.
%U https://aclanthology.org/2026.findings-eacl.294/
%P 5548-5561
Markdown (Informal)
[Tackling Distractor Documents in Multi-Hop QA with Reinforcement and Curriculum Learning](https://aclanthology.org/2026.findings-eacl.294/) (Huang et al., Findings 2026)
ACL