@inproceedings{soh-etal-2025-use,
title = "You Only Use Reactive Attention Slice When Retrieving From Long Context",
author = "Soh, Yun Joon and
Huang, Hanxian and
Tian, Yuandong and
Zhao, Jishen",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1125/",
pages = "20665--20686",
ISBN = "979-8-89176-335-7",
abstract = "Retrieval-Augmented Generation is a powerful method for enhancing language models (LMs), but existing retrieval techniques are limited.Embedding-based methods are often inaccurate due to their reliance on lexical similarity, while neural retrievers are computationally expensive to train.To overcome these issues, we introduce You Only Use Reactive Attention slice (YOURA), a training-free and fine-tuning-free attention-based retrieval technique. When retrieving, YOURA uses a novel reaction score heuristic, which quantifies how an LM{'}s self-attention ``reacts'' to a user query. We also propose a sentence extraction algorithm to efficiently preprocess the context.Evaluations on three open-source LMs using the LongBench and BABILong datasets show YOURA{'}s effectiveness. Our framework improves QA task accuracy by up to 15{\%} and inference throughput by up to 31{\%} compared to embedding-based retrieval."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="soh-etal-2025-use">
<titleInfo>
<title>You Only Use Reactive Attention Slice When Retrieving From Long Context</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yun</namePart>
<namePart type="given">Joon</namePart>
<namePart type="family">Soh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanxian</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuandong</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jishen</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Retrieval-Augmented Generation is a powerful method for enhancing language models (LMs), but existing retrieval techniques are limited. Embedding-based methods are often inaccurate due to their reliance on lexical similarity, while neural retrievers are computationally expensive to train. To overcome these issues, we introduce You Only Use Reactive Attention slice (YOURA), a training-free and fine-tuning-free attention-based retrieval technique. When retrieving, YOURA uses a novel reaction score heuristic, which quantifies how an LM’s self-attention “reacts” to a user query. We also propose a sentence extraction algorithm to efficiently preprocess the context. Evaluations on three open-source LMs using the LongBench and BABILong datasets show YOURA’s effectiveness. Our framework improves QA task accuracy by up to 15% and inference throughput by up to 31% compared to embedding-based retrieval.</abstract>
<identifier type="citekey">soh-etal-2025-use</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1125/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>20665</start>
<end>20686</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T You Only Use Reactive Attention Slice When Retrieving From Long Context
%A Soh, Yun Joon
%A Huang, Hanxian
%A Tian, Yuandong
%A Zhao, Jishen
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F soh-etal-2025-use
%X Retrieval-Augmented Generation is a powerful method for enhancing language models (LMs), but existing retrieval techniques are limited. Embedding-based methods are often inaccurate due to their reliance on lexical similarity, while neural retrievers are computationally expensive to train. To overcome these issues, we introduce You Only Use Reactive Attention slice (YOURA), a training-free and fine-tuning-free attention-based retrieval technique. When retrieving, YOURA uses a novel reaction score heuristic, which quantifies how an LM’s self-attention “reacts” to a user query. We also propose a sentence extraction algorithm to efficiently preprocess the context. Evaluations on three open-source LMs using the LongBench and BABILong datasets show YOURA’s effectiveness. Our framework improves QA task accuracy by up to 15% and inference throughput by up to 31% compared to embedding-based retrieval.
%U https://aclanthology.org/2025.findings-emnlp.1125/
%P 20665-20686
Markdown (Informal)
[You Only Use Reactive Attention Slice When Retrieving From Long Context](https://aclanthology.org/2025.findings-emnlp.1125/) (Soh et al., Findings 2025)
ACL
Yun Joon Soh, Hanxian Huang, Yuandong Tian, and Jishen Zhao. 2025. You Only Use Reactive Attention Slice When Retrieving From Long Context. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 20665–20686, Suzhou, China. Association for Computational Linguistics.