@inproceedings{zhao-etal-2026-atir,
title = "{ATIR}: Towards Audio-Text Interleaved Contextual Retrieval",
author = "Zhao, Tong and
Zhang, Chenghao and
Zhu, Yutao and
Dou, Zhicheng",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.1006/",
pages = "22032--22046",
ISBN = "979-8-89176-390-6",
abstract = "Audio carries richer information than text, including emotion, speaker traits, and environmental context, while also enabling lower-latency processing compared to speech-to-text pipelines. However, recent multimodal information retrieval research has predominantly focused on images, largely overlooking audio, especially in the setting of interleaved audio-text contextual retrieval. In this work, we introduce the Audio-Text Interleaved contextual Retrieval (ATIR) task, where queries can alternate between audio and text modalities. We construct an ATIR benchmark by integrating several Automatic Speech Recognition (ASR), QA, and retrieval datasets, ultimately unifying four types of contextual retrieval tasks. This benchmark substantially addresses the limitations of existing audio retrieval datasets in semantic retrieval. To study this task, we evaluate several off-the-shelf retrievers and train our ATIR model based on a Multimodal Large Language Model (MLLM). We further propose a novel token compression mechanism, which is orthogonal to existing compression methods, to mitigate the challenge of excessive audio tokens in MLLM-based ATIR models. Experimental results show that our ATIR model achieves significant improvements over strong baselines."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2026-atir">
<titleInfo>
<title>ATIR: Towards Audio-Text Interleaved Contextual Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenghao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yutao</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhicheng</namePart>
<namePart type="family">Dou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Audio carries richer information than text, including emotion, speaker traits, and environmental context, while also enabling lower-latency processing compared to speech-to-text pipelines. However, recent multimodal information retrieval research has predominantly focused on images, largely overlooking audio, especially in the setting of interleaved audio-text contextual retrieval. In this work, we introduce the Audio-Text Interleaved contextual Retrieval (ATIR) task, where queries can alternate between audio and text modalities. We construct an ATIR benchmark by integrating several Automatic Speech Recognition (ASR), QA, and retrieval datasets, ultimately unifying four types of contextual retrieval tasks. This benchmark substantially addresses the limitations of existing audio retrieval datasets in semantic retrieval. To study this task, we evaluate several off-the-shelf retrievers and train our ATIR model based on a Multimodal Large Language Model (MLLM). We further propose a novel token compression mechanism, which is orthogonal to existing compression methods, to mitigate the challenge of excessive audio tokens in MLLM-based ATIR models. Experimental results show that our ATIR model achieves significant improvements over strong baselines.</abstract>
<identifier type="citekey">zhao-etal-2026-atir</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.1006/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>22032</start>
<end>22046</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ATIR: Towards Audio-Text Interleaved Contextual Retrieval
%A Zhao, Tong
%A Zhang, Chenghao
%A Zhu, Yutao
%A Dou, Zhicheng
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F zhao-etal-2026-atir
%X Audio carries richer information than text, including emotion, speaker traits, and environmental context, while also enabling lower-latency processing compared to speech-to-text pipelines. However, recent multimodal information retrieval research has predominantly focused on images, largely overlooking audio, especially in the setting of interleaved audio-text contextual retrieval. In this work, we introduce the Audio-Text Interleaved contextual Retrieval (ATIR) task, where queries can alternate between audio and text modalities. We construct an ATIR benchmark by integrating several Automatic Speech Recognition (ASR), QA, and retrieval datasets, ultimately unifying four types of contextual retrieval tasks. This benchmark substantially addresses the limitations of existing audio retrieval datasets in semantic retrieval. To study this task, we evaluate several off-the-shelf retrievers and train our ATIR model based on a Multimodal Large Language Model (MLLM). We further propose a novel token compression mechanism, which is orthogonal to existing compression methods, to mitigate the challenge of excessive audio tokens in MLLM-based ATIR models. Experimental results show that our ATIR model achieves significant improvements over strong baselines.
%U https://aclanthology.org/2026.acl-long.1006/
%P 22032-22046
Markdown (Informal)
[ATIR: Towards Audio-Text Interleaved Contextual Retrieval](https://aclanthology.org/2026.acl-long.1006/) (Zhao et al., ACL 2026)
ACL
- Tong Zhao, Chenghao Zhang, Yutao Zhu, and Zhicheng Dou. 2026. ATIR: Towards Audio-Text Interleaved Contextual Retrieval. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 22032–22046, San Diego, California, United States. Association for Computational Linguistics.