% Conference paper in the ACL 2025 proceedings (Volume 1: Long Papers); the
% abstract field is carried along for reference managers that display it.
@inproceedings{yin-etal-2025-atri,
    title     = "{ATRI}: Mitigating Multilingual Audio Text Retrieval Inconsistencies by Reducing Data Distribution Errors",
    author    = "Yin, Yuguo and
                 Xie, Yuxin and
                 Yang, Wenyuan and
                 Yang, Dongchao and
                 Ru, Jinghan and
                 Zhuang, Xianwei and
                 Liang, Liming and
                 Zou, Yuexian",
    editor    = "Che, Wanxiang and
                 Nabende, Joyce and
                 Shutova, Ekaterina and
                 Pilehvar, Mohammad Taher",
    booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month     = jul,
    year      = "2025",
    address   = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url       = "https://aclanthology.org/2025.acl-long.273/",
    doi       = "10.18653/v1/2025.acl-long.273",
    pages     = "5491--5504",
    isbn      = "979-8-89176-251-0",
    abstract  = "Multilingual audio-text retrieval (ML-ATR) is a challenging task that aims to retrieve audio clips or multilingual texts from databases. However, existing ML-ATR schemes suffer from inconsistencies for instance similarity matching across languages. To address the inconsistency issue in multilingual audio-text retrieval, we first identify two intuitive factors that contribute to inconsistency: misalignment between audio and multilingual text embeddings, and error propagation in model optimization. By systematically analyzing these factors, we derive theoretical weight error upper bounds for quantifying their effects and find that the main source of inconsistency is the data distribution error during training. This finding motivates our solution to reduce data distribution errors. We propose a consistent ML-ATR scheme using 1-to-k contrastive learning and audio-English co-anchor contrastive learning, aiming to mitigate the negative impact of data distribution error on recall and consistency in ML-ATR. Experimental results on the translated AudioCaps and Clotho datasets show that our scheme achieves state-of-the-art performance on recall and consistency metrics for eight mainstream languages, including English. Our code will be available at https://github.com/ATRI-ACL/ATRI-ACL."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for the paper with citekey yin-etal-2025-atri. -->
  <mods ID="yin-etal-2025-atri">
    <titleInfo>
      <title>ATRI: Mitigating Multilingual Audio Text Retrieval Inconsistencies by Reducing Data Distribution Errors</title>
    </titleInfo>
    <!-- Authors, in the order listed on the paper. -->
    <name type="personal">
      <namePart type="given">Yuguo</namePart>
      <namePart type="family">Yin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuxin</namePart>
      <namePart type="family">Xie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenyuan</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dongchao</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jinghan</namePart>
      <namePart type="family">Ru</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xianwei</namePart>
      <namePart type="family">Zhuang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Liming</namePart>
      <namePart type="family">Liang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuexian</namePart>
      <namePart type="family">Zou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume containing the paper, with its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-251-0</identifier>
    </relatedItem>
    <abstract>Multilingual audio-text retrieval (ML-ATR) is a challenging task that aims to retrieve audio clips or multilingual texts from databases. However, existing ML-ATR schemes suffer from inconsistencies for instance similarity matching across languages. To address the inconsistency issue in multilingual audio-text retrieval, we first identify two intuitive factors that contribute to inconsistency: misalignment between audio and multilingual text embeddings, and error propagation in model optimization. By systematically analyzing these factors, we derive theoretical weight error upper bounds for quantifying their effects and find that the main source of inconsistency is the data distribution error during training. This finding motivates our solution to reduce data distribution errors. We propose a consistent ML-ATR scheme using 1-to-k contrastive learning and audio-English co-anchor contrastive learning, aiming to mitigate the negative impact of data distribution error on recall and consistency in ML-ATR. Experimental results on the translated AudioCaps and Clotho datasets show that our scheme achieves state-of-the-art performance on recall and consistency metrics for eight mainstream languages, including English. Our code will be available at https://github.com/ATRI-ACL/ATRI-ACL.</abstract>
    <identifier type="citekey">yin-etal-2025-atri</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-long.273</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-long.273/</url>
    </location>
    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>5491</start>
        <end>5504</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ATRI: Mitigating Multilingual Audio Text Retrieval Inconsistencies by Reducing Data Distribution Errors
%A Yin, Yuguo
%A Xie, Yuxin
%A Yang, Wenyuan
%A Yang, Dongchao
%A Ru, Jinghan
%A Zhuang, Xianwei
%A Liang, Liming
%A Zou, Yuexian
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F yin-etal-2025-atri
%X Multilingual audio-text retrieval (ML-ATR) is a challenging task that aims to retrieve audio clips or multilingual texts from databases. However, existing ML-ATR schemes suffer from inconsistencies for instance similarity matching across languages. To address the inconsistency issue in multilingual audio-text retrieval, we first identify two intuitive factors that contribute to inconsistency: misalignment between audio and multilingual text embeddings, and error propagation in model optimization. By systematically analyzing these factors, we derive theoretical weight error upper bounds for quantifying their effects and find that the main source of inconsistency is the data distribution error during training. This finding motivates our solution to reduce data distribution errors. We propose a consistent ML-ATR scheme using 1-to-k contrastive learning and audio-English co-anchor contrastive learning, aiming to mitigate the negative impact of data distribution error on recall and consistency in ML-ATR. Experimental results on the translated AudioCaps and Clotho datasets show that our scheme achieves state-of-the-art performance on recall and consistency metrics for eight mainstream languages, including English. Our code will be available at https://github.com/ATRI-ACL/ATRI-ACL.
%R 10.18653/v1/2025.acl-long.273
%U https://aclanthology.org/2025.acl-long.273/
%U https://doi.org/10.18653/v1/2025.acl-long.273
%P 5491-5504
Markdown (Informal)
[ATRI: Mitigating Multilingual Audio Text Retrieval Inconsistencies by Reducing Data Distribution Errors](https://aclanthology.org/2025.acl-long.273/) (Yin et al., ACL 2025)
ACL
- Yuguo Yin, Yuxin Xie, Wenyuan Yang, Dongchao Yang, Jinghan Ru, Xianwei Zhuang, Liming Liang, and Yuexian Zou. 2025. ATRI: Mitigating Multilingual Audio Text Retrieval Inconsistencies by Reducing Data Distribution Errors. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 5491–5504, Vienna, Austria. Association for Computational Linguistics.