@inproceedings{bao-etal-2025-alleviating,
title = "Alleviating Performance Degradation Caused by Out-of-Distribution Issues in Embedding-Based Retrieval",
author = "Bao, Haotong and
Zhang, Jianjin and
Chen, Qi and
Han, Weihao and
Zeng, Zhengxin and
Chang, Ruiheng and
Li, Mingzheng and
Sun, Hao and
Deng, Weiwei and
Sun, Feng and
Zhang, Qi",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.340/",
pages = "6418--6427",
ISBN = "979-8-89176-335-7",
abstract = "In Embedding Based Retrieval (EBR), Approximate Nearest Neighbor (ANN) algorithms are widely adopted for efficient large-scale search. However, recent studies reveal a query out-of-distribution (OOD) issue, where query and base embeddings follow mismatched distributions, significantly degrading ANN performance. In this work, we empirically verify the generality of this phenomenon and provide a quantitative analysis. To mitigate the distributional gap, we introduce a distribution regularizer into the encoder training objective, encouraging alignment between query and base embeddings. Extensive experiments across multiple datasets, encoders, and ANN indices show that our method consistently improves retrieval performance."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bao-etal-2025-alleviating">
<titleInfo>
<title>Alleviating Performance Degradation Caused by Out-of-Distribution Issues in Embedding-Based Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haotong</namePart>
<namePart type="family">Bao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianjin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weihao</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengxin</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruiheng</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingzheng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weiwei</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Feng</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>In Embedding Based Retrieval (EBR), Approximate Nearest Neighbor (ANN) algorithms are widely adopted for efficient large-scale search. However, recent studies reveal a query out-of-distribution (OOD) issue, where query and base embeddings follow mismatched distributions, significantly degrading ANN performance. In this work, we empirically verify the generality of this phenomenon and provide a quantitative analysis. To mitigate the distributional gap, we introduce a distribution regularizer into the encoder training objective, encouraging alignment between query and base embeddings. Extensive experiments across multiple datasets, encoders, and ANN indices show that our method consistently improves retrieval performance.</abstract>
<identifier type="citekey">bao-etal-2025-alleviating</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.340/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>6418</start>
<end>6427</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Alleviating Performance Degradation Caused by Out-of-Distribution Issues in Embedding-Based Retrieval
%A Bao, Haotong
%A Zhang, Jianjin
%A Chen, Qi
%A Han, Weihao
%A Zeng, Zhengxin
%A Chang, Ruiheng
%A Li, Mingzheng
%A Sun, Hao
%A Deng, Weiwei
%A Sun, Feng
%A Zhang, Qi
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F bao-etal-2025-alleviating
%X In Embedding Based Retrieval (EBR), Approximate Nearest Neighbor (ANN) algorithms are widely adopted for efficient large-scale search. However, recent studies reveal a query out-of-distribution (OOD) issue, where query and base embeddings follow mismatched distributions, significantly degrading ANN performance. In this work, we empirically verify the generality of this phenomenon and provide a quantitative analysis. To mitigate the distributional gap, we introduce a distribution regularizer into the encoder training objective, encouraging alignment between query and base embeddings. Extensive experiments across multiple datasets, encoders, and ANN indices show that our method consistently improves retrieval performance.
%U https://aclanthology.org/2025.findings-emnlp.340/
%P 6418-6427
Markdown (Informal)
[Alleviating Performance Degradation Caused by Out-of-Distribution Issues in Embedding-Based Retrieval](https://aclanthology.org/2025.findings-emnlp.340/) (Bao et al., Findings 2025)
ACL
- Haotong Bao, Jianjin Zhang, Qi Chen, Weihao Han, Zhengxin Zeng, Ruiheng Chang, Mingzheng Li, Hao Sun, Weiwei Deng, Feng Sun, and Qi Zhang. 2025. Alleviating Performance Degradation Caused by Out-of-Distribution Issues in Embedding-Based Retrieval. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 6418–6427, Suzhou, China. Association for Computational Linguistics.