@inproceedings{cohen-etal-2024-extremely,
title = "Extremely efficient online query encoding for dense retrieval",
author = "Cohen, Nachshon and
Fairstein, Yaron and
Kushilevitz, Guy",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.4",
doi = "10.18653/v1/2024.findings-naacl.4",
pages = "43--50",
    abstract = "Existing dense retrieval systems utilize the same model architecture for encoding both the passages and the queries, even though queries are much shorter and simpler than passages. This leads to high latency of the query encoding, which is performed online and therefore might impact user experience. We show that combining a standard large passage encoder with a small efficient query encoder can provide significant latency drops with only a small decrease in quality. We offer a pretraining and training solution for multiple small query encoder architectures. Using a small transformer architecture, we are able to decrease latency by up to $\sim12\times$, while $MRR@10$ on the MS MARCO dev set only decreases from 38.2 to 36.2. If this solution does not reach the desired latency requirements, we propose an efficient RNN as the query encoder, which processes the query prefix incrementally and only infers the last word after the query is issued. This shortens latency by $\sim38\times$ with only a minor drop in quality, reaching an $MRR@10$ score of 35.5.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="cohen-etal-2024-extremely">
    <titleInfo>
      <title>Extremely efficient online query encoding for dense retrieval</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Nachshon</namePart>
      <namePart type="family">Cohen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yaron</namePart>
      <namePart type="family">Fairstein</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guy</namePart>
      <namePart type="family">Kushilevitz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kevin</namePart>
        <namePart type="family">Duh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Helena</namePart>
        <namePart type="family">Gomez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Bethard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Mexico City, Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Existing dense retrieval systems utilize the same model architecture for encoding both the passages and the queries, even though queries are much shorter and simpler than passages. This leads to high latency of the query encoding, which is performed online and therefore might impact user experience. We show that combining a standard large passage encoder with a small efficient query encoder can provide significant latency drops with only a small decrease in quality. We offer a pretraining and training solution for multiple small query encoder architectures. Using a small transformer architecture, we are able to decrease latency by up to ~12×, while MRR@10 on the MS MARCO dev set only decreases from 38.2 to 36.2. If this solution does not reach the desired latency requirements, we propose an efficient RNN as the query encoder, which processes the query prefix incrementally and only infers the last word after the query is issued. This shortens latency by ~38× with only a minor drop in quality, reaching an MRR@10 score of 35.5.</abstract>
    <identifier type="citekey">cohen-etal-2024-extremely</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-naacl.4</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-naacl.4</url>
    </location>
    <part>
      <date>2024-06</date>
      <extent unit="page">
        <start>43</start>
        <end>50</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Extremely efficient online query encoding for dense retrieval
%A Cohen, Nachshon
%A Fairstein, Yaron
%A Kushilevitz, Guy
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F cohen-etal-2024-extremely
%X Existing dense retrieval systems utilize the same model architecture for encoding both the passages and the queries, even though queries are much shorter and simpler than passages. This leads to high latency of the query encoding, which is performed online and therefore might impact user experience. We show that combining a standard large passage encoder with a small efficient query encoder can provide significant latency drops with only a small decrease in quality. We offer a pretraining and training solution for multiple small query encoder architectures. Using a small transformer architecture, we are able to decrease latency by up to ~12×, while MRR@10 on the MS MARCO dev set only decreases from 38.2 to 36.2. If this solution does not reach the desired latency requirements, we propose an efficient RNN as the query encoder, which processes the query prefix incrementally and only infers the last word after the query is issued. This shortens latency by ~38× with only a minor drop in quality, reaching an MRR@10 score of 35.5.
%R 10.18653/v1/2024.findings-naacl.4
%U https://aclanthology.org/2024.findings-naacl.4
%U https://doi.org/10.18653/v1/2024.findings-naacl.4
%P 43-50
Markdown (Informal)
[Extremely efficient online query encoding for dense retrieval](https://aclanthology.org/2024.findings-naacl.4) (Cohen et al., Findings 2024)

ACL
Nachshon Cohen, Yaron Fairstein, and Guy Kushilevitz. 2024. Extremely efficient online query encoding for dense retrieval. In Findings of the Association for Computational Linguistics: NAACL 2024, pages 43–50, Mexico City, Mexico. Association for Computational Linguistics.