@inproceedings{zhou-devlin-2021-multi,
title = "Multi-Vector Attention Models for Deep Re-ranking",
author = "Zhou, Giulio and
Devlin, Jacob",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.443",
doi = "10.18653/v1/2021.emnlp-main.443",
pages = "5452--5456",
abstract = "Large-scale document retrieval systems often utilize two styles of neural network models which live at two different ends of the joint computation vs. accuracy spectrum. The first style is dual encoder (or two-tower) models, where the query and document representations are computed completely independently and combined with a simple dot product operation. The second style is cross-attention models, where the query and document features are concatenated in the input layer and all computation is based on the joint query-document representation. Dual encoder models are typically used for retrieval and deep re-ranking, while cross-attention models are typically used for shallow re-ranking. In this paper, we present a lightweight architecture that explores this joint cost vs. accuracy trade-off based on multi-vector attention (MVA). We thoroughly evaluate our method on the MS-MARCO passage retrieval dataset and show how to efficiently trade off retrieval accuracy with joint computation and offline document storage cost. We show that a highly compressed document representation and inexpensive joint computation can be achieved through a combination of learned pooling tokens and aggressive downprojection. Our code and model checkpoints are open-source and available on GitHub.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-devlin-2021-multi">
<titleInfo>
<title>Multi-Vector Attention Models for Deep Re-ranking</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giulio</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacob</namePart>
<namePart type="family">Devlin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large-scale document retrieval systems often utilize two styles of neural network models which live at two different ends of the joint computation vs. accuracy spectrum. The first style is dual encoder (or two-tower) models, where the query and document representations are computed completely independently and combined with a simple dot product operation. The second style is cross-attention models, where the query and document features are concatenated in the input layer and all computation is based on the joint query-document representation. Dual encoder models are typically used for retrieval and deep re-ranking, while cross-attention models are typically used for shallow re-ranking. In this paper, we present a lightweight architecture that explores this joint cost vs. accuracy trade-off based on multi-vector attention (MVA). We thoroughly evaluate our method on the MS-MARCO passage retrieval dataset and show how to efficiently trade off retrieval accuracy with joint computation and offline document storage cost. We show that a highly compressed document representation and inexpensive joint computation can be achieved through a combination of learned pooling tokens and aggressive downprojection. Our code and model checkpoints are open-source and available on GitHub.</abstract>
<identifier type="citekey">zhou-devlin-2021-multi</identifier>
<identifier type="doi">10.18653/v1/2021.emnlp-main.443</identifier>
<location>
<url>https://aclanthology.org/2021.emnlp-main.443</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>5452</start>
<end>5456</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-Vector Attention Models for Deep Re-ranking
%A Zhou, Giulio
%A Devlin, Jacob
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F zhou-devlin-2021-multi
%X Large-scale document retrieval systems often utilize two styles of neural network models which live at two different ends of the joint computation vs. accuracy spectrum. The first style is dual encoder (or two-tower) models, where the query and document representations are computed completely independently and combined with a simple dot product operation. The second style is cross-attention models, where the query and document features are concatenated in the input layer and all computation is based on the joint query-document representation. Dual encoder models are typically used for retrieval and deep re-ranking, while cross-attention models are typically used for shallow re-ranking. In this paper, we present a lightweight architecture that explores this joint cost vs. accuracy trade-off based on multi-vector attention (MVA). We thoroughly evaluate our method on the MS-MARCO passage retrieval dataset and show how to efficiently trade off retrieval accuracy with joint computation and offline document storage cost. We show that a highly compressed document representation and inexpensive joint computation can be achieved through a combination of learned pooling tokens and aggressive downprojection. Our code and model checkpoints are open-source and available on GitHub.
%R 10.18653/v1/2021.emnlp-main.443
%U https://aclanthology.org/2021.emnlp-main.443
%U https://doi.org/10.18653/v1/2021.emnlp-main.443
%P 5452-5456
Markdown (Informal)
[Multi-Vector Attention Models for Deep Re-ranking](https://aclanthology.org/2021.emnlp-main.443) (Zhou & Devlin, EMNLP 2021)
ACL
- Giulio Zhou and Jacob Devlin. 2021. Multi-Vector Attention Models for Deep Re-ranking. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 5452–5456, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.