@inproceedings{li-etal-2022-encoder,
title = "An Encoder Attribution Analysis for Dense Passage Retriever in Open-Domain Question Answering",
author = "Li, Minghan and
Ma, Xueguang and
Lin, Jimmy",
editor = "Verma, Apurv and
Pruksachatkun, Yada and
Chang, Kai-Wei and
Galstyan, Aram and
Dhamala, Jwala and
Cao, Yang Trista",
booktitle = "Proceedings of the 2nd Workshop on Trustworthy Natural Language Processing (TrustNLP 2022)",
month = jul,
year = "2022",
address = "Seattle, U.S.A.",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.trustnlp-1.1",
doi = "10.18653/v1/2022.trustnlp-1.1",
pages = "1--11",
abstract = "The bi-encoder design of dense passage retriever (DPR) is a key factor to its success in open-domain question answering (QA), yet it is unclear how DPR{'}s question encoder and passage encoder individually contributes to overall performance, which we refer to as the encoder attribution problem. The problem is important as it helps us identify the factors that affect individual encoders to further improve overall performance. In this paper, we formulate our analysis under a probabilistic framework called encoder marginalization, where we quantify the contribution of a single encoder by marginalizing other variables. First, we find that the passage encoder contributes more than the question encoder to in-domain retrieval accuracy. Second, we demonstrate how to find the affecting factors for each encoder, where we train DPR with different amounts of data and use encoder marginalization to analyze the results. We find that positive passage overlap and corpus coverage of training data have big impacts on the passage encoder, while the question encoder is mainly affected by training sample complexity under this setting. Based on this framework, we can devise data-efficient training regimes: for example, we manage to train a passage encoder on SQuAD using 60{\%} less training data without loss of accuracy.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="li-etal-2022-encoder">
    <titleInfo>
      <title>An Encoder Attribution Analysis for Dense Passage Retriever in Open-Domain Question Answering</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Minghan</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xueguang</namePart>
      <namePart type="family">Ma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jimmy</namePart>
      <namePart type="family">Lin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd Workshop on Trustworthy Natural Language Processing (TrustNLP 2022)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Apurv</namePart>
        <namePart type="family">Verma</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yada</namePart>
        <namePart type="family">Pruksachatkun</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kai-Wei</namePart>
        <namePart type="family">Chang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Aram</namePart>
        <namePart type="family">Galstyan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jwala</namePart>
        <namePart type="family">Dhamala</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="given">Trista</namePart>
        <namePart type="family">Cao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Seattle, U.S.A.</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The bi-encoder design of dense passage retriever (DPR) is a key factor to its success in open-domain question answering (QA), yet it is unclear how DPR’s question encoder and passage encoder individually contributes to overall performance, which we refer to as the encoder attribution problem. The problem is important as it helps us identify the factors that affect individual encoders to further improve overall performance. In this paper, we formulate our analysis under a probabilistic framework called encoder marginalization, where we quantify the contribution of a single encoder by marginalizing other variables. First, we find that the passage encoder contributes more than the question encoder to in-domain retrieval accuracy. Second, we demonstrate how to find the affecting factors for each encoder, where we train DPR with different amounts of data and use encoder marginalization to analyze the results. We find that positive passage overlap and corpus coverage of training data have big impacts on the passage encoder, while the question encoder is mainly affected by training sample complexity under this setting. Based on this framework, we can devise data-efficient training regimes: for example, we manage to train a passage encoder on SQuAD using 60% less training data without loss of accuracy.</abstract>
    <identifier type="citekey">li-etal-2022-encoder</identifier>
    <identifier type="doi">10.18653/v1/2022.trustnlp-1.1</identifier>
    <location>
      <url>https://aclanthology.org/2022.trustnlp-1.1</url>
    </location>
    <part>
      <date>2022-07</date>
      <extent unit="page">
        <start>1</start>
        <end>11</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T An Encoder Attribution Analysis for Dense Passage Retriever in Open-Domain Question Answering
%A Li, Minghan
%A Ma, Xueguang
%A Lin, Jimmy
%Y Verma, Apurv
%Y Pruksachatkun, Yada
%Y Chang, Kai-Wei
%Y Galstyan, Aram
%Y Dhamala, Jwala
%Y Cao, Yang Trista
%S Proceedings of the 2nd Workshop on Trustworthy Natural Language Processing (TrustNLP 2022)
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, U.S.A.
%F li-etal-2022-encoder
%X The bi-encoder design of dense passage retriever (DPR) is a key factor to its success in open-domain question answering (QA), yet it is unclear how DPR’s question encoder and passage encoder individually contributes to overall performance, which we refer to as the encoder attribution problem. The problem is important as it helps us identify the factors that affect individual encoders to further improve overall performance. In this paper, we formulate our analysis under a probabilistic framework called encoder marginalization, where we quantify the contribution of a single encoder by marginalizing other variables. First, we find that the passage encoder contributes more than the question encoder to in-domain retrieval accuracy. Second, we demonstrate how to find the affecting factors for each encoder, where we train DPR with different amounts of data and use encoder marginalization to analyze the results. We find that positive passage overlap and corpus coverage of training data have big impacts on the passage encoder, while the question encoder is mainly affected by training sample complexity under this setting. Based on this framework, we can devise data-efficient training regimes: for example, we manage to train a passage encoder on SQuAD using 60% less training data without loss of accuracy.
%R 10.18653/v1/2022.trustnlp-1.1
%U https://aclanthology.org/2022.trustnlp-1.1
%U https://doi.org/10.18653/v1/2022.trustnlp-1.1
%P 1-11
Markdown (Informal)
[An Encoder Attribution Analysis for Dense Passage Retriever in Open-Domain Question Answering](https://aclanthology.org/2022.trustnlp-1.1) (Li et al., TrustNLP 2022)
ACL
Minghan Li, Xueguang Ma, and Jimmy Lin. 2022. An Encoder Attribution Analysis for Dense Passage Retriever in Open-Domain Question Answering. In Proceedings of the 2nd Workshop on Trustworthy Natural Language Processing (TrustNLP 2022), pages 1–11, Seattle, U.S.A. Association for Computational Linguistics.