@inproceedings{ye-etal-2022-robust,
title = "Robust Question Answering against Distribution Shifts with Test-Time Adaption: An Empirical Study",
author = "Ye, Hai and
Ding, Yuyang and
Li, Juntao and
Ng, Hwee Tou",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.460",
doi = "10.18653/v1/2022.findings-emnlp.460",
pages = "6179--6192",
abstract = "A deployed question answering (QA) model can easily fail when the test data has a distribution shift compared to the training data. Robustness tuning (RT) methods have been widely studied to enhance model robustness against distribution shifts before model deployment. However, can we improve a model after deployment? To answer this question, we evaluate test-time adaptation (TTA) to improve a model after deployment. We first introduce ColdQA, a unified evaluation benchmark for robust QA against text corruption and changes in language and domain. We then evaluate previous TTA methods on ColdQA and compare them to RT methods. We also propose a novel TTA method called online imitation learning (OIL). Through extensive experiments, we find that TTA is comparable to RT methods, and applying TTA after RT can significantly boost the performance on ColdQA. Our proposed OIL improves TTA to be more robust to variation in hyper-parameters and test distributions over time.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ye-etal-2022-robust">
<titleInfo>
<title>Robust Question Answering against Distribution Shifts with Test-Time Adaption: An Empirical Study</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hai</namePart>
<namePart type="family">Ye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuyang</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juntao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hwee</namePart>
<namePart type="given">Tou</namePart>
<namePart type="family">Ng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A deployed question answering (QA) model can easily fail when the test data has a distribution shift compared to the training data. Robustness tuning (RT) methods have been widely studied to enhance model robustness against distribution shifts before model deployment. However, can we improve a model after deployment? To answer this question, we evaluate test-time adaptation (TTA) to improve a model after deployment. We first introduce ColdQA, a unified evaluation benchmark for robust QA against text corruption and changes in language and domain. We then evaluate previous TTA methods on ColdQA and compare them to RT methods. We also propose a novel TTA method called online imitation learning (OIL). Through extensive experiments, we find that TTA is comparable to RT methods, and applying TTA after RT can significantly boost the performance on ColdQA. Our proposed OIL improves TTA to be more robust to variation in hyper-parameters and test distributions over time.</abstract>
<identifier type="citekey">ye-etal-2022-robust</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.460</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.460</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>6179</start>
<end>6192</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Robust Question Answering against Distribution Shifts with Test-Time Adaption: An Empirical Study
%A Ye, Hai
%A Ding, Yuyang
%A Li, Juntao
%A Ng, Hwee Tou
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F ye-etal-2022-robust
%X A deployed question answering (QA) model can easily fail when the test data has a distribution shift compared to the training data. Robustness tuning (RT) methods have been widely studied to enhance model robustness against distribution shifts before model deployment. However, can we improve a model after deployment? To answer this question, we evaluate test-time adaptation (TTA) to improve a model after deployment. We first introduce ColdQA, a unified evaluation benchmark for robust QA against text corruption and changes in language and domain. We then evaluate previous TTA methods on ColdQA and compare them to RT methods. We also propose a novel TTA method called online imitation learning (OIL). Through extensive experiments, we find that TTA is comparable to RT methods, and applying TTA after RT can significantly boost the performance on ColdQA. Our proposed OIL improves TTA to be more robust to variation in hyper-parameters and test distributions over time.
%R 10.18653/v1/2022.findings-emnlp.460
%U https://aclanthology.org/2022.findings-emnlp.460
%U https://doi.org/10.18653/v1/2022.findings-emnlp.460
%P 6179-6192
Markdown (Informal)
[Robust Question Answering against Distribution Shifts with Test-Time Adaption: An Empirical Study](https://aclanthology.org/2022.findings-emnlp.460) (Ye et al., Findings 2022)
ACL