@inproceedings{pugaliya-etal-2019-bend,
title = "Bend but Don{'}t Break? Multi-Challenge Stress Test for {QA} Models",
author = "Pugaliya, Hemant and
Route, James and
Ma, Kaixin and
Geng, Yixuan and
Nyberg, Eric",
editor = "Fisch, Adam and
Talmor, Alon and
Jia, Robin and
Seo, Minjoon and
Choi, Eunsol and
Chen, Danqi",
booktitle = "Proceedings of the 2nd Workshop on Machine Reading for Question Answering",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D19-5818",
doi = "10.18653/v1/D19-5818",
pages = "125--136",
abstract = "The field of question answering (QA) has seen rapid growth in new tasks and modeling approaches in recent years. Large scale datasets and focus on challenging linguistic phenomena have driven development in neural models, some of which have achieved parity with human performance in limited cases. However, an examination of state-of-the-art model output reveals that a gap remains in reasoning ability compared to a human, and performance tends to degrade when models are exposed to less-constrained tasks. We are interested in more clearly defining the strengths and limitations of leading models across diverse QA challenges, intending to help future researchers with identifying pathways to generalizable performance. We conduct extensive qualitative and quantitative analyses on the results of four models across four datasets and relate common errors to model capabilities. We also illustrate limitations in the datasets we examine and discuss a way forward for achieving generalizable models and datasets that broadly test QA capabilities.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pugaliya-etal-2019-bend">
<titleInfo>
<title>Bend but Don’t Break? Multi-Challenge Stress Test for QA Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hemant</namePart>
<namePart type="family">Pugaliya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Route</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaixin</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yixuan</namePart>
<namePart type="family">Geng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Nyberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Machine Reading for Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adam</namePart>
<namePart type="family">Fisch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alon</namePart>
<namePart type="family">Talmor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robin</namePart>
<namePart type="family">Jia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minjoon</namePart>
<namePart type="family">Seo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eunsol</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danqi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hong Kong, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The field of question answering (QA) has seen rapid growth in new tasks and modeling approaches in recent years. Large scale datasets and focus on challenging linguistic phenomena have driven development in neural models, some of which have achieved parity with human performance in limited cases. However, an examination of state-of-the-art model output reveals that a gap remains in reasoning ability compared to a human, and performance tends to degrade when models are exposed to less-constrained tasks. We are interested in more clearly defining the strengths and limitations of leading models across diverse QA challenges, intending to help future researchers with identifying pathways to generalizable performance. We conduct extensive qualitative and quantitative analyses on the results of four models across four datasets and relate common errors to model capabilities. We also illustrate limitations in the datasets we examine and discuss a way forward for achieving generalizable models and datasets that broadly test QA capabilities.</abstract>
<identifier type="citekey">pugaliya-etal-2019-bend</identifier>
<identifier type="doi">10.18653/v1/D19-5818</identifier>
<location>
<url>https://aclanthology.org/D19-5818</url>
</location>
<part>
<date>2019-11</date>
<extent unit="page">
<start>125</start>
<end>136</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bend but Don’t Break? Multi-Challenge Stress Test for QA Models
%A Pugaliya, Hemant
%A Route, James
%A Ma, Kaixin
%A Geng, Yixuan
%A Nyberg, Eric
%Y Fisch, Adam
%Y Talmor, Alon
%Y Jia, Robin
%Y Seo, Minjoon
%Y Choi, Eunsol
%Y Chen, Danqi
%S Proceedings of the 2nd Workshop on Machine Reading for Question Answering
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F pugaliya-etal-2019-bend
%X The field of question answering (QA) has seen rapid growth in new tasks and modeling approaches in recent years. Large scale datasets and focus on challenging linguistic phenomena have driven development in neural models, some of which have achieved parity with human performance in limited cases. However, an examination of state-of-the-art model output reveals that a gap remains in reasoning ability compared to a human, and performance tends to degrade when models are exposed to less-constrained tasks. We are interested in more clearly defining the strengths and limitations of leading models across diverse QA challenges, intending to help future researchers with identifying pathways to generalizable performance. We conduct extensive qualitative and quantitative analyses on the results of four models across four datasets and relate common errors to model capabilities. We also illustrate limitations in the datasets we examine and discuss a way forward for achieving generalizable models and datasets that broadly test QA capabilities.
%R 10.18653/v1/D19-5818
%U https://aclanthology.org/D19-5818
%U https://doi.org/10.18653/v1/D19-5818
%P 125-136
Markdown (Informal)
[Bend but Don’t Break? Multi-Challenge Stress Test for QA Models](https://aclanthology.org/D19-5818) (Pugaliya et al., 2019)
ACL
Hemant Pugaliya, James Route, Kaixin Ma, Yixuan Geng, and Eric Nyberg. 2019. [Bend but Don’t Break? Multi-Challenge Stress Test for QA Models](https://aclanthology.org/D19-5818). In *Proceedings of the 2nd Workshop on Machine Reading for Question Answering*, pages 125–136, Hong Kong, China. Association for Computational Linguistics.