@article{trivedi-etal-2022-musique,
title = "♫ {M}u{S}i{Q}ue: Multihop Questions via Single-hop Question Composition",
author = "Trivedi, Harsh and
Balasubramanian, Niranjan and
Khot, Tushar and
Sabharwal, Ashish",
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "10",
year = "2022",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2022.tacl-1.31",
doi = "10.1162/tacl_a_00475",
pages = "539--554",
abstract = "Multihop reasoning remains an elusive goal as existing multihop benchmarks are known to be largely solvable via shortcuts. Can we create a question answering (QA) dataset that, by construction, requires proper multihop reasoning? To this end, we introduce a bottom{--}up approach that systematically selects composable pairs of single-hop questions that are connected, that is, where one reasoning step critically relies on information from another. This bottom{--}up methodology lets us explore a vast space of questions and add stringent filters as well as other mechanisms targeting connected reasoning. It provides fine-grained control over the construction process and the properties of the resulting k-hop questions. We use this methodology to create MuSiQue-Ans, a new multihop QA dataset with 25K 2{--}4 hop questions. Relative to existing datasets, MuSiQue-Ans is more difficult overall (3{\mbox{$\times$}} increase in human{--}machine gap), and harder to cheat via disconnected reasoning (e.g., a single-hop model has a 30-point drop in F1). We further add unanswerable contrast questions to produce a more stringent dataset, MuSiQue-Full. We hope our datasets will help the NLP community develop models that perform genuine multihop reasoning.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="trivedi-etal-2022-musique">
<titleInfo>
<title>♫ MuSiQue: Multihop Questions via Single-hop Question Composition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Harsh</namePart>
<namePart type="family">Trivedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niranjan</namePart>
<namePart type="family">Balasubramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tushar</namePart>
<namePart type="family">Khot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashish</namePart>
<namePart type="family">Sabharwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Multihop reasoning remains an elusive goal as existing multihop benchmarks are known to be largely solvable via shortcuts. Can we create a question answering (QA) dataset that, by construction, requires proper multihop reasoning? To this end, we introduce a bottom–up approach that systematically selects composable pairs of single-hop questions that are connected, that is, where one reasoning step critically relies on information from another. This bottom–up methodology lets us explore a vast space of questions and add stringent filters as well as other mechanisms targeting connected reasoning. It provides fine-grained control over the construction process and the properties of the resulting k-hop questions. We use this methodology to create MuSiQue-Ans, a new multihop QA dataset with 25K 2–4 hop questions. Relative to existing datasets, MuSiQue-Ans is more difficult overall (3× increase in human–machine gap), and harder to cheat via disconnected reasoning (e.g., a single-hop model has a 30-point drop in F1). We further add unanswerable contrast questions to produce a more stringent dataset, MuSiQue-Full. We hope our datasets will help the NLP community develop models that perform genuine multihop reasoning.</abstract>
<identifier type="citekey">trivedi-etal-2022-musique</identifier>
<identifier type="doi">10.1162/tacl_a_00475</identifier>
<location>
<url>https://aclanthology.org/2022.tacl-1.31</url>
</location>
<part>
<date>2022</date>
<detail type="volume"><number>10</number></detail>
<extent unit="page">
<start>539</start>
<end>554</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T ♫ MuSiQue: Multihop Questions via Single-hop Question Composition
%A Trivedi, Harsh
%A Balasubramanian, Niranjan
%A Khot, Tushar
%A Sabharwal, Ashish
%J Transactions of the Association for Computational Linguistics
%D 2022
%V 10
%I MIT Press
%C Cambridge, MA
%F trivedi-etal-2022-musique
%X Multihop reasoning remains an elusive goal as existing multihop benchmarks are known to be largely solvable via shortcuts. Can we create a question answering (QA) dataset that, by construction, requires proper multihop reasoning? To this end, we introduce a bottom–up approach that systematically selects composable pairs of single-hop questions that are connected, that is, where one reasoning step critically relies on information from another. This bottom–up methodology lets us explore a vast space of questions and add stringent filters as well as other mechanisms targeting connected reasoning. It provides fine-grained control over the construction process and the properties of the resulting k-hop questions. We use this methodology to create MuSiQue-Ans, a new multihop QA dataset with 25K 2–4 hop questions. Relative to existing datasets, MuSiQue-Ans is more difficult overall (3× increase in human–machine gap), and harder to cheat via disconnected reasoning (e.g., a single-hop model has a 30-point drop in F1). We further add unanswerable contrast questions to produce a more stringent dataset, MuSiQue-Full. We hope our datasets will help the NLP community develop models that perform genuine multihop reasoning.
%R 10.1162/tacl_a_00475
%U https://aclanthology.org/2022.tacl-1.31
%U https://doi.org/10.1162/tacl_a_00475
%P 539-554
Markdown (Informal)
[♫ MuSiQue: Multihop Questions via Single-hop Question Composition](https://aclanthology.org/2022.tacl-1.31) (Trivedi et al., TACL 2022)
ACL