@inproceedings{magnusson-etal-2023-reproducibility,
title = "Reproducibility in {NLP}: What Have We Learned from the Checklist?",
author = "Magnusson, Ian and
Smith, Noah A. and
Dodge, Jesse",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.809/",
doi = "10.18653/v1/2023.findings-acl.809",
pages = "12789--12811",
abstract = "Scientific progress in NLP rests on the reproducibility of researchers' claims. The *CL conferences created the NLP Reproducibility Checklist in 2020 to be completed by authors at submission to remind them of key information to include. We provide the first analysis of the Checklist by examining 10,405 anonymous responses to it. First, we find evidence of an increase in reporting of information on efficiency, validation performance, summary statistics, and hyperparameters after the Checklist`s introduction. Further, we show acceptance rate grows for submissions with more Yes responses. We find that the 44{\%} of submissions that gather new data are 5{\%} less likely to be accepted than those that did not; the average reviewer-rated reproducibility of these submissions is also 2{\%} lower relative to the rest. We find that only 46{\%} of submissions claim to open-source their code, though submissions that do have 8{\%} higher reproducibility score relative to those that do not, the most for any item. We discuss what can be inferred about the state of reproducibility in NLP, and provide a set of recommendations for future conferences, including: a) allowing submitting code and appendices one week after the deadline, and b) measuring dataset reproducibility by a checklist of data collection practices."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="magnusson-etal-2023-reproducibility">
    <titleInfo>
      <title>Reproducibility in NLP: What Have We Learned from the Checklist?</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ian</namePart>
      <namePart type="family">Magnusson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Noah</namePart>
      <namePart type="given">A</namePart>
      <namePart type="family">Smith</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jesse</namePart>
      <namePart type="family">Dodge</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Rogers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jordan</namePart>
        <namePart type="family">Boyd-Graber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Naoaki</namePart>
        <namePart type="family">Okazaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Toronto, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Scientific progress in NLP rests on the reproducibility of researchers’ claims. The *CL conferences created the NLP Reproducibility Checklist in 2020 to be completed by authors at submission to remind them of key information to include. We provide the first analysis of the Checklist by examining 10,405 anonymous responses to it. First, we find evidence of an increase in reporting of information on efficiency, validation performance, summary statistics, and hyperparameters after the Checklist’s introduction. Further, we show acceptance rate grows for submissions with more Yes responses. We find that the 44% of submissions that gather new data are 5% less likely to be accepted than those that did not; the average reviewer-rated reproducibility of these submissions is also 2% lower relative to the rest. We find that only 46% of submissions claim to open-source their code, though submissions that do have 8% higher reproducibility score relative to those that do not, the most for any item. We discuss what can be inferred about the state of reproducibility in NLP, and provide a set of recommendations for future conferences, including: a) allowing submitting code and appendices one week after the deadline, and b) measuring dataset reproducibility by a checklist of data collection practices.</abstract>
    <identifier type="citekey">magnusson-etal-2023-reproducibility</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-acl.809</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-acl.809/</url>
    </location>
    <part>
      <date>2023-07</date>
      <extent unit="page">
        <start>12789</start>
        <end>12811</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Reproducibility in NLP: What Have We Learned from the Checklist?
%A Magnusson, Ian
%A Smith, Noah A.
%A Dodge, Jesse
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F magnusson-etal-2023-reproducibility
%X Scientific progress in NLP rests on the reproducibility of researchers’ claims. The *CL conferences created the NLP Reproducibility Checklist in 2020 to be completed by authors at submission to remind them of key information to include. We provide the first analysis of the Checklist by examining 10,405 anonymous responses to it. First, we find evidence of an increase in reporting of information on efficiency, validation performance, summary statistics, and hyperparameters after the Checklist’s introduction. Further, we show acceptance rate grows for submissions with more Yes responses. We find that the 44% of submissions that gather new data are 5% less likely to be accepted than those that did not; the average reviewer-rated reproducibility of these submissions is also 2% lower relative to the rest. We find that only 46% of submissions claim to open-source their code, though submissions that do have 8% higher reproducibility score relative to those that do not, the most for any item. We discuss what can be inferred about the state of reproducibility in NLP, and provide a set of recommendations for future conferences, including: a) allowing submitting code and appendices one week after the deadline, and b) measuring dataset reproducibility by a checklist of data collection practices.
%R 10.18653/v1/2023.findings-acl.809
%U https://aclanthology.org/2023.findings-acl.809/
%U https://doi.org/10.18653/v1/2023.findings-acl.809
%P 12789-12811
Markdown (Informal)
[Reproducibility in NLP: What Have We Learned from the Checklist?](https://aclanthology.org/2023.findings-acl.809/) (Magnusson et al., Findings 2023)
ACL
Ian Magnusson, Noah A. Smith, and Jesse Dodge. 2023. Reproducibility in NLP: What Have We Learned from the Checklist?. In Findings of the Association for Computational Linguistics: ACL 2023, pages 12789–12811, Toronto, Canada. Association for Computational Linguistics.