@inproceedings{gorman-bedrick-2019-need,
title = "We Need to Talk about Standard Splits",
author = "Gorman, Kyle and
Bedrick, Steven",
editor = "Korhonen, Anna and
Traum, David and
M{\`a}rquez, Llu{\'\i}s",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1267",
doi = "10.18653/v1/P19-1267",
pages = "2786--2791",
abstract = "It is standard practice in speech {\&} language technology to rank systems according to their performance on a test set held out for evaluation. However, few researchers apply statistical tests to determine whether differences in performance are likely to arise by chance, and few examine the stability of system ranking across multiple training-testing splits. We conduct replication and reproduction experiments with nine part-of-speech taggers published between 2000 and 2018, each of which claimed state-of-the-art performance on a widely-used {``}standard split{''}. While we replicate results on the standard split, we fail to reliably reproduce some rankings when we repeat this analysis with randomly generated training-testing splits. We argue that randomly generated splits should be used in system evaluation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="gorman-bedrick-2019-need">
    <titleInfo>
      <title>We Need to Talk about Standard Splits</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Kyle</namePart>
      <namePart type="family">Gorman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Steven</namePart>
      <namePart type="family">Bedrick</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2019-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Korhonen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Traum</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lluís</namePart>
        <namePart type="family">Màrquez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Florence, Italy</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>It is standard practice in speech &amp; language technology to rank systems according to their performance on a test set held out for evaluation. However, few researchers apply statistical tests to determine whether differences in performance are likely to arise by chance, and few examine the stability of system ranking across multiple training-testing splits. We conduct replication and reproduction experiments with nine part-of-speech taggers published between 2000 and 2018, each of which claimed state-of-the-art performance on a widely-used “standard split”. While we replicate results on the standard split, we fail to reliably reproduce some rankings when we repeat this analysis with randomly generated training-testing splits. We argue that randomly generated splits should be used in system evaluation.</abstract>
    <identifier type="citekey">gorman-bedrick-2019-need</identifier>
    <identifier type="doi">10.18653/v1/P19-1267</identifier>
    <location>
      <url>https://aclanthology.org/P19-1267</url>
    </location>
    <part>
      <date>2019-07</date>
      <extent unit="page">
        <start>2786</start>
        <end>2791</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T We Need to Talk about Standard Splits
%A Gorman, Kyle
%A Bedrick, Steven
%Y Korhonen, Anna
%Y Traum, David
%Y Màrquez, Lluís
%S Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
%D 2019
%8 July
%I Association for Computational Linguistics
%C Florence, Italy
%F gorman-bedrick-2019-need
%X It is standard practice in speech & language technology to rank systems according to their performance on a test set held out for evaluation. However, few researchers apply statistical tests to determine whether differences in performance are likely to arise by chance, and few examine the stability of system ranking across multiple training-testing splits. We conduct replication and reproduction experiments with nine part-of-speech taggers published between 2000 and 2018, each of which claimed state-of-the-art performance on a widely-used “standard split”. While we replicate results on the standard split, we fail to reliably reproduce some rankings when we repeat this analysis with randomly generated training-testing splits. We argue that randomly generated splits should be used in system evaluation.
%R 10.18653/v1/P19-1267
%U https://aclanthology.org/P19-1267
%U https://doi.org/10.18653/v1/P19-1267
%P 2786-2791
Markdown (Informal)
[We Need to Talk about Standard Splits](https://aclanthology.org/P19-1267) (Gorman & Bedrick, ACL 2019)
ACL
- Kyle Gorman and Steven Bedrick. 2019. We Need to Talk about Standard Splits. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 2786–2791, Florence, Italy. Association for Computational Linguistics.
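
The abstract above recommends evaluating systems over multiple randomly generated training-testing splits and applying statistical tests rather than relying on a single standard split. The following is a minimal illustrative sketch of that protocol, not the authors' code: it assumes hypothetical tagger wrappers `train_and_tag_a` and `train_and_tag_b` and uses an exact McNemar test (one of the paired significance tests commonly used for tagger comparison) on token-level errors.

```python
"""Illustrative sketch (not the authors' code): compare two POS taggers
over several randomly generated training-testing splits, reporting
per-split accuracies and an exact McNemar p-value on token-level errors."""

import random
from math import comb


def mcnemar_exact(only_a_correct: int, only_b_correct: int) -> float:
    """Two-sided exact McNemar test on the discordant token counts."""
    n = only_a_correct + only_b_correct
    if n == 0:
        return 1.0
    k = min(only_a_correct, only_b_correct)
    # Binomial tail probability under H0: each discordant token is
    # equally likely to favour either system (p = 0.5).
    tail = sum(comb(n, i) for i in range(k + 1)) / 2 ** n
    return min(1.0, 2 * tail)


def evaluate_on_random_splits(sentences, train_and_tag_a, train_and_tag_b,
                              n_splits=20, test_fraction=0.1, seed=0):
    """sentences: list of [(word, gold_tag), ...] pairs per sentence.
    The two callables are hypothetical wrappers that train a tagger on
    `train` and return a flat list of predicted tags for `test` tokens."""
    rng = random.Random(seed)
    results = []
    for _ in range(n_splits):
        shuffled = sentences[:]
        rng.shuffle(shuffled)                      # randomly generated split
        cut = int(len(shuffled) * (1 - test_fraction))
        train, test = shuffled[:cut], shuffled[cut:]
        gold = [tag for sent in test for (_, tag) in sent]
        pred_a = train_and_tag_a(train, test)
        pred_b = train_and_tag_b(train, test)
        correct_a = [p == g for p, g in zip(pred_a, gold)]
        correct_b = [p == g for p, g in zip(pred_b, gold)]
        only_a = sum(a and not b for a, b in zip(correct_a, correct_b))
        only_b = sum(b and not a for a, b in zip(correct_a, correct_b))
        acc_a = sum(correct_a) / len(gold)
        acc_b = sum(correct_b) / len(gold)
        results.append((acc_a, acc_b, mcnemar_exact(only_a, only_b)))
    return results
```

Inspecting whether one system outranks the other on most of the splits, and whether the per-split differences are significant, is the kind of ranking-stability check the abstract argues for and that a single standard-split comparison cannot provide.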