@inproceedings{godbole-jia-2023-benchmarking,
title = "Benchmarking Long-tail Generalization with Likelihood Splits",
author = "Godbole, Ameya and
Jia, Robin",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-eacl.71",
doi = "10.18653/v1/2023.findings-eacl.71",
pages = "963--983",
abstract = "In order to reliably process natural language, NLP systems must generalize to the long tail of rare utterances. We propose a method to create challenging benchmarks that require generalizing to the tail of the distribution by re-splitting existing datasets. We create {`}Likelihood Splits{'} where examples that are assigned lower likelihood by a pre-trained language model (LM) are placed in the test set, and more likely examples are in the training set. This simple approach can be customized to construct meaningful train-test splits for a wide range of tasks. Likelihood Splits surface more challenges than random splits: relative error rates of state-of-the-art models increase by 59{\%} for semantic parsing on Spider, 93{\%} for natural language inference on SNLI, and 33{\%} for yes/no question answering on BoolQ, on our splits compared with the corresponding random splits. Moreover, Likelihood Splits create fairer benchmarks than adversarial filtering; when the LM used to create the splits is also employed as the task model, our splits do not unfairly penalize the LM.",
}
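The abstract describes the core recipe behind Likelihood Splits: score each example with a pre-trained language model and send the lowest-likelihood examples to the test set. As an informal illustration only, the Python sketch below shows one way such a split could be built with an off-the-shelf HuggingFace GPT-2; the model choice, the average-per-token scoring, the 20% test fraction, and the "text" field are assumptions for illustration, not the authors' exact procedure (see the paper for the actual details).

import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Illustrative setup: any causal LM could be used; "gpt2" is an assumption.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

@torch.no_grad()
def avg_log_likelihood(text: str) -> float:
    # Passing labels=input_ids makes the model return the mean token
    # cross-entropy, so its negation is the average per-token log-likelihood.
    ids = tokenizer(text, return_tensors="pt").input_ids
    return -model(ids, labels=ids).loss.item()

def likelihood_split(examples, test_fraction=0.2, key=lambda ex: ex["text"]):
    # Rank examples from least to most likely under the LM and place the
    # least likely (long-tail) fraction in the test set, the rest in train.
    ranked = sorted(examples, key=lambda ex: avg_log_likelihood(key(ex)))
    n_test = int(len(ranked) * test_fraction)
    return ranked[n_test:], ranked[:n_test]  # (train_set, test_set)

Scoring by the average per-token log-likelihood rather than the sum is one simple way to avoid conflating rarity with input length in a sketch like this.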
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="godbole-jia-2023-benchmarking">
    <titleInfo>
      <title>Benchmarking Long-tail Generalization with Likelihood Splits</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ameya</namePart>
      <namePart type="family">Godbole</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Robin</namePart>
      <namePart type="family">Jia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2023</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In order to reliably process natural language, NLP systems must generalize to the long tail of rare utterances. We propose a method to create challenging benchmarks that require generalizing to the tail of the distribution by re-splitting existing datasets. We create ‘Likelihood Splits’ where examples that are assigned lower likelihood by a pre-trained language model (LM) are placed in the test set, and more likely examples are in the training set. This simple approach can be customized to construct meaningful train-test splits for a wide range of tasks. Likelihood Splits surface more challenges than random splits: relative error rates of state-of-the-art models increase by 59% for semantic parsing on Spider, 93% for natural language inference on SNLI, and 33% for yes/no question answering on BoolQ, on our splits compared with the corresponding random splits. Moreover, Likelihood Splits create fairer benchmarks than adversarial filtering; when the LM used to create the splits is also employed as the task model, our splits do not unfairly penalize the LM.</abstract>
    <identifier type="citekey">godbole-jia-2023-benchmarking</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-eacl.71</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-eacl.71</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>963</start>
        <end>983</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking Long-tail Generalization with Likelihood Splits
%A Godbole, Ameya
%A Jia, Robin
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F godbole-jia-2023-benchmarking
%X In order to reliably process natural language, NLP systems must generalize to the long tail of rare utterances. We propose a method to create challenging benchmarks that require generalizing to the tail of the distribution by re-splitting existing datasets. We create ‘Likelihood Splits’ where examples that are assigned lower likelihood by a pre-trained language model (LM) are placed in the test set, and more likely examples are in the training set. This simple approach can be customized to construct meaningful train-test splits for a wide range of tasks. Likelihood Splits surface more challenges than random splits: relative error rates of state-of-the-art models increase by 59% for semantic parsing on Spider, 93% for natural language inference on SNLI, and 33% for yes/no question answering on BoolQ, on our splits compared with the corresponding random splits. Moreover, Likelihood Splits create fairer benchmarks than adversarial filtering; when the LM used to create the splits is also employed as the task model, our splits do not unfairly penalize the LM.
%R 10.18653/v1/2023.findings-eacl.71
%U https://aclanthology.org/2023.findings-eacl.71
%U https://doi.org/10.18653/v1/2023.findings-eacl.71
%P 963-983
Markdown (Informal)
[Benchmarking Long-tail Generalization with Likelihood Splits](https://aclanthology.org/2023.findings-eacl.71) (Godbole & Jia, Findings 2023)
ACL
Ameya Godbole and Robin Jia. 2023. Benchmarking Long-tail Generalization with Likelihood Splits. In Findings of the Association for Computational Linguistics: EACL 2023, pages 963–983, Dubrovnik, Croatia. Association for Computational Linguistics.