@inproceedings{mishra-sachdeva-2020-need,
title = "Do We Need to Create Big Datasets to Learn a Task?",
author = "Mishra, Swaroop and
Sachdeva, Bhavdeep Singh",
editor = "Moosavi, Nafise Sadat and
Fan, Angela and
Shwartz, Vered and
Glava{\v{s}}, Goran and
Joty, Shafiq and
Wang, Alex and
Wolf, Thomas",
booktitle = "Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.sustainlp-1.23",
doi = "10.18653/v1/2020.sustainlp-1.23",
pages = "169--173",
abstract = "Deep Learning research has been largely accelerated by the development of huge datasets such as Imagenet. The general trend has been to create big datasets to make a deep neural network learn. A huge amount of resources is being spent in creating these big datasets, developing models, training them, and iterating this process to dominate leaderboards. We argue that the trend of creating bigger datasets needs to be revised by better leveraging the power of pre-trained language models. Since the language models have already been pre-trained with huge amount of data and have basic linguistic knowledge, there is no need to create big datasets to learn a task. Instead, we need to create a dataset that is sufficient for the model to learn various task-specific terminologies, such as {`}Entailment{'}, {`}Neutral{'}, and {`}Contradiction{'} for NLI. As evidence, we show that RoBERTA is able to achieve near-equal performance on 2{\%} data of SNLI. We also observe competitive zero-shot generalization on several OOD datasets. In this paper, we propose a baseline algorithm to find the optimal dataset for learning a task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mishra-sachdeva-2020-need">
<titleInfo>
<title>Do We Need to Create Big Datasets to Learn a Task?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Swaroop</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bhavdeep</namePart>
<namePart type="given">Singh</namePart>
<namePart type="family">Sachdeva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nafise</namePart>
<namePart type="given">Sadat</namePart>
<namePart type="family">Moosavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angela</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vered</namePart>
<namePart type="family">Shwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Goran</namePart>
<namePart type="family">Glavaš</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shafiq</namePart>
<namePart type="family">Joty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Wolf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Deep Learning research has been largely accelerated by the development of huge datasets such as ImageNet. The general trend has been to create big datasets to make a deep neural network learn. A huge amount of resources is being spent on creating these big datasets, developing models, training them, and iterating this process to dominate leaderboards. We argue that the trend of creating bigger datasets needs to be reconsidered by better leveraging the power of pre-trained language models. Since language models have already been pre-trained on a huge amount of data and have basic linguistic knowledge, there is no need to create big datasets to learn a task. Instead, we need to create a dataset that is sufficient for the model to learn various task-specific terms, such as ‘Entailment’, ‘Neutral’, and ‘Contradiction’ for NLI. As evidence, we show that RoBERTa achieves near-equal performance when trained on just 2% of the SNLI data. We also observe competitive zero-shot generalization on several out-of-distribution (OOD) datasets. In this paper, we propose a baseline algorithm to find the optimal dataset for learning a task.</abstract>
<identifier type="citekey">mishra-sachdeva-2020-need</identifier>
<identifier type="doi">10.18653/v1/2020.sustainlp-1.23</identifier>
<location>
<url>https://aclanthology.org/2020.sustainlp-1.23</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>169</start>
<end>173</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do We Need to Create Big Datasets to Learn a Task?
%A Mishra, Swaroop
%A Sachdeva, Bhavdeep Singh
%Y Moosavi, Nafise Sadat
%Y Fan, Angela
%Y Shwartz, Vered
%Y Glavaš, Goran
%Y Joty, Shafiq
%Y Wang, Alex
%Y Wolf, Thomas
%S Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F mishra-sachdeva-2020-need
%X Deep Learning research has been largely accelerated by the development of huge datasets such as ImageNet. The general trend has been to create big datasets to make a deep neural network learn. A huge amount of resources is being spent on creating these big datasets, developing models, training them, and iterating this process to dominate leaderboards. We argue that the trend of creating bigger datasets needs to be reconsidered by better leveraging the power of pre-trained language models. Since language models have already been pre-trained on a huge amount of data and have basic linguistic knowledge, there is no need to create big datasets to learn a task. Instead, we need to create a dataset that is sufficient for the model to learn various task-specific terms, such as ‘Entailment’, ‘Neutral’, and ‘Contradiction’ for NLI. As evidence, we show that RoBERTa achieves near-equal performance when trained on just 2% of the SNLI data. We also observe competitive zero-shot generalization on several out-of-distribution (OOD) datasets. In this paper, we propose a baseline algorithm to find the optimal dataset for learning a task.
%R 10.18653/v1/2020.sustainlp-1.23
%U https://aclanthology.org/2020.sustainlp-1.23
%U https://doi.org/10.18653/v1/2020.sustainlp-1.23
%P 169-173
Markdown (Informal)
[Do We Need to Create Big Datasets to Learn a Task?](https://aclanthology.org/2020.sustainlp-1.23) (Mishra & Sachdeva, sustainlp 2020)
ACL
Swaroop Mishra and Bhavdeep Singh Sachdeva. 2020. Do We Need to Create Big Datasets to Learn a Task? In Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing, pages 169–173, Online. Association for Computational Linguistics.
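
As a rough illustration of the experiment the abstract describes, the sketch below fine-tunes roberta-base on a ~2% random subset of SNLI using the Hugging Face datasets and transformers libraries. This is a minimal sketch under assumed settings (plain random sampling, roberta-base, generic hyperparameters), not the authors' data-selection algorithm or released code.

```python
# Hypothetical sketch, not the authors' code: probe the abstract's claim that
# fine-tuning RoBERTa on ~2% of SNLI approaches full-data performance.
# Assumes the Hugging Face `datasets`/`transformers` stack and the public
# "snli" dataset on the Hub; sampling scheme, model size, and hyperparameters
# are illustrative choices, not the paper's setup.
import numpy as np
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

snli = load_dataset("snli")
# SNLI marks examples without a gold label with label == -1; drop them.
train = snli["train"].filter(lambda ex: ex["label"] != -1)
val = snli["validation"].filter(lambda ex: ex["label"] != -1)

# Keep roughly 2% of the training split via plain random sampling.
small_train = train.shuffle(seed=42).select(range(int(0.02 * len(train))))

tok = AutoTokenizer.from_pretrained("roberta-base")

def encode(batch):
    # Encode premise/hypothesis pairs; Trainer pads dynamically per batch.
    return tok(batch["premise"], batch["hypothesis"], truncation=True, max_length=128)

small_train = small_train.map(encode, batched=True)
val = val.map(encode, batched=True)

def accuracy(eval_pred):
    logits, labels = eval_pred
    return {"accuracy": float((np.argmax(logits, axis=-1) == labels).mean())}

model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)
args = TrainingArguments(output_dir="roberta-snli-2pct", num_train_epochs=3,
                         per_device_train_batch_size=32, learning_rate=2e-5)
trainer = Trainer(model=model, args=args, train_dataset=small_train,
                  eval_dataset=val, tokenizer=tok, compute_metrics=accuracy)
trainer.train()
print(trainer.evaluate())
```

Comparing the printed validation accuracy against a run that uses the full train split is the comparison the abstract's 2% figure refers to; the paper's proposed baseline algorithm for choosing which examples to keep would replace the random shuffle/select step here.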