@inproceedings{edwards-etal-2020-go,
title = "Go Simple and Pre-Train on Domain-Specific Corpora: On the Role of Training Data for Text Classification",
author = "Edwards, Aleksandra and
Camacho-Collados, Jose and
De Ribaupierre, H{\'e}l{\`e}ne and
Preece, Alun",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.481",
doi = "10.18653/v1/2020.coling-main.481",
pages = "5522--5529",
abstract = "Pre-trained language models provide the foundations for state-of-the-art performance across a wide range of natural language processing tasks, including text classification. However, most classification datasets assume a large amount labeled data, which is commonly not the case in practical settings. In particular, in this paper we compare the performance of a light-weight linear classifier based on word embeddings, i.e., fastText (Joulin et al., 2017), versus a pre-trained language model, i.e., BERT (Devlin et al., 2019), across a wide range of datasets and classification tasks. In general, results show the importance of domain-specific unlabeled data, both in the form of word embeddings or language models. As for the comparison, BERT outperforms all baselines in standard datasets with large training sets. However, in settings with small training datasets a simple method like fastText coupled with domain-specific word embeddings performs equally well or better than BERT, even when pre-trained on domain-specific data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="edwards-etal-2020-go">
<titleInfo>
<title>Go Simple and Pre-Train on Domain-Specific Corpora: On the Role of Training Data for Text Classification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aleksandra</namePart>
<namePart type="family">Edwards</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jose</namePart>
<namePart type="family">Camacho-Collados</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">De Ribaupierre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alun</namePart>
<namePart type="family">Preece</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Pre-trained language models provide the foundations for state-of-the-art performance across a wide range of natural language processing tasks, including text classification. However, most classification datasets assume a large amount of labeled data, which is commonly not the case in practical settings. In particular, in this paper we compare the performance of a lightweight linear classifier based on word embeddings, i.e., fastText (Joulin et al., 2017), versus a pre-trained language model, i.e., BERT (Devlin et al., 2019), across a wide range of datasets and classification tasks. In general, results show the importance of domain-specific unlabeled data, both in the form of word embeddings and language models. As for the comparison, BERT outperforms all baselines in standard datasets with large training sets. However, in settings with small training datasets, a simple method like fastText coupled with domain-specific word embeddings performs equally well or better than BERT, even when BERT is pre-trained on domain-specific data.</abstract>
<identifier type="citekey">edwards-etal-2020-go</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.481</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.481</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>5522</start>
<end>5529</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Go Simple and Pre-Train on Domain-Specific Corpora: On the Role of Training Data for Text Classification
%A Edwards, Aleksandra
%A Camacho-Collados, Jose
%A De Ribaupierre, Hélène
%A Preece, Alun
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F edwards-etal-2020-go
%X Pre-trained language models provide the foundations for state-of-the-art performance across a wide range of natural language processing tasks, including text classification. However, most classification datasets assume a large amount of labeled data, which is commonly not the case in practical settings. In particular, in this paper we compare the performance of a lightweight linear classifier based on word embeddings, i.e., fastText (Joulin et al., 2017), versus a pre-trained language model, i.e., BERT (Devlin et al., 2019), across a wide range of datasets and classification tasks. In general, results show the importance of domain-specific unlabeled data, both in the form of word embeddings and language models. As for the comparison, BERT outperforms all baselines in standard datasets with large training sets. However, in settings with small training datasets, a simple method like fastText coupled with domain-specific word embeddings performs equally well or better than BERT, even when BERT is pre-trained on domain-specific data.
%R 10.18653/v1/2020.coling-main.481
%U https://aclanthology.org/2020.coling-main.481
%U https://doi.org/10.18653/v1/2020.coling-main.481
%P 5522-5529
Markdown (Informal)
[Go Simple and Pre-Train on Domain-Specific Corpora: On the Role of Training Data for Text Classification](https://aclanthology.org/2020.coling-main.481) (Edwards et al., COLING 2020)
ACL
Aleksandra Edwards, Jose Camacho-Collados, Hélène De Ribaupierre, and Alun Preece. 2020. Go Simple and Pre-Train on Domain-Specific Corpora: On the Role of Training Data for Text Classification. In Proceedings of the 28th International Conference on Computational Linguistics, pages 5522–5529, Barcelona, Spain (Online). International Committee on Computational Linguistics.
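The abstract above contrasts a lightweight fastText classifier (optionally initialized with domain-specific word embeddings) against a pre-trained BERT model. As an informal illustration only, and not the authors' code, the following is a minimal sketch of the fastText side of that setup, assuming the `fasttext` Python package; the file names (`train.txt`, `valid.txt`, `domain_vectors.vec`) and hyperparameter values are hypothetical placeholders, not values reported in the paper.

```python
# Minimal sketch (hypothetical files and hyperparameters) of training a
# supervised fastText classifier with domain-specific pretrained embeddings.
import fasttext

# train.txt / valid.txt are assumed to hold one example per line in fastText's
# supervised format, e.g. "__label__positive  the service was great".
model = fasttext.train_supervised(
    input="train.txt",
    epoch=25,             # extra passes tend to help with small training sets
    lr=0.5,
    wordNgrams=2,         # include bigram features
    pretrainedVectors="domain_vectors.vec",  # domain-specific embeddings (.vec format)
    dim=300,              # must match the dimensionality of the pretrained vectors
)

# model.test returns (number of samples, precision@1, recall@1).
n, p_at_1, r_at_1 = model.test("valid.txt")
print(f"samples={n} precision@1={p_at_1:.3f} recall@1={r_at_1:.3f}")

# Predict the label of a new document.
labels, probs = model.predict("replace with a test sentence")
print(labels[0], probs[0])
```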