@inproceedings{dai-etal-2020-cost,
title = "Cost-effective Selection of Pretraining Data: A Case Study of Pretraining {BERT} on Social Media",
author = "Dai, Xiang and
Karimi, Sarvnaz and
Hachey, Ben and
Paris, Cecile",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.151",
doi = "10.18653/v1/2020.findings-emnlp.151",
pages = "1675--1681",
abstract = "Recent studies on domain-specific BERT models show that effectiveness on downstream tasks can be improved when models are pretrained on in-domain data. Often, the pretraining data used in these models are selected based on their subject matter, e.g., biology or computer science. Given the range of applications using social media text, and its unique language variety, we pretrain two models on tweets and forum text respectively, and empirically demonstrate the effectiveness of these two resources. In addition, we investigate how similarity measures can be used to nominate in-domain pretraining data. We publicly release our pretrained models at \url{https://bit.ly/35RpTf0}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="dai-etal-2020-cost">
    <titleInfo>
      <title>Cost-effective Selection of Pretraining Data: A Case Study of Pretraining BERT on Social Media</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Xiang</namePart>
      <namePart type="family">Dai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sarvnaz</namePart>
      <namePart type="family">Karimi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ben</namePart>
      <namePart type="family">Hachey</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Cecile</namePart>
      <namePart type="family">Paris</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Trevor</namePart>
        <namePart type="family">Cohn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yulan</namePart>
        <namePart type="family">He</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recent studies on domain-specific BERT models show that effectiveness on downstream tasks can be improved when models are pretrained on in-domain data. Often, the pretraining data used in these models are selected based on their subject matter, e.g., biology or computer science. Given the range of applications using social media text, and its unique language variety, we pretrain two models on tweets and forum text respectively, and empirically demonstrate the effectiveness of these two resources. In addition, we investigate how similarity measures can be used to nominate in-domain pretraining data. We publicly release our pretrained models at https://bit.ly/35RpTf0.</abstract>
    <identifier type="citekey">dai-etal-2020-cost</identifier>
    <identifier type="doi">10.18653/v1/2020.findings-emnlp.151</identifier>
    <location>
      <url>https://aclanthology.org/2020.findings-emnlp.151</url>
    </location>
    <part>
      <date>2020-11</date>
      <extent unit="page">
        <start>1675</start>
        <end>1681</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Cost-effective Selection of Pretraining Data: A Case Study of Pretraining BERT on Social Media
%A Dai, Xiang
%A Karimi, Sarvnaz
%A Hachey, Ben
%A Paris, Cecile
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F dai-etal-2020-cost
%X Recent studies on domain-specific BERT models show that effectiveness on downstream tasks can be improved when models are pretrained on in-domain data. Often, the pretraining data used in these models are selected based on their subject matter, e.g., biology or computer science. Given the range of applications using social media text, and its unique language variety, we pretrain two models on tweets and forum text respectively, and empirically demonstrate the effectiveness of these two resources. In addition, we investigate how similarity measures can be used to nominate in-domain pretraining data. We publicly release our pretrained models at https://bit.ly/35RpTf0.
%R 10.18653/v1/2020.findings-emnlp.151
%U https://aclanthology.org/2020.findings-emnlp.151
%U https://doi.org/10.18653/v1/2020.findings-emnlp.151
%P 1675-1681
Markdown (Informal)
[Cost-effective Selection of Pretraining Data: A Case Study of Pretraining BERT on Social Media](https://aclanthology.org/2020.findings-emnlp.151) (Dai et al., Findings 2020)
ACL
Xiang Dai, Sarvnaz Karimi, Ben Hachey, and Cecile Paris. 2020. [Cost-effective Selection of Pretraining Data: A Case Study of Pretraining BERT on Social Media](https://aclanthology.org/2020.findings-emnlp.151). In *Findings of the Association for Computational Linguistics: EMNLP 2020*, pages 1675–1681, Online. Association for Computational Linguistics.