@inproceedings{yamaguchi-etal-2021-frustratingly,
title = "Frustratingly Simple Pretraining Alternatives to Masked Language Modeling",
author = "Yamaguchi, Atsuki and
Chrysostomou, George and
Margatina, Katerina and
Aletras, Nikolaos",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.249",
doi = "10.18653/v1/2021.emnlp-main.249",
pages = "3116--3125",
abstract = "Masked language modeling (MLM), a self-supervised pretraining objective, is widely used in natural language processing for learning text representations. MLM trains a model to predict a random sample of input tokens that have been replaced by a [MASK] placeholder in a multi-class setting over the entire vocabulary. When pretraining, it is common to use alongside MLM other auxiliary objectives on the token or sequence level to improve downstream performance (e.g. next sentence prediction). However, no previous work so far has attempted in examining whether other simpler linguistically intuitive or not objectives can be used standalone as main pretraining objectives. In this paper, we explore five simple pretraining objectives based on token-level classification tasks as replacements of MLM. Empirical results on GLUE and SQUAD show that our proposed methods achieve comparable or better performance to MLM using a BERT-BASE architecture. We further validate our methods using smaller models, showing that pretraining a model with 41{\%} of the BERT-BASE{'}s parameters, BERT-MEDIUM results in only a 1{\%} drop in GLUE scores with our best objective.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="yamaguchi-etal-2021-frustratingly">
    <titleInfo>
      <title>Frustratingly Simple Pretraining Alternatives to Masked Language Modeling</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Atsuki</namePart>
      <namePart type="family">Yamaguchi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">George</namePart>
      <namePart type="family">Chrysostomou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Katerina</namePart>
      <namePart type="family">Margatina</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nikolaos</namePart>
      <namePart type="family">Aletras</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Masked language modeling (MLM), a self-supervised pretraining objective, is widely used in natural language processing for learning text representations. MLM trains a model to predict a random sample of input tokens that have been replaced by a [MASK] placeholder, in a multi-class setting over the entire vocabulary. When pretraining, it is common to use other auxiliary objectives at the token or sequence level alongside MLM to improve downstream performance (e.g., next sentence prediction). However, no previous work has examined whether simpler objectives, linguistically intuitive or not, can serve standalone as main pretraining objectives. In this paper, we explore five simple pretraining objectives based on token-level classification tasks as replacements for MLM. Empirical results on GLUE and SQuAD show that our proposed methods achieve performance comparable to or better than MLM using a BERT-BASE architecture. We further validate our methods using smaller models, showing that pretraining a model with 41% of BERT-BASE’s parameters (BERT-MEDIUM) results in only a 1% drop in GLUE scores with our best objective.</abstract>
<identifier type="citekey">yamaguchi-etal-2021-frustratingly</identifier>
<identifier type="doi">10.18653/v1/2021.emnlp-main.249</identifier>
<location>
<url>https://aclanthology.org/2021.emnlp-main.249</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>3116</start>
<end>3125</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Frustratingly Simple Pretraining Alternatives to Masked Language Modeling
%A Yamaguchi, Atsuki
%A Chrysostomou, George
%A Margatina, Katerina
%A Aletras, Nikolaos
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F yamaguchi-etal-2021-frustratingly
%X Masked language modeling (MLM), a self-supervised pretraining objective, is widely used in natural language processing for learning text representations. MLM trains a model to predict a random sample of input tokens that have been replaced by a [MASK] placeholder, in a multi-class setting over the entire vocabulary. When pretraining, it is common to use other auxiliary objectives at the token or sequence level alongside MLM to improve downstream performance (e.g., next sentence prediction). However, no previous work has examined whether simpler objectives, linguistically intuitive or not, can serve standalone as main pretraining objectives. In this paper, we explore five simple pretraining objectives based on token-level classification tasks as replacements for MLM. Empirical results on GLUE and SQuAD show that our proposed methods achieve performance comparable to or better than MLM using a BERT-BASE architecture. We further validate our methods using smaller models, showing that pretraining a model with 41% of BERT-BASE’s parameters (BERT-MEDIUM) results in only a 1% drop in GLUE scores with our best objective.
%R 10.18653/v1/2021.emnlp-main.249
%U https://aclanthology.org/2021.emnlp-main.249
%U https://doi.org/10.18653/v1/2021.emnlp-main.249
%P 3116-3125
Markdown (Informal)
[Frustratingly Simple Pretraining Alternatives to Masked Language Modeling](https://aclanthology.org/2021.emnlp-main.249) (Yamaguchi et al., EMNLP 2021)
ACL
Atsuki Yamaguchi, George Chrysostomou, Katerina Margatina, and Nikolaos Aletras. 2021. Frustratingly Simple Pretraining Alternatives to Masked Language Modeling. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 3116–3125, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.
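For readers who want a concrete picture of the masked language modeling setup that the abstract contrasts its alternatives against, here is a minimal, self-contained sketch. It is an illustration only, not the paper's code: the [MASK] token id, the 15% masking rate (BERT's convention), and the toy token ids are assumptions made for the example.

```python
import random

MASK_ID = 103      # assumed [MASK] id (BERT-style vocabulary); placeholder only
MASK_PROB = 0.15   # BERT's conventional masking rate, assumed here for illustration


def mask_tokens(token_ids, mask_prob=MASK_PROB, mask_id=MASK_ID, seed=0):
    """Replace a random sample of token ids with [MASK] and keep the original
    ids as labels; -100 marks positions that are not predicted."""
    rng = random.Random(seed)
    inputs, labels = [], []
    for tok in token_ids:
        if rng.random() < mask_prob:
            inputs.append(mask_id)   # the model must recover `tok` over the full vocabulary
            labels.append(tok)
        else:
            inputs.append(tok)
            labels.append(-100)      # ignored by the loss
    return inputs, labels


# Example: mask a toy sequence of token ids.
print(mask_tokens([2023, 2003, 1037, 7099, 6251, 1012]))
```

The five objectives studied in the paper replace this vocabulary-sized prediction head with simpler token-level classification targets; see the paper itself for their definitions.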