@inproceedings{chiang-yogatama-2023-distributional,
title = "The Distributional Hypothesis Does Not Fully Explain the Benefits of Masked Language Model Pretraining",
author = "Chiang, Ting-Rui and
Yogatama, Dani",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.637",
doi = "10.18653/v1/2023.emnlp-main.637",
pages = "10305--10321",
abstract = "We analyze the masked language modeling pretraining objective function from the perspective of the Distributional Hypothesis. We investigate whether the better sample efficiency and the better generalization capability of models pretrained with masked language modeling can be attributed to the semantic similarity encoded in the pretraining data{'}s distributional property. Via a synthetic dataset, our analysis suggests that distributional property indeed leads to the better sample efficiency of pretrained masked language models, but does not fully explain the generalization capability. We also conduct an analysis over two real-world datasets and demonstrate that the distributional property does not explain the generalization ability of pretrained natural language models either. Our results illustrate our limited understanding of model pretraining and provide future research directions.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chiang-yogatama-2023-distributional">
<titleInfo>
<title>The Distributional Hypothesis Does Not Fully Explain the Benefits of Masked Language Model Pretraining</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ting-Rui</namePart>
<namePart type="family">Chiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dani</namePart>
<namePart type="family">Yogatama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We analyze the masked language modeling pretraining objective function from the perspective of the Distributional Hypothesis. We investigate whether the better sample efficiency and the better generalization capability of models pretrained with masked language modeling can be attributed to the semantic similarity encoded in the pretraining data’s distributional property. Via a synthetic dataset, our analysis suggests that distributional property indeed leads to the better sample efficiency of pretrained masked language models, but does not fully explain the generalization capability. We also conduct an analysis over two real-world datasets and demonstrate that the distributional property does not explain the generalization ability of pretrained natural language models either. Our results illustrate our limited understanding of model pretraining and provide future research directions.</abstract>
<identifier type="citekey">chiang-yogatama-2023-distributional</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.637</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.637</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>10305</start>
<end>10321</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T The Distributional Hypothesis Does Not Fully Explain the Benefits of Masked Language Model Pretraining
%A Chiang, Ting-Rui
%A Yogatama, Dani
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F chiang-yogatama-2023-distributional
%X We analyze the masked language modeling pretraining objective function from the perspective of the Distributional Hypothesis. We investigate whether the better sample efficiency and the better generalization capability of models pretrained with masked language modeling can be attributed to the semantic similarity encoded in the pretraining data’s distributional property. Via a synthetic dataset, our analysis suggests that distributional property indeed leads to the better sample efficiency of pretrained masked language models, but does not fully explain the generalization capability. We also conduct an analysis over two real-world datasets and demonstrate that the distributional property does not explain the generalization ability of pretrained natural language models either. Our results illustrate our limited understanding of model pretraining and provide future research directions.
%R 10.18653/v1/2023.emnlp-main.637
%U https://aclanthology.org/2023.emnlp-main.637
%U https://doi.org/10.18653/v1/2023.emnlp-main.637
%P 10305-10321

Markdown (Informal)
[The Distributional Hypothesis Does Not Fully Explain the Benefits of Masked Language Model Pretraining](https://aclanthology.org/2023.emnlp-main.637) (Chiang & Yogatama, EMNLP 2023)
ACL
Ting-Rui Chiang and Dani Yogatama. 2023. The Distributional Hypothesis Does Not Fully Explain the Benefits of Masked Language Model Pretraining. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 10305–10321, Singapore. Association for Computational Linguistics.