@inproceedings{yamakoshi-etal-2022-probing,
title = "Probing {BERT}`s priors with serial reproduction chains",
author = "Yamakoshi, Takateru and
Griffiths, Thomas and
Hawkins, Robert",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-acl.314/",
doi = "10.18653/v1/2022.findings-acl.314",
pages = "3977--3992",
abstract = "Sampling is a promising bottom-up method for exposing what generative models have learned about language, but it remains unclear how to generate representative samples from popular masked language models (MLMs) like BERT. The MLM objective yields a dependency network with no guarantee of consistent conditional distributions, posing a problem for naive approaches. Drawing from theories of iterated learning in cognitive science, we explore the use of \textit{serial reproduction chains} to sample from BERT`s priors. In particular, we observe that a unique and consistent estimator of the ground-truth joint distribution is given by a Generative Stochastic Network (GSN) sampler, which randomly selects which token to mask and reconstruct on each step. We show that the lexical and syntactic statistics of sentences from GSN chains closely match the ground-truth corpus distribution and perform better than other methods in a large corpus of naturalness judgments. Our findings establish a firmer theoretical foundation for bottom-up probing and highlight richer deviations from human priors."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yamakoshi-etal-2022-probing">
<titleInfo>
<title>Probing BERT's priors with serial reproduction chains</title>
</titleInfo>
<name type="personal">
<namePart type="given">Takateru</namePart>
<namePart type="family">Yamakoshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Griffiths</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Hawkins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Sampling is a promising bottom-up method for exposing what generative models have learned about language, but it remains unclear how to generate representative samples from popular masked language models (MLMs) like BERT. The MLM objective yields a dependency network with no guarantee of consistent conditional distributions, posing a problem for naive approaches. Drawing from theories of iterated learning in cognitive science, we explore the use of serial reproduction chains to sample from BERT's priors. In particular, we observe that a unique and consistent estimator of the ground-truth joint distribution is given by a Generative Stochastic Network (GSN) sampler, which randomly selects which token to mask and reconstruct on each step. We show that the lexical and syntactic statistics of sentences from GSN chains closely match the ground-truth corpus distribution and perform better than other methods in a large corpus of naturalness judgments. Our findings establish a firmer theoretical foundation for bottom-up probing and highlight richer deviations from human priors.</abstract>
<identifier type="citekey">yamakoshi-etal-2022-probing</identifier>
<identifier type="doi">10.18653/v1/2022.findings-acl.314</identifier>
<location>
<url>https://aclanthology.org/2022.findings-acl.314/</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>3977</start>
<end>3992</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Probing BERT's priors with serial reproduction chains
%A Yamakoshi, Takateru
%A Griffiths, Thomas
%A Hawkins, Robert
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Findings of the Association for Computational Linguistics: ACL 2022
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F yamakoshi-etal-2022-probing
%X Sampling is a promising bottom-up method for exposing what generative models have learned about language, but it remains unclear how to generate representative samples from popular masked language models (MLMs) like BERT. The MLM objective yields a dependency network with no guarantee of consistent conditional distributions, posing a problem for naive approaches. Drawing from theories of iterated learning in cognitive science, we explore the use of serial reproduction chains to sample from BERT's priors. In particular, we observe that a unique and consistent estimator of the ground-truth joint distribution is given by a Generative Stochastic Network (GSN) sampler, which randomly selects which token to mask and reconstruct on each step. We show that the lexical and syntactic statistics of sentences from GSN chains closely match the ground-truth corpus distribution and perform better than other methods in a large corpus of naturalness judgments. Our findings establish a firmer theoretical foundation for bottom-up probing and highlight richer deviations from human priors.
%R 10.18653/v1/2022.findings-acl.314
%U https://aclanthology.org/2022.findings-acl.314/
%U https://doi.org/10.18653/v1/2022.findings-acl.314
%P 3977-3992
Markdown (Informal)
[Probing BERT’s priors with serial reproduction chains](https://aclanthology.org/2022.findings-acl.314/) (Yamakoshi et al., Findings 2022)
ACL
Takateru Yamakoshi, Thomas Griffiths, and Robert Hawkins. 2022. Probing BERT’s priors with serial reproduction chains. In Findings of the Association for Computational Linguistics: ACL 2022, pages 3977–3992, Dublin, Ireland. Association for Computational Linguistics.
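
As a companion to the GSN sampler described in the abstract, here is a minimal sketch of a serial reproduction chain over a fixed-length sentence: at each step one position is chosen uniformly at random, masked, and resampled from BERT's conditional distribution given the rest of the sentence. It assumes the Hugging Face `transformers` and `torch` packages; the checkpoint name, chain length, temperature, and seed sentence are illustrative assumptions, not the authors' exact experimental configuration.

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

MODEL_NAME = "bert-base-uncased"  # assumed checkpoint; any HF masked LM works
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
model.eval()


@torch.no_grad()
def gsn_chain(seed_sentence: str, n_steps: int = 500, temperature: float = 1.0) -> str:
    """GSN-style serial reproduction chain: repeatedly pick a random token
    position, mask it, and resample it from BERT's conditional distribution."""
    enc = tokenizer(seed_sentence, return_tensors="pt")
    input_ids = enc["input_ids"].clone()  # shape (1, seq_len), incl. [CLS]/[SEP]
    seq_len = input_ids.shape[1]

    for _ in range(n_steps):
        # Choose a position uniformly at random, skipping [CLS] (0) and [SEP] (last).
        pos = torch.randint(1, seq_len - 1, (1,)).item()

        # Mask that position and get BERT's conditional distribution for it.
        masked = input_ids.clone()
        masked[0, pos] = tokenizer.mask_token_id
        logits = model(input_ids=masked).logits[0, pos] / temperature
        probs = torch.softmax(logits, dim=-1)

        # Sample a replacement token and write it back into the sentence.
        input_ids[0, pos] = torch.multinomial(probs, num_samples=1).item()

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)


# Example: a sample drawn after many resampling steps.
print(gsn_chain("the cat sat on the mat .", n_steps=500))
```

In the paper's framing, the stationary distribution of such a chain is a consistent estimator of the joint distribution the MLM has learned, so sentences drawn after a long burn-in can be compared against corpus statistics and human naturalness judgments.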