@inproceedings{hua-etal-2021-noise,
title = "Noise Stability Regularization for Improving {BERT} Fine-tuning",
author = "Hua, Hang and
Li, Xingjian and
Dou, Dejing and
Xu, Chengzhong and
Luo, Jiebo",
editor = "Toutanova, Kristina and
Rumshisky, Anna and
Zettlemoyer, Luke and
Hakkani-Tur, Dilek and
Beltagy, Iz and
Bethard, Steven and
Cotterell, Ryan and
Chakraborty, Tanmoy and
Zhou, Yichao",
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-main.258",
doi = "10.18653/v1/2021.naacl-main.258",
pages = "3229--3241",
abstract = "Fine-tuning pre-trained language models suchas BERT has become a common practice dom-inating leaderboards across various NLP tasks. Despite its recent success and wide adoption,this process is unstable when there are onlya small number of training samples available. The brittleness of this process is often reflectedby the sensitivity to random seeds. In this pa-per, we propose to tackle this problem basedon the noise stability property of deep nets,which is investigated in recent literature (Aroraet al., 2018; Sanyal et al., 2020). Specifically,we introduce a novel and effective regulariza-tion method to improve fine-tuning on NLPtasks, referred to asLayer-wiseNoiseStabilityRegularization (LNSR). We extend the theo-ries about adding noise to the input and provethat our method gives a stabler regularizationeffect. We provide supportive evidence by ex-perimentally confirming that well-performingmodels show a low sensitivity to noise andfine-tuning with LNSR exhibits clearly bet-ter generalizability and stability. Furthermore,our method also demonstrates advantages overother state-of-the-art algorithms including L2-SP (Li et al., 2018), Mixout (Lee et al., 2020)and SMART (Jiang et al., 20)",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hua-etal-2021-noise">
<titleInfo>
<title>Noise Stability Regularization for Improving BERT Fine-tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hang</namePart>
<namePart type="family">Hua</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingjian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dejing</namePart>
<namePart type="family">Dou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengzhong</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiebo</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kristina</namePart>
<namePart type="family">Toutanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rumshisky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Zettlemoyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iz</namePart>
<namePart type="family">Beltagy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yichao</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Fine-tuning pre-trained language models suchas BERT has become a common practice dom-inating leaderboards across various NLP tasks. Despite its recent success and wide adoption,this process is unstable when there are onlya small number of training samples available. The brittleness of this process is often reflectedby the sensitivity to random seeds. In this pa-per, we propose to tackle this problem basedon the noise stability property of deep nets,which is investigated in recent literature (Aroraet al., 2018; Sanyal et al., 2020). Specifically,we introduce a novel and effective regulariza-tion method to improve fine-tuning on NLPtasks, referred to asLayer-wiseNoiseStabilityRegularization (LNSR). We extend the theo-ries about adding noise to the input and provethat our method gives a stabler regularizationeffect. We provide supportive evidence by ex-perimentally confirming that well-performingmodels show a low sensitivity to noise andfine-tuning with LNSR exhibits clearly bet-ter generalizability and stability. Furthermore,our method also demonstrates advantages overother state-of-the-art algorithms including L2-SP (Li et al., 2018), Mixout (Lee et al., 2020)and SMART (Jiang et al., 20)</abstract>
<identifier type="citekey">hua-etal-2021-noise</identifier>
<identifier type="doi">10.18653/v1/2021.naacl-main.258</identifier>
<location>
<url>https://aclanthology.org/2021.naacl-main.258</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>3229</start>
<end>3241</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Noise Stability Regularization for Improving BERT Fine-tuning
%A Hua, Hang
%A Li, Xingjian
%A Dou, Dejing
%A Xu, Chengzhong
%A Luo, Jiebo
%Y Toutanova, Kristina
%Y Rumshisky, Anna
%Y Zettlemoyer, Luke
%Y Hakkani-Tur, Dilek
%Y Beltagy, Iz
%Y Bethard, Steven
%Y Cotterell, Ryan
%Y Chakraborty, Tanmoy
%Y Zhou, Yichao
%S Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F hua-etal-2021-noise
%X Fine-tuning pre-trained language models suchas BERT has become a common practice dom-inating leaderboards across various NLP tasks. Despite its recent success and wide adoption,this process is unstable when there are onlya small number of training samples available. The brittleness of this process is often reflectedby the sensitivity to random seeds. In this pa-per, we propose to tackle this problem basedon the noise stability property of deep nets,which is investigated in recent literature (Aroraet al., 2018; Sanyal et al., 2020). Specifically,we introduce a novel and effective regulariza-tion method to improve fine-tuning on NLPtasks, referred to asLayer-wiseNoiseStabilityRegularization (LNSR). We extend the theo-ries about adding noise to the input and provethat our method gives a stabler regularizationeffect. We provide supportive evidence by ex-perimentally confirming that well-performingmodels show a low sensitivity to noise andfine-tuning with LNSR exhibits clearly bet-ter generalizability and stability. Furthermore,our method also demonstrates advantages overother state-of-the-art algorithms including L2-SP (Li et al., 2018), Mixout (Lee et al., 2020)and SMART (Jiang et al., 20)
%R 10.18653/v1/2021.naacl-main.258
%U https://aclanthology.org/2021.naacl-main.258
%U https://doi.org/10.18653/v1/2021.naacl-main.258
%P 3229-3241
Markdown (Informal)
[Noise Stability Regularization for Improving BERT Fine-tuning](https://aclanthology.org/2021.naacl-main.258) (Hua et al., NAACL 2021)
ACL
- Hang Hua, Xingjian Li, Dejing Dou, Chengzhong Xu, and Jiebo Luo. 2021. Noise Stability Regularization for Improving BERT Fine-tuning. In Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 3229–3241, Online. Association for Computational Linguistics.