@inproceedings{leung-tan-2022-efficient,
title = "Efficient Semi-supervised Consistency Training for Natural Language Understanding",
author = "Leung, George and
Tan, Joshua",
editor = "Loukina, Anastassia and
Gangadharaiah, Rashmi and
Min, Bonan",
booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Track",
month = jul,
year = "2022",
address = "Hybrid: Seattle, Washington + Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.naacl-industry.11",
doi = "10.18653/v1/2022.naacl-industry.11",
pages = "86--93",
abstract = "Manually labeled training data is expensive, noisy, and often scarce, such as when developing new features or localizing existing features for a new region. In cases where labeled data is limited but unlabeled data is abundant, semi-supervised learning methods such as consistency training can be used to improve model performance, by training models to output consistent predictions between original and augmented versions of unlabeled data. In this work, we explore different data augmentation methods for consistency training (CT) on Natural Language Understanding (NLU) domain classification (DC) in the limited labeled data regime. We explore three types of augmentation techniques (human paraphrasing, back-translation, and dropout) for unlabeled data and train DC models to jointly minimize both the supervised loss and the consistency loss on unlabeled data. Our results demonstrate that DC models trained with CT methods and dropout based augmentation on only 0.1{\%} (2,998 instances) of labeled data with the remainder as unlabeled can achieve a top-1 relative accuracy reduction of 12.25{\%} compared to fully supervised model trained with 100{\%} of labeled data, outperforming fully supervised models trained on 10x that amount of labeled data. The dropout-based augmentation achieves similar performance compare to back-translation based augmentation with much less computational resources. This paves the way for applications of using large scale unlabeled data for semi-supervised learning in production NLU systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="leung-tan-2022-efficient">
<titleInfo>
<title>Efficient Semi-supervised Consistency Training for Natural Language Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">George</namePart>
<namePart type="family">Leung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joshua</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anastassia</namePart>
<namePart type="family">Loukina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rashmi</namePart>
<namePart type="family">Gangadharaiah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bonan</namePart>
<namePart type="family">Min</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hybrid: Seattle, Washington + Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Manually labeled training data is expensive, noisy, and often scarce, such as when developing new features or localizing existing features for a new region. In cases where labeled data is limited but unlabeled data is abundant, semi-supervised learning methods such as consistency training can be used to improve model performance, by training models to output consistent predictions between original and augmented versions of unlabeled data. In this work, we explore different data augmentation methods for consistency training (CT) on Natural Language Understanding (NLU) domain classification (DC) in the limited labeled data regime. We explore three types of augmentation techniques (human paraphrasing, back-translation, and dropout) for unlabeled data and train DC models to jointly minimize both the supervised loss and the consistency loss on unlabeled data. Our results demonstrate that DC models trained with CT methods and dropout-based augmentation on only 0.1% (2,998 instances) of labeled data, with the remainder used as unlabeled data, can achieve a top-1 relative accuracy reduction of 12.25% compared to a fully supervised model trained with 100% of the labeled data, outperforming fully supervised models trained on 10x that amount of labeled data. The dropout-based augmentation achieves performance similar to back-translation-based augmentation while requiring far fewer computational resources. This paves the way for using large-scale unlabeled data for semi-supervised learning in production NLU systems.</abstract>
<identifier type="citekey">leung-tan-2022-efficient</identifier>
<identifier type="doi">10.18653/v1/2022.naacl-industry.11</identifier>
<location>
<url>https://aclanthology.org/2022.naacl-industry.11</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>86</start>
<end>93</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Efficient Semi-supervised Consistency Training for Natural Language Understanding
%A Leung, George
%A Tan, Joshua
%Y Loukina, Anastassia
%Y Gangadharaiah, Rashmi
%Y Min, Bonan
%S Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Track
%D 2022
%8 July
%I Association for Computational Linguistics
%C Hybrid: Seattle, Washington + Online
%F leung-tan-2022-efficient
%X Manually labeled training data is expensive, noisy, and often scarce, such as when developing new features or localizing existing features for a new region. In cases where labeled data is limited but unlabeled data is abundant, semi-supervised learning methods such as consistency training can be used to improve model performance, by training models to output consistent predictions between original and augmented versions of unlabeled data. In this work, we explore different data augmentation methods for consistency training (CT) on Natural Language Understanding (NLU) domain classification (DC) in the limited labeled data regime. We explore three types of augmentation techniques (human paraphrasing, back-translation, and dropout) for unlabeled data and train DC models to jointly minimize both the supervised loss and the consistency loss on unlabeled data. Our results demonstrate that DC models trained with CT methods and dropout-based augmentation on only 0.1% (2,998 instances) of labeled data, with the remainder used as unlabeled data, can achieve a top-1 relative accuracy reduction of 12.25% compared to a fully supervised model trained with 100% of the labeled data, outperforming fully supervised models trained on 10x that amount of labeled data. The dropout-based augmentation achieves performance similar to back-translation-based augmentation while requiring far fewer computational resources. This paves the way for using large-scale unlabeled data for semi-supervised learning in production NLU systems.
%R 10.18653/v1/2022.naacl-industry.11
%U https://aclanthology.org/2022.naacl-industry.11
%U https://doi.org/10.18653/v1/2022.naacl-industry.11
%P 86-93
Markdown (Informal)
[Efficient Semi-supervised Consistency Training for Natural Language Understanding](https://aclanthology.org/2022.naacl-industry.11) (Leung & Tan, NAACL 2022)
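The abstract describes training a domain classifier to jointly minimize a supervised loss on a small labeled set and a consistency loss between predictions on original and dropout-perturbed views of unlabeled data. The following is a minimal, illustrative PyTorch-style sketch of one such training step; it is not the authors' implementation, and the `model`, batch variables, and `consistency_weight` hyperparameter are assumptions introduced here for clarity.

```python
# Illustrative sketch of consistency training with dropout-based augmentation
# (assumed setup, not the paper's code). `model` is any classifier whose
# forward pass uses dropout, so two passes over the same input act as two
# stochastically augmented views.
import torch
import torch.nn.functional as F

def training_step(model, labeled_batch, unlabeled_batch, consistency_weight=1.0):
    model.train()  # keep dropout active for both forward passes

    # Supervised loss on the small labeled set.
    x_lab, y_lab = labeled_batch
    supervised_loss = F.cross_entropy(model(x_lab), y_lab)

    # Consistency loss on unlabeled data: two dropout-perturbed forward
    # passes over the same inputs should yield similar distributions.
    x_unl = unlabeled_batch
    logits_a = model(x_unl)
    logits_b = model(x_unl)
    consistency_loss = F.kl_div(
        F.log_softmax(logits_a, dim=-1),
        F.softmax(logits_b.detach(), dim=-1),  # treat one pass as the target
        reduction="batchmean",
    )

    # Joint objective: supervised loss plus weighted consistency loss.
    return supervised_loss + consistency_weight * consistency_loss
```

UDA-style consistency training often adds refinements such as sharpening or confidence-thresholding the unlabeled targets; the sketch above omits those details and shows only the joint-loss structure the abstract refers to.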