@inproceedings{jin-etal-2021-seed,
title = "Seed Word Selection for Weakly-Supervised Text Classification with Unsupervised Error Estimation",
author = "Jin, Yiping and
Bhatia, Akshay and
Wanvarie, Dittaya",
editor = "Durmus, Esin and
Gupta, Vivek and
Liu, Nelson and
Peng, Nanyun and
Su, Yu",
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Student Research Workshop",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-srw.14",
doi = "10.18653/v1/2021.naacl-srw.14",
pages = "112--118",
abstract = "Weakly-supervised text classification aims to induce text classifiers from only a few user-provided seed words. The vast majority of previous work assumes high-quality seed words are given. However, the expert-annotated seed words are sometimes non-trivial to come up with. Furthermore, in the weakly-supervised learning setting, we do not have any labeled document to measure the seed words{'} efficacy, making the seed word selection process {``}a walk in the dark{''}. In this work, we remove the need for expert-curated seed words by first mining (noisy) candidate seed words associated with the category names. We then train interim models with individual candidate seed words. Lastly, we estimate the interim models{'} error rate in an unsupervised manner. The seed words that yield the lowest estimated error rates are added to the final seed word set. A comprehensive evaluation of six binary classification tasks on four popular datasets demonstrates that the proposed method outperforms a baseline using only category name seed words and obtained comparable performance as a counterpart using expert-annotated seed words.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jin-etal-2021-seed">
<titleInfo>
<title>Seed Word Selection for Weakly-Supervised Text Classification with Unsupervised Error Estimation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yiping</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akshay</namePart>
<namePart type="family">Bhatia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dittaya</namePart>
<namePart type="family">Wanvarie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Esin</namePart>
<namePart type="family">Durmus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nelson</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nanyun</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Weakly-supervised text classification aims to induce text classifiers from only a few user-provided seed words. The vast majority of previous work assumes high-quality seed words are given. However, the expert-annotated seed words are sometimes non-trivial to come up with. Furthermore, in the weakly-supervised learning setting, we do not have any labeled document to measure the seed words’ efficacy, making the seed word selection process “a walk in the dark”. In this work, we remove the need for expert-curated seed words by first mining (noisy) candidate seed words associated with the category names. We then train interim models with individual candidate seed words. Lastly, we estimate the interim models’ error rate in an unsupervised manner. The seed words that yield the lowest estimated error rates are added to the final seed word set. A comprehensive evaluation of six binary classification tasks on four popular datasets demonstrates that the proposed method outperforms a baseline using only category name seed words and obtained comparable performance as a counterpart using expert-annotated seed words.</abstract>
<identifier type="citekey">jin-etal-2021-seed</identifier>
<identifier type="doi">10.18653/v1/2021.naacl-srw.14</identifier>
<location>
<url>https://aclanthology.org/2021.naacl-srw.14</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>112</start>
<end>118</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Seed Word Selection for Weakly-Supervised Text Classification with Unsupervised Error Estimation
%A Jin, Yiping
%A Bhatia, Akshay
%A Wanvarie, Dittaya
%Y Durmus, Esin
%Y Gupta, Vivek
%Y Liu, Nelson
%Y Peng, Nanyun
%Y Su, Yu
%S Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Student Research Workshop
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F jin-etal-2021-seed
%X Weakly-supervised text classification aims to induce text classifiers from only a few user-provided seed words. The vast majority of previous work assumes high-quality seed words are given. However, the expert-annotated seed words are sometimes non-trivial to come up with. Furthermore, in the weakly-supervised learning setting, we do not have any labeled document to measure the seed words’ efficacy, making the seed word selection process “a walk in the dark”. In this work, we remove the need for expert-curated seed words by first mining (noisy) candidate seed words associated with the category names. We then train interim models with individual candidate seed words. Lastly, we estimate the interim models’ error rate in an unsupervised manner. The seed words that yield the lowest estimated error rates are added to the final seed word set. A comprehensive evaluation of six binary classification tasks on four popular datasets demonstrates that the proposed method outperforms a baseline using only category name seed words and obtained comparable performance as a counterpart using expert-annotated seed words.
%R 10.18653/v1/2021.naacl-srw.14
%U https://aclanthology.org/2021.naacl-srw.14
%U https://doi.org/10.18653/v1/2021.naacl-srw.14
%P 112-118
Markdown (Informal)
[Seed Word Selection for Weakly-Supervised Text Classification with Unsupervised Error Estimation](https://aclanthology.org/2021.naacl-srw.14) (Jin et al., NAACL 2021)
ACL