@inproceedings{wang-etal-2023-benchmark,
title = "A Benchmark on Extremely Weakly Supervised Text Classification: Reconcile Seed Matching and Prompting Approaches",
author = "Wang, Zihan and
Wang, Tianle and
Mekala, Dheeraj and
Shang, Jingbo",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.244",
doi = "10.18653/v1/2023.findings-acl.244",
pages = "3944--3962",
abstract = "Extremely Weakly Supervised Text Classification (XWS-TC) refers to text classification based on minimal high-level human guidance, such as a few label-indicative seed words or classification instructions. There are two mainstream approaches for XWS-TC, however, never being rigorously compared: (1) training classifiers based on pseudo-labels generated by (softly) matching seed words (Seed) and (2) prompting (and calibrating) language models using classification instruction (and raw texts) to decode label words (Prompt). This paper presents the first XWS-TC benchmark to compare the two approaches on fair grounds, where the datasets, supervisions, and hyperparameter choices are standardized across methods. Our benchmarking results suggest that (1) Both Seed and Prompt approaches are competitive and there is no clear winner; (2) Seed is empirically more tolerant than Prompt to human guidance (e.g., seed words, classification instructions, and label words) changes; (3) Seed is empirically more selective than Prompt to the pre-trained language models; (4) Recent Seed and Prompt methods have close connections and a clustering post-processing step based on raw in-domain texts is a strong performance booster to both. We hope this benchmark serves as a guideline in selecting XWS-TC methods in different scenarios and stimulate interest in developing guidance- and model-robust XWS-TC methods.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2023-benchmark">
<titleInfo>
<title>A Benchmark on Extremely Weakly Supervised Text Classification: Reconcile Seed Matching and Prompting Approaches</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zihan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianle</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dheeraj</namePart>
<namePart type="family">Mekala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingbo</namePart>
<namePart type="family">Shang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Extremely Weakly Supervised Text Classification (XWS-TC) refers to text classification based on minimal high-level human guidance, such as a few label-indicative seed words or classification instructions. There are two mainstream approaches for XWS-TC, however, never being rigorously compared: (1) training classifiers based on pseudo-labels generated by (softly) matching seed words (Seed) and (2) prompting (and calibrating) language models using classification instruction (and raw texts) to decode label words (Prompt). This paper presents the first XWS-TC benchmark to compare the two approaches on fair grounds, where the datasets, supervisions, and hyperparameter choices are standardized across methods. Our benchmarking results suggest that (1) Both Seed and Prompt approaches are competitive and there is no clear winner; (2) Seed is empirically more tolerant than Prompt to human guidance (e.g., seed words, classification instructions, and label words) changes; (3) Seed is empirically more selective than Prompt to the pre-trained language models; (4) Recent Seed and Prompt methods have close connections and a clustering post-processing step based on raw in-domain texts is a strong performance booster to both. We hope this benchmark serves as a guideline in selecting XWS-TC methods in different scenarios and stimulate interest in developing guidance- and model-robust XWS-TC methods.</abstract>
<identifier type="citekey">wang-etal-2023-benchmark</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.244</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.244</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>3944</start>
<end>3962</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Benchmark on Extremely Weakly Supervised Text Classification: Reconcile Seed Matching and Prompting Approaches
%A Wang, Zihan
%A Wang, Tianle
%A Mekala, Dheeraj
%A Shang, Jingbo
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F wang-etal-2023-benchmark
%X Extremely Weakly Supervised Text Classification (XWS-TC) refers to text classification based on minimal high-level human guidance, such as a few label-indicative seed words or classification instructions. There are two mainstream approaches for XWS-TC, however, never being rigorously compared: (1) training classifiers based on pseudo-labels generated by (softly) matching seed words (Seed) and (2) prompting (and calibrating) language models using classification instruction (and raw texts) to decode label words (Prompt). This paper presents the first XWS-TC benchmark to compare the two approaches on fair grounds, where the datasets, supervisions, and hyperparameter choices are standardized across methods. Our benchmarking results suggest that (1) Both Seed and Prompt approaches are competitive and there is no clear winner; (2) Seed is empirically more tolerant than Prompt to human guidance (e.g., seed words, classification instructions, and label words) changes; (3) Seed is empirically more selective than Prompt to the pre-trained language models; (4) Recent Seed and Prompt methods have close connections and a clustering post-processing step based on raw in-domain texts is a strong performance booster to both. We hope this benchmark serves as a guideline in selecting XWS-TC methods in different scenarios and stimulate interest in developing guidance- and model-robust XWS-TC methods.
%R 10.18653/v1/2023.findings-acl.244
%U https://aclanthology.org/2023.findings-acl.244
%U https://doi.org/10.18653/v1/2023.findings-acl.244
%P 3944-3962
Markdown (Informal)
[A Benchmark on Extremely Weakly Supervised Text Classification: Reconcile Seed Matching and Prompting Approaches](https://aclanthology.org/2023.findings-acl.244) (Wang et al., Findings 2023)
ACL