@inproceedings{bates-gurevych-2024-like,
title = "Like a Good Nearest Neighbor: Practical Content Moderation and Text Classification",
author = "Bates, Luke and
Gurevych, Iryna",
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.eacl-long.17",
pages = "276--297",
abstract = "Few-shot text classification systems have impressive capabilities but are infeasible to deploy and use reliably due to their dependence on prompting and billion-parameter language models. SetFit (Tunstall, 2022) is a recent, practical approach that fine-tunes a Sentence Transformer under a contrastive learning paradigm and achieves similar results to more unwieldy systems. Inexpensive text classification is important for addressing the problem of domain drift in all classification tasks, and especially in detecting harmful content, which plagues social media platforms. Here, we propose Like a Good Nearest Neighbor (LaGoNN), a modification to SetFit that introduces no learnable parameters but alters input text with information from its nearest neighbor, for example, the label and text, in the training data, making novel data appear similar to an instance on which the model was optimized. LaGoNN is effective at flagging undesirable content and text classification, and improves SetFit{'}s performance. To demonstrate LaGoNN{'}s value, we conduct a thorough study of text classification systems in the context of content moderation under four label distributions, and in general and multilingual classification settings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bates-gurevych-2024-like">
<titleInfo>
<title>Like a Good Nearest Neighbor: Practical Content Moderation and Text Classification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Bates</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iryna</namePart>
<namePart type="family">Gurevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Purver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Few-shot text classification systems have impressive capabilities but are infeasible to deploy and use reliably due to their dependence on prompting and billion-parameter language models. SetFit (Tunstall, 2022) is a recent, practical approach that fine-tunes a Sentence Transformer under a contrastive learning paradigm and achieves similar results to more unwieldy systems. Inexpensive text classification is important for addressing the problem of domain drift in all classification tasks, and especially in detecting harmful content, which plagues social media platforms. Here, we propose Like a Good Nearest Neighbor (LaGoNN), a modification to SetFit that introduces no learnable parameters but alters input text with information from its nearest neighbor, for example, the label and text, in the training data, making novel data appear similar to an instance on which the model was optimized. LaGoNN is effective at flagging undesirable content and text classification, and improves SetFit’s performance. To demonstrate LaGoNN’s value, we conduct a thorough study of text classification systems in the context of content moderation under four label distributions, and in general and multilingual classification settings.</abstract>
<identifier type="citekey">bates-gurevych-2024-like</identifier>
<location>
<url>https://aclanthology.org/2024.eacl-long.17</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>276</start>
<end>297</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Like a Good Nearest Neighbor: Practical Content Moderation and Text Classification
%A Bates, Luke
%A Gurevych, Iryna
%Y Graham, Yvette
%Y Purver, Matthew
%S Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F bates-gurevych-2024-like
%X Few-shot text classification systems have impressive capabilities but are infeasible to deploy and use reliably due to their dependence on prompting and billion-parameter language models. SetFit (Tunstall, 2022) is a recent, practical approach that fine-tunes a Sentence Transformer under a contrastive learning paradigm and achieves similar results to more unwieldy systems. Inexpensive text classification is important for addressing the problem of domain drift in all classification tasks, and especially in detecting harmful content, which plagues social media platforms. Here, we propose Like a Good Nearest Neighbor (LaGoNN), a modification to SetFit that introduces no learnable parameters but alters input text with information from its nearest neighbor, for example, the label and text, in the training data, making novel data appear similar to an instance on which the model was optimized. LaGoNN is effective at flagging undesirable content and text classification, and improves SetFit’s performance. To demonstrate LaGoNN’s value, we conduct a thorough study of text classification systems in the context of content moderation under four label distributions, and in general and multilingual classification settings.
%U https://aclanthology.org/2024.eacl-long.17
%P 276-297
Markdown (Informal)
[Like a Good Nearest Neighbor: Practical Content Moderation and Text Classification](https://aclanthology.org/2024.eacl-long.17) (Bates & Gurevych, EACL 2024)
ACL