@inproceedings{juuti-etal-2020-little,
title = "A little goes a long way: Improving toxic language classification despite data scarcity",
author = {Juuti, Mika and
Gr{\"o}ndahl, Tommi and
Flanagan, Adrian and
Asokan, N.},
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.269",
doi = "10.18653/v1/2020.findings-emnlp.269",
pages = "2991--3009",
abstract = "Detection of some types of toxic language is hampered by extreme scarcity of labeled training data. Data augmentation {--} generating new synthetic data from a labeled seed dataset {--} can help. The efficacy of data augmentation on toxic language classification has not been fully explored. We present the first systematic study on how data augmentation techniques impact performance across toxic language classifiers, ranging from shallow logistic regression architectures to BERT {--} a state-of-the-art pretrained Transformer network. We compare the performance of eight techniques on very scarce seed datasets. We show that while BERT performed the best, shallow classifiers performed comparably when trained on data augmented with a combination of three techniques, including GPT-2-generated sentences. We discuss the interplay of performance and computational overhead, which can inform the choice of techniques under different constraints.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="juuti-etal-2020-little">
<titleInfo>
<title>A little goes a long way: Improving toxic language classification despite data scarcity</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Juuti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tommi</namePart>
<namePart type="family">Gröndahl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrian</namePart>
<namePart type="family">Flanagan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">N</namePart>
<namePart type="family">Asokan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Detection of some types of toxic language is hampered by extreme scarcity of labeled training data. Data augmentation – generating new synthetic data from a labeled seed dataset – can help. The efficacy of data augmentation on toxic language classification has not been fully explored. We present the first systematic study on how data augmentation techniques impact performance across toxic language classifiers, ranging from shallow logistic regression architectures to BERT – a state-of-the-art pretrained Transformer network. We compare the performance of eight techniques on very scarce seed datasets. We show that while BERT performed the best, shallow classifiers performed comparably when trained on data augmented with a combination of three techniques, including GPT-2-generated sentences. We discuss the interplay of performance and computational overhead, which can inform the choice of techniques under different constraints.</abstract>
<identifier type="citekey">juuti-etal-2020-little</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.269</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.269</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>2991</start>
<end>3009</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A little goes a long way: Improving toxic language classification despite data scarcity
%A Juuti, Mika
%A Gröndahl, Tommi
%A Flanagan, Adrian
%A Asokan, N.
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F juuti-etal-2020-little
%X Detection of some types of toxic language is hampered by extreme scarcity of labeled training data. Data augmentation – generating new synthetic data from a labeled seed dataset – can help. The efficacy of data augmentation on toxic language classification has not been fully explored. We present the first systematic study on how data augmentation techniques impact performance across toxic language classifiers, ranging from shallow logistic regression architectures to BERT – a state-of-the-art pretrained Transformer network. We compare the performance of eight techniques on very scarce seed datasets. We show that while BERT performed the best, shallow classifiers performed comparably when trained on data augmented with a combination of three techniques, including GPT-2-generated sentences. We discuss the interplay of performance and computational overhead, which can inform the choice of techniques under different constraints.
%R 10.18653/v1/2020.findings-emnlp.269
%U https://aclanthology.org/2020.findings-emnlp.269
%U https://doi.org/10.18653/v1/2020.findings-emnlp.269
%P 2991-3009
Markdown (Informal)
[A little goes a long way: Improving toxic language classification despite data scarcity](https://aclanthology.org/2020.findings-emnlp.269) (Juuti et al., Findings 2020)
ACL
Mika Juuti, Tommi Gröndahl, Adrian Flanagan, and N. Asokan. 2020. A little goes a long way: Improving toxic language classification despite data scarcity. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 2991–3009, Online. Association for Computational Linguistics.