@inproceedings{schmidhuber-2021-universitat,
title = {Universit{\"a}t Regensburg {M}ax{S} at {G}erm{E}val 2021 Task 1: Synthetic Data in Toxic Comment Classification},
author = "Schmidhuber, Maximilian",
editor = "Risch, Julian and
Stoll, Anke and
Wilms, Lena and
Wiegand, Michael",
booktitle = "Proceedings of the GermEval 2021 Shared Task on the Identification of Toxic, Engaging, and Fact-Claiming Comments",
month = sep,
year = "2021",
address = "Duesseldorf, Germany",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.germeval-1.9",
pages = "62--68",
    abstract = "We report on our submission to Task 1 of the GermEval 2021 challenge {--} toxic comment classification. We investigate different ways of bolstering scarce training data to improve off-the-shelf model performance on a toxic comment classification task. To help address the limitations of a small dataset, we use data synthetically generated by a German GPT-2 model. The use of synthetic data has only recently been taking off as a possible solution to addressing training data sparseness in NLP, and initial results are promising. However, our model did not see measurable improvement through the use of synthetic data. We discuss possible reasons for this finding and explore future works in the field.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schmidhuber-2021-universitat">
<titleInfo>
<title>Universität Regensburg MaxS at GermEval 2021 Task 1: Synthetic Data in Toxic Comment Classification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maximilian</namePart>
<namePart type="family">Schmidhuber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the GermEval 2021 Shared Task on the Identification of Toxic, Engaging, and Fact-Claiming Comments</title>
</titleInfo>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">Risch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anke</namePart>
<namePart type="family">Stoll</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lena</namePart>
<namePart type="family">Wilms</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Wiegand</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Duesseldorf, Germany</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
  <abstract>We report on our submission to Task 1 of the GermEval 2021 challenge – toxic comment classification. We investigate different ways of bolstering scarce training data to improve off-the-shelf model performance on a toxic comment classification task. To help address the limitations of a small dataset, we use data synthetically generated by a German GPT-2 model. The use of synthetic data has only recently been taking off as a possible solution to addressing training data sparseness in NLP, and initial results are promising. However, our model did not see measurable improvement through the use of synthetic data. We discuss possible reasons for this finding and explore future works in the field.</abstract>
<identifier type="citekey">schmidhuber-2021-universitat</identifier>
<location>
<url>https://aclanthology.org/2021.germeval-1.9</url>
</location>
<part>
<date>2021-09</date>
<extent unit="page">
<start>62</start>
<end>68</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Universität Regensburg MaxS at GermEval 2021 Task 1: Synthetic Data in Toxic Comment Classification
%A Schmidhuber, Maximilian
%Y Risch, Julian
%Y Stoll, Anke
%Y Wilms, Lena
%Y Wiegand, Michael
%S Proceedings of the GermEval 2021 Shared Task on the Identification of Toxic, Engaging, and Fact-Claiming Comments
%D 2021
%8 September
%I Association for Computational Linguistics
%C Duesseldorf, Germany
%F schmidhuber-2021-universitat
%X We report on our submission to Task 1 of the GermEval 2021 challenge – toxic comment classification. We investigate different ways of bolstering scarce training data to improve off-the-shelf model performance on a toxic comment classification task. To help address the limitations of a small dataset, we use data synthetically generated by a German GPT-2 model. The use of synthetic data has only recently been taking off as a possible solution to addressing training data sparseness in NLP, and initial results are promising. However, our model did not see measurable improvement through the use of synthetic data. We discuss possible reasons for this finding and explore future works in the field.
%U https://aclanthology.org/2021.germeval-1.9
%P 62-68
Markdown (Informal)
[Universität Regensburg MaxS at GermEval 2021 Task 1: Synthetic Data in Toxic Comment Classification](https://aclanthology.org/2021.germeval-1.9) (Schmidhuber, GermEval 2021)
ACL