@inproceedings{pelicon-etal-2024-denoising,
title = "Denoising Labeled Data for Comment Moderation Using Active Learning",
author = "Pelicon, Andra{\v{z}} and
Karan, Mladen and
Shekhar, Ravi and
Purver, Matthew and
Pollak, Senja",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.413",
pages = "4626--4633",
abstract = "Noisily labeled textual data is ample on internet platforms that allow user-created content. Training models, such as offensive language detection models for comment moderation, on such data may prove difficult as the noise in the labels prevents the model from converging. In this work, we propose to use active learning methods for the purposes of denoising training data for model training. The goal is to sample the most informative examples with noisy labels with active learning and send them to the oracle for reannotation, thus reducing the overall cost of reannotation. In this setting we tested three existing active learning methods, namely DBAL, Variance of Gradients (VoG) and BADGE. The proposed approach to data denoising is tested on the problem of offensive language detection. We observe that active learning can be effectively used for the purposes of data denoising; however, care should be taken when choosing the algorithm for this purpose.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pelicon-etal-2024-denoising">
<titleInfo>
<title>Denoising Labeled Data for Comment Moderation Using Active Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andraž</namePart>
<namePart type="family">Pelicon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mladen</namePart>
<namePart type="family">Karan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ravi</namePart>
<namePart type="family">Shekhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Purver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Senja</namePart>
<namePart type="family">Pollak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Noisily labeled textual data is ample on internet platforms that allow user-created content. Training models, such as offensive language detection models for comment moderation, on such data may prove difficult as the noise in the labels prevents the model from converging. In this work, we propose to use active learning methods for the purposes of denoising training data for model training. The goal is to sample the most informative examples with noisy labels with active learning and send them to the oracle for reannotation, thus reducing the overall cost of reannotation. In this setting we tested three existing active learning methods, namely DBAL, Variance of Gradients (VoG) and BADGE. The proposed approach to data denoising is tested on the problem of offensive language detection. We observe that active learning can be effectively used for the purposes of data denoising; however, care should be taken when choosing the algorithm for this purpose.</abstract>
<identifier type="citekey">pelicon-etal-2024-denoising</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.413</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>4626</start>
<end>4633</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Denoising Labeled Data for Comment Moderation Using Active Learning
%A Pelicon, Andraž
%A Karan, Mladen
%A Shekhar, Ravi
%A Purver, Matthew
%A Pollak, Senja
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F pelicon-etal-2024-denoising
%X Noisily labeled textual data is ample on internet platforms that allow user-created content. Training models, such as offensive language detection models for comment moderation, on such data may prove difficult as the noise in the labels prevents the model from converging. In this work, we propose to use active learning methods for the purposes of denoising training data for model training. The goal is to sample the most informative examples with noisy labels with active learning and send them to the oracle for reannotation, thus reducing the overall cost of reannotation. In this setting we tested three existing active learning methods, namely DBAL, Variance of Gradients (VoG) and BADGE. The proposed approach to data denoising is tested on the problem of offensive language detection. We observe that active learning can be effectively used for the purposes of data denoising; however, care should be taken when choosing the algorithm for this purpose.
%U https://aclanthology.org/2024.lrec-main.413
%P 4626-4633
Markdown (Informal)
[Denoising Labeled Data for Comment Moderation Using Active Learning](https://aclanthology.org/2024.lrec-main.413) (Pelicon et al., LREC-COLING 2024)
ACL
- Andraž Pelicon, Mladen Karan, Ravi Shekhar, Matthew Purver, and Senja Pollak. 2024. Denoising Labeled Data for Comment Moderation Using Active Learning. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 4626–4633, Torino, Italia. ELRA and ICCL.