@inproceedings{dinkov-etal-2019-detecting,
title = "Detecting Toxicity in News Articles: Application to {B}ulgarian",
author = "Dinkov, Yoan and
Koychev, Ivan and
Nakov, Preslav",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)",
month = sep,
year = "2019",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/R19-1029",
doi = "10.26615/978-954-452-056-4_029",
pages = "247--258",
abstract = "Online media aim for reaching ever bigger audience and for attracting ever longer attention span. This competition creates an environment that rewards sensational, fake, and toxic news. To help limit their spread and impact, we propose and develop a news toxicity detector that can recognize various types of toxic content. While previous research primarily focused on English, here we target Bulgarian. We created a new dataset by crawling a website that for five years has been collecting Bulgarian news articles that were manually categorized into eight toxicity groups. Then we trained a multi-class classifier with nine categories: eight toxic and one non-toxic. We experimented with different representations based on ElMo, BERT, and XLM, as well as with a variety of domain-specific features. Due to the small size of our dataset, we created a separate model for each feature type, and we ultimately combined these models into a meta-classifier. The evaluation results show an accuracy of 59.0{\%} and a macro-F1 score of 39.7{\%}, which represent sizable improvements over the majority-class baseline (Acc=30.3{\%}, macro-F1=5.2{\%}).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dinkov-etal-2019-detecting">
<titleInfo>
<title>Detecting Toxicity in News Articles: Application to Bulgarian</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoan</namePart>
<namePart type="family">Dinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Koychev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Online media aim for reaching ever bigger audience and for attracting ever longer attention span. This competition creates an environment that rewards sensational, fake, and toxic news. To help limit their spread and impact, we propose and develop a news toxicity detector that can recognize various types of toxic content. While previous research primarily focused on English, here we target Bulgarian. We created a new dataset by crawling a website that for five years has been collecting Bulgarian news articles that were manually categorized into eight toxicity groups. Then we trained a multi-class classifier with nine categories: eight toxic and one non-toxic. We experimented with different representations based on ElMo, BERT, and XLM, as well as with a variety of domain-specific features. Due to the small size of our dataset, we created a separate model for each feature type, and we ultimately combined these models into a meta-classifier. The evaluation results show an accuracy of 59.0% and a macro-F1 score of 39.7%, which represent sizable improvements over the majority-class baseline (Acc=30.3%, macro-F1=5.2%).</abstract>
<identifier type="citekey">dinkov-etal-2019-detecting</identifier>
<identifier type="doi">10.26615/978-954-452-056-4_029</identifier>
<location>
<url>https://aclanthology.org/R19-1029</url>
</location>
<part>
<date>2019-09</date>
<extent unit="page">
<start>247</start>
<end>258</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Detecting Toxicity in News Articles: Application to Bulgarian
%A Dinkov, Yoan
%A Koychev, Ivan
%A Nakov, Preslav
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)
%D 2019
%8 September
%I INCOMA Ltd.
%C Varna, Bulgaria
%F dinkov-etal-2019-detecting
%X Online media aim for reaching ever bigger audience and for attracting ever longer attention span. This competition creates an environment that rewards sensational, fake, and toxic news. To help limit their spread and impact, we propose and develop a news toxicity detector that can recognize various types of toxic content. While previous research primarily focused on English, here we target Bulgarian. We created a new dataset by crawling a website that for five years has been collecting Bulgarian news articles that were manually categorized into eight toxicity groups. Then we trained a multi-class classifier with nine categories: eight toxic and one non-toxic. We experimented with different representations based on ElMo, BERT, and XLM, as well as with a variety of domain-specific features. Due to the small size of our dataset, we created a separate model for each feature type, and we ultimately combined these models into a meta-classifier. The evaluation results show an accuracy of 59.0% and a macro-F1 score of 39.7%, which represent sizable improvements over the majority-class baseline (Acc=30.3%, macro-F1=5.2%).
%R 10.26615/978-954-452-056-4_029
%U https://aclanthology.org/R19-1029
%U https://doi.org/10.26615/978-954-452-056-4_029
%P 247-258
Markdown (Informal)
[Detecting Toxicity in News Articles: Application to Bulgarian](https://aclanthology.org/R19-1029) (Dinkov et al., RANLP 2019)
ACL