@inproceedings{hamdy-etal-2020-nlpup,
title = "nlp{UP} at {S}em{E}val-2020 Task 12 : A Blazing Fast System for Offensive Language Detection",
author = "Hamdy, Ehab and
Mitrovi{\'c}, Jelena and
Granitzer, Michael",
editor = "Herbelot, Aurelie and
Zhu, Xiaodan and
Palmer, Alexis and
Schneider, Nathan and
May, Jonathan and
Shutova, Ekaterina",
booktitle = "Proceedings of the Fourteenth Workshop on Semantic Evaluation",
month = dec,
year = "2020",
address = "Barcelona (online)",
publisher = "International Committee for Computational Linguistics",
url = "https://aclanthology.org/2020.semeval-1.278/",
doi = "10.18653/v1/2020.semeval-1.278",
pages = "2098--2104",
abstract = "In this paper, we introduce our submission for the SemEval Task 12, sub-tasks A and B for offensive language identification and categorization in English tweets. This year the data set for Task A is significantly larger than in the previous year. Therefore, we have adapted the BlazingText algorithm to extract embedding representation and classify texts after filtering and sanitizing the dataset according to the conventional text patterns on social media. We have gained both advantages of a speedy training process and obtained a good F1 score of 90.88{\%} on the test set. For sub-task B, we opted to fine-tune a Bidirectional Encoder Representation from a Transformer (BERT) to accommodate the limited data for categorizing offensive tweets. We have achieved an F1 score of only 56.86{\%}, but after experimenting with various label assignment thresholds in the pre-processing steps, the F1 score improved to 64{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hamdy-etal-2020-nlpup">
<titleInfo>
<title>nlpUP at SemEval-2020 Task 12 : A Blazing Fast System for Offensive Language Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ehab</namePart>
<namePart type="family">Hamdy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jelena</namePart>
<namePart type="family">Mitrović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Granitzer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourteenth Workshop on Semantic Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aurelie</namePart>
<namePart type="family">Herbelot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaodan</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Palmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nathan</namePart>
<namePart type="family">Schneider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">May</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona (online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we introduce our submission for the SemEval Task 12, sub-tasks A and B for offensive language identification and categorization in English tweets. This year the data set for Task A is significantly larger than in the previous year. Therefore, we have adapted the BlazingText algorithm to extract embedding representation and classify texts after filtering and sanitizing the dataset according to the conventional text patterns on social media. We have gained both advantages of a speedy training process and obtained a good F1 score of 90.88% on the test set. For sub-task B, we opted to fine-tune a Bidirectional Encoder Representation from a Transformer (BERT) to accommodate the limited data for categorizing offensive tweets. We have achieved an F1 score of only 56.86%, but after experimenting with various label assignment thresholds in the pre-processing steps, the F1 score improved to 64%.</abstract>
<identifier type="citekey">hamdy-etal-2020-nlpup</identifier>
<identifier type="doi">10.18653/v1/2020.semeval-1.278</identifier>
<location>
<url>https://aclanthology.org/2020.semeval-1.278/</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>2098</start>
<end>2104</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T nlpUP at SemEval-2020 Task 12 : A Blazing Fast System for Offensive Language Detection
%A Hamdy, Ehab
%A Mitrović, Jelena
%A Granitzer, Michael
%Y Herbelot, Aurelie
%Y Zhu, Xiaodan
%Y Palmer, Alexis
%Y Schneider, Nathan
%Y May, Jonathan
%Y Shutova, Ekaterina
%S Proceedings of the Fourteenth Workshop on Semantic Evaluation
%D 2020
%8 December
%I International Committee for Computational Linguistics
%C Barcelona (online)
%F hamdy-etal-2020-nlpup
%X In this paper, we introduce our submission for the SemEval Task 12, sub-tasks A and B for offensive language identification and categorization in English tweets. This year the data set for Task A is significantly larger than in the previous year. Therefore, we have adapted the BlazingText algorithm to extract embedding representation and classify texts after filtering and sanitizing the dataset according to the conventional text patterns on social media. We have gained both advantages of a speedy training process and obtained a good F1 score of 90.88% on the test set. For sub-task B, we opted to fine-tune a Bidirectional Encoder Representation from a Transformer (BERT) to accommodate the limited data for categorizing offensive tweets. We have achieved an F1 score of only 56.86%, but after experimenting with various label assignment thresholds in the pre-processing steps, the F1 score improved to 64%.
%R 10.18653/v1/2020.semeval-1.278
%U https://aclanthology.org/2020.semeval-1.278/
%U https://doi.org/10.18653/v1/2020.semeval-1.278
%P 2098-2104
Markdown (Informal)
[nlpUP at SemEval-2020 Task 12 : A Blazing Fast System for Offensive Language Detection](https://aclanthology.org/2020.semeval-1.278/) (Hamdy et al., SemEval 2020)
ACL