@inproceedings{ginga-uban-2024-scitechbaitro,
title = "{S}ci{T}ech{B}ait{RO}: {C}lick{B}ait Detection for {R}omanian Science and Technology News",
author = "G{\^\i}nga, Raluca-Andreea and
Uban, Ana Sabina",
editor = "Dementieva, Daryna and
Ignat, Oana and
Jin, Zhijing and
Mihalcea, Rada and
Piatti, Giorgio and
Tetreault, Joel and
Wilson, Steven and
Zhao, Jieyu",
booktitle = "Proceedings of the Third Workshop on NLP for Positive Impact",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4pi-1.17",
pages = "188--201",
abstract = "In this paper, we introduce a new annotated corpus of clickbait news in a low-resource language - Romanian, and a rarely covered domain - science and technology news: SciTechBaitRO. It is one of the first and the largest corpus (almost 11,000 examples) of annotated clickbait texts for the Romanian language and the first one to focus on the sci-tech domain, to our knowledge. We evaluate the possibility of automatically detecting clickbait through a series of data analysis and machine learning experiments with varied features and models, including a range of linguistic features, classical machine learning models, deep learning and pre-trained models. We compare the performance of models using different kinds of features, and show that the best results are given by the BERT models, with results of up to 89{\%} F1 score. We additionally evaluate the models in a cross-domain setting for news belonging to other categories (i.e. politics, sports, entertainment) and demonstrate their capacity to generalize by detecting clickbait news outside of domain with high F1-scores.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ginga-uban-2024-scitechbaitro">
<titleInfo>
<title>SciTechBaitRO: ClickBait Detection for Romanian Science and Technology News</title>
</titleInfo>
<name type="personal">
<namePart type="given">Raluca-Andreea</namePart>
<namePart type="family">Gînga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ana</namePart>
<namePart type="given">Sabina</namePart>
<namePart type="family">Uban</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on NLP for Positive Impact</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daryna</namePart>
<namePart type="family">Dementieva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oana</namePart>
<namePart type="family">Ignat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhijing</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rada</namePart>
<namePart type="family">Mihalcea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giorgio</namePart>
<namePart type="family">Piatti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Wilson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jieyu</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we introduce a new annotated corpus of clickbait news in a low-resource language - Romanian, and a rarely covered domain - science and technology news: SciTechBaitRO. It is one of the first and the largest corpus (almost 11,000 examples) of annotated clickbait texts for the Romanian language and the first one to focus on the sci-tech domain, to our knowledge. We evaluate the possibility of automatically detecting clickbait through a series of data analysis and machine learning experiments with varied features and models, including a range of linguistic features, classical machine learning models, deep learning and pre-trained models. We compare the performance of models using different kinds of features, and show that the best results are given by the BERT models, with results of up to 89% F1 score. We additionally evaluate the models in a cross-domain setting for news belonging to other categories (i.e. politics, sports, entertainment) and demonstrate their capacity to generalize by detecting clickbait news outside of domain with high F1-scores.</abstract>
<identifier type="citekey">ginga-uban-2024-scitechbaitro</identifier>
<location>
<url>https://aclanthology.org/2024.nlp4pi-1.17</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>188</start>
<end>201</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SciTechBaitRO: ClickBait Detection for Romanian Science and Technology News
%A Gînga, Raluca-Andreea
%A Uban, Ana Sabina
%Y Dementieva, Daryna
%Y Ignat, Oana
%Y Jin, Zhijing
%Y Mihalcea, Rada
%Y Piatti, Giorgio
%Y Tetreault, Joel
%Y Wilson, Steven
%Y Zhao, Jieyu
%S Proceedings of the Third Workshop on NLP for Positive Impact
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F ginga-uban-2024-scitechbaitro
%X In this paper, we introduce a new annotated corpus of clickbait news in a low-resource language - Romanian, and a rarely covered domain - science and technology news: SciTechBaitRO. It is one of the first and the largest corpus (almost 11,000 examples) of annotated clickbait texts for the Romanian language and the first one to focus on the sci-tech domain, to our knowledge. We evaluate the possibility of automatically detecting clickbait through a series of data analysis and machine learning experiments with varied features and models, including a range of linguistic features, classical machine learning models, deep learning and pre-trained models. We compare the performance of models using different kinds of features, and show that the best results are given by the BERT models, with results of up to 89% F1 score. We additionally evaluate the models in a cross-domain setting for news belonging to other categories (i.e. politics, sports, entertainment) and demonstrate their capacity to generalize by detecting clickbait news outside of domain with high F1-scores.
%U https://aclanthology.org/2024.nlp4pi-1.17
%P 188-201
Markdown (Informal)
[SciTechBaitRO: ClickBait Detection for Romanian Science and Technology News](https://aclanthology.org/2024.nlp4pi-1.17) (Gînga & Uban, NLP4PI 2024)
ACL