@inproceedings{mainali-etal-2025-silver,
title = "Silver@{CASE}2025: Detection of Hate Speech, Targets, Humor, and Stance in Marginalized Movement",
author = "Mainali, Rohan and
Aryal, Neha and
Poudel, Sweta and
Acharya, Anupraj and
Thapa, Rabin",
editor = {H{\"u}rriyeto{\u{g}}lu, Ali and
Tanev, Hristo and
Thapa, Surendrabikram},
booktitle = "Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.case-1.10/",
pages = "83--90",
abstract = "Memes, a multimodal form of communication, have emerged as a popular mode of expression in online discourse, particularly among marginalized groups. With multiple meanings, memes often combine satire, irony, and nuanced language, presenting particular challenges to machines in detecting hate speech, humor, stance, and the target of hostility. This paper presents a comparison of unimodal and multimodal solutions to address all four subtasks of the CASE 2025 Shared Task on Multimodal Hate, Humor, and Stance Detection. We compare transformer-based text models (BERT, RoBERTa) with CNN-based vision models (DenseNet, EfficientNet), and multimodal fusion methods, such as CLIP. We find that multimodal systems consistently outperform the unimodal baseline, with CLIP performing the best on all subtasks with a macro F1 score of 78{\%} in sub-task A, 56{\%} in sub-task B, 59{\%} in sub-task C, and 72{\%} in sub-task D."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mainali-etal-2025-silver">
    <titleInfo>
      <title>Silver@CASE2025: Detection of Hate Speech, Targets, Humor, and Stance in Marginalized Movement</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rohan</namePart>
      <namePart type="family">Mainali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Neha</namePart>
      <namePart type="family">Aryal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sweta</namePart>
      <namePart type="family">Poudel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anupraj</namePart>
      <namePart type="family">Acharya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rabin</namePart>
      <namePart type="family">Thapa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ali</namePart>
        <namePart type="family">Hürriyetoğlu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hristo</namePart>
        <namePart type="family">Tanev</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Surendrabikram</namePart>
        <namePart type="family">Thapa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
        <place>
          <placeTerm type="text">Varna, Bulgaria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Memes, a multimodal form of communication, have emerged as a popular mode of expression in online discourse, particularly among marginalized groups. With multiple meanings, memes often combine satire, irony, and nuanced language, presenting particular challenges to machines in detecting hate speech, humor, stance, and the target of hostility. This paper presents a comparison of unimodal and multimodal solutions to address all four subtasks of the CASE 2025 Shared Task on Multimodal Hate, Humor, and Stance Detection. We compare transformer-based text models (BERT, RoBERTa) with CNN-based vision models (DenseNet, EfficientNet), and multimodal fusion methods, such as CLIP. We find that multimodal systems consistently outperform the unimodal baseline, with CLIP performing the best on all subtasks with a macro F1 score of 78% in sub-task A, 56% in sub-task B, 59% in sub-task C, and 72% in sub-task D.</abstract>
    <identifier type="citekey">mainali-etal-2025-silver</identifier>
    <location>
      <url>https://aclanthology.org/2025.case-1.10/</url>
    </location>
    <part>
      <date>2025-09</date>
      <extent unit="page">
        <start>83</start>
        <end>90</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Silver@CASE2025: Detection of Hate Speech, Targets, Humor, and Stance in Marginalized Movement
%A Mainali, Rohan
%A Aryal, Neha
%A Poudel, Sweta
%A Acharya, Anupraj
%A Thapa, Rabin
%Y Hürriyetoğlu, Ali
%Y Tanev, Hristo
%Y Thapa, Surendrabikram
%S Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F mainali-etal-2025-silver
%X Memes, a multimodal form of communication, have emerged as a popular mode of expression in online discourse, particularly among marginalized groups. With multiple meanings, memes often combine satire, irony, and nuanced language, presenting particular challenges to machines in detecting hate speech, humor, stance, and the target of hostility. This paper presents a comparison of unimodal and multimodal solutions to address all four subtasks of the CASE 2025 Shared Task on Multimodal Hate, Humor, and Stance Detection. We compare transformer-based text models (BERT, RoBERTa) with CNN-based vision models (DenseNet, EfficientNet), and multimodal fusion methods, such as CLIP. We find that multimodal systems consistently outperform the unimodal baseline, with CLIP performing the best on all subtasks with a macro F1 score of 78% in sub-task A, 56% in sub-task B, 59% in sub-task C, and 72% in sub-task D.
%U https://aclanthology.org/2025.case-1.10/
%P 83-90

Markdown (Informal)

[Silver@CASE2025: Detection of Hate Speech, Targets, Humor, and Stance in Marginalized Movement](https://aclanthology.org/2025.case-1.10/) (Mainali et al., CASE 2025)

ACL

Rohan Mainali, Neha Aryal, Sweta Poudel, Anupraj Acharya, and Rabin Thapa. 2025. Silver@CASE2025: Detection of Hate Speech, Targets, Humor, and Stance in Marginalized Movement. In Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts, pages 83–90, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.
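
The abstract reports CLIP as the strongest system across all four sub-tasks, but the paper's code is not reproduced here. As a minimal sketch of what a CLIP-based meme classifier can look like, the snippet below uses the HuggingFace `transformers` CLIP implementation in a zero-shot setup; the checkpoint (`openai/clip-vit-base-patch32`), the label prompts, and the image path are illustrative assumptions, not details taken from the paper (which may instead fine-tune CLIP features with a task-specific head).

```python
# Illustrative zero-shot CLIP classification sketch; checkpoint,
# label prompts, and file path are assumptions, not the authors' setup.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Hypothetical prompts for a binary hate-detection sub-task.
labels = ["a hateful meme", "a non-hateful meme"]
image = Image.open("meme.png")  # placeholder path

# Encode the image and both text prompts jointly.
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# logits_per_image scores the image embedding against each text prompt;
# softmax turns the scores into a probability over the label set.
probs = outputs.logits_per_image.softmax(dim=1)
print(dict(zip(labels, probs[0].tolist())))
```

In practice, a shared-task system would more likely replace the zero-shot prompts with a classification head trained on the fused image and text embeddings, but the pattern above shows the core multimodal-fusion idea the abstract refers to.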