@inproceedings{joy-etal-2025-cuet,
title = "{CUET} {NOOB}@{CASE}2025: {M}ultimodal{H}ate Speech Detection in Text-Embedded Memes using Late Fusion with Attention Mechanism",
author = "Joy, Tomal Paul and
Islam, Aminul and
Islam, Saimum and
Shawon, Md. Tanvir Ahammed and
Mia, Md. Ayon and
Khan, Mohammad Ibrahim",
editor = {H{\"u}rriyeto{\u{g}}lu, Ali and
Tanev, Hristo and
Thapa, Surendrabikram},
booktitle = "Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.case-1.19/",
pages = "152--158",
abstract = "Memes and text-embedded images have rapidly become compelling cultural artifacts that both facilitate expressive communication and serve as conduits for spreading hate speech against marginalized communities. Detecting hate speech within such multimodal content poses significant challenges due to the complex and subtle interplay between textual and visual elements. This paper presents our approach for Subtask A of the Shared Task on Multimodal Hate Detection in Marginalized Movement@CASE 2025, focusing on the binary classification of memes into Hate or No Hate categories. We propose a novel multimodal architecture that integrates DistilBERT for textual encoding with Vision Transformer (ViT) for image representation, combined through an advanced late fusion mechanism leveraging multi-head attention. Our method utilizes attention-based feature alignment to capture nuanced cross-modal interactions within memes. The proposed system achieved an F1-score of 0.7416 on the test set, securing the 13th position in the competition. These results underscore the value of sophisticated fusion strategies and attention mechanisms in comprehending and detecting complex socio-political content embedded in memes."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="joy-etal-2025-cuet">
<titleInfo>
        <title>CUET NOOB@CASE2025: Multimodal Hate Speech Detection in Text-Embedded Memes using Late Fusion with Attention Mechanism</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tomal</namePart>
<namePart type="given">Paul</namePart>
<namePart type="family">Joy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aminul</namePart>
<namePart type="family">Islam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saimum</namePart>
<namePart type="family">Islam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Tanvir</namePart>
<namePart type="given">Ahammed</namePart>
<namePart type="family">Shawon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Ayon</namePart>
<namePart type="family">Mia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Hürriyetoğlu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hristo</namePart>
<namePart type="family">Tanev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Surendrabikram</namePart>
<namePart type="family">Thapa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Memes and text-embedded images have rapidly become compelling cultural artifacts that both facilitate expressive communication and serve as conduits for spreading hate speech against marginalized communities. Detecting hate speech within such multimodal content poses significant challenges due to the complex and subtle interplay between textual and visual elements. This paper presents our approach for Subtask A of the Shared Task on Multimodal Hate Detection in Marginalized Movement@CASE 2025, focusing on the binary classification of memes into Hate or No Hate categories. We propose a novel multimodal architecture that integrates DistilBERT for textual encoding with Vision Transformer (ViT) for image representation, combined through an advanced late fusion mechanism leveraging multi-head attention. Our method utilizes attention-based feature alignment to capture nuanced cross-modal interactions within memes. The proposed system achieved an F1-score of 0.7416 on the test set, securing the 13th position in the competition. These results underscore the value of sophisticated fusion strategies and attention mechanisms in comprehending and detecting complex socio-political content embedded in memes.</abstract>
<identifier type="citekey">joy-etal-2025-cuet</identifier>
<location>
<url>https://aclanthology.org/2025.case-1.19/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>152</start>
<end>158</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CUET NOOB@CASE2025: Multimodal Hate Speech Detection in Text-Embedded Memes using Late Fusion with Attention Mechanism
%A Joy, Tomal Paul
%A Islam, Aminul
%A Islam, Saimum
%A Shawon, Md. Tanvir Ahammed
%A Mia, Md. Ayon
%A Khan, Mohammad Ibrahim
%Y Hürriyetoğlu, Ali
%Y Tanev, Hristo
%Y Thapa, Surendrabikram
%S Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F joy-etal-2025-cuet
%X Memes and text-embedded images have rapidly become compelling cultural artifacts that both facilitate expressive communication and serve as conduits for spreading hate speech against marginalized communities. Detecting hate speech within such multimodal content poses significant challenges due to the complex and subtle interplay between textual and visual elements. This paper presents our approach for Subtask A of the Shared Task on Multimodal Hate Detection in Marginalized Movement@CASE 2025, focusing on the binary classification of memes into Hate or No Hate categories. We propose a novel multimodal architecture that integrates DistilBERT for textual encoding with Vision Transformer (ViT) for image representation, combined through an advanced late fusion mechanism leveraging multi-head attention. Our method utilizes attention-based feature alignment to capture nuanced cross-modal interactions within memes. The proposed system achieved an F1-score of 0.7416 on the test set, securing the 13th position in the competition. These results underscore the value of sophisticated fusion strategies and attention mechanisms in comprehending and detecting complex socio-political content embedded in memes.
%U https://aclanthology.org/2025.case-1.19/
%P 152-158
Markdown (Informal)
[CUET NOOB@CASE2025: Multimodal Hate Speech Detection in Text-Embedded Memes using Late Fusion with Attention Mechanism](https://aclanthology.org/2025.case-1.19/) (Joy et al., CASE 2025)
ACL
- Tomal Paul Joy, Aminul Islam, Saimum Islam, Md. Tanvir Ahammed Shawon, Md. Ayon Mia, and Mohammad Ibrahim Khan. 2025. CUET NOOB@CASE2025: Multimodal Hate Speech Detection in Text-Embedded Memes using Late Fusion with Attention Mechanism. In Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts, pages 152–158, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.
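
The abstract describes a late-fusion architecture that pairs a DistilBERT text encoder with a ViT image encoder and aligns the two modalities with multi-head attention before binary classification. The sketch below is only an illustrative reconstruction of that idea, not the authors' released code: the checkpoints (`distilbert-base-uncased`, `google/vit-base-patch16-224-in21k`), the query/key/value wiring, pooling, and classifier head are assumptions.

```python
# Minimal sketch of an attention-based late-fusion meme classifier,
# assuming Hugging Face Transformers and PyTorch. Hyperparameters and
# fusion details are illustrative guesses, not the paper's exact system.
import torch
import torch.nn as nn
from transformers import DistilBertModel, ViTModel

class LateFusionMemeClassifier(nn.Module):
    def __init__(self, hidden=768, heads=8, num_classes=2):
        super().__init__()
        self.text_encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.image_encoder = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        # Multi-head attention aligns text tokens (queries) with image patches (keys/values).
        self.cross_attn = nn.MultiheadAttention(hidden, heads, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(hidden * 2, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, num_classes),
        )

    def forward(self, input_ids, attention_mask, pixel_values):
        txt = self.text_encoder(input_ids=input_ids,
                                attention_mask=attention_mask).last_hidden_state
        img = self.image_encoder(pixel_values=pixel_values).last_hidden_state
        # Text attends over image patches; pool both streams and fuse late.
        attended, _ = self.cross_attn(query=txt, key=img, value=img)
        fused = torch.cat([attended.mean(dim=1), img.mean(dim=1)], dim=-1)
        return self.classifier(fused)  # logits for Hate / No Hate
```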