@inproceedings{maharjan-etal-2025-multimodal,
title = "Multimodal Kathmandu@{CASE} 2025: Task-Specific Adaptation of Multimodal Transformers for Hate, Stance, and Humor Detection",
author = "Maharjan, Sujal and
Shrestha, Astha and
Thakur, Shuvam and
Thapa, Rabin",
editor = {H{\"u}rriyeto{\u{g}}lu, Ali and
Tanev, Hristo and
Thapa, Surendrabikram},
booktitle = "Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.case-1.13/",
pages = "107--114",
abstract = "The multimodal ambiguity of text-embedded images (memes), particularly those pertaining to marginalized communities, presents a significant challenge for natural language and vision processing. The subtle interaction between text, image, and cultural context makes it challenging to develop robust moderation tools. This paper tackles this challenge across four key tasks: (A) Hate Speech Detection, (B) Hate Target Classification, (C) Topical Stance Classification, and (D) Intended Humor Detection. We demonstrate that the nuances of these tasks demand a departure from a `onesize-fits-all' approach. Our central contribution is a task-specific methodology, where we align model architecture with the specific challenges of each task, all built upon a common CLIP-ViT backbone. Our results illustrate the strong performance of this task-specific approach, with multiple architectures excelling at each task. For Hate Speech Detection (Task A), the Co-Attention Ensemble model achieved a top F1-score of 0.7929; for Hate Target Classification (Task B), our Hierarchical CrossAttention Transformer achieved an F1-score of 0.5777; and for Stance (Task C) and Humor Detection (Task D), our Two-Stage Multiplicative Fusion Framework yielded leading F1-scores of 0.6070 and 0.7529, respectively. Beyond raw results, we also provide detailed error analyses, including confusion matrices, to reveal weaknesses driven by multimodal ambiguity and class imbalance. Ultimately, this work provides a blueprint for the community, establishing that optimal performance in multimodal analysis is achieved not by a single superior model, but through the customized design of specialized solutions, supported by empirical validation of key methodological choices."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="maharjan-etal-2025-multimodal">
    <titleInfo>
        <title>Multimodal Kathmandu@CASE 2025: Task-Specific Adaptation of Multimodal Transformers for Hate, Stance, and Humor Detection</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Sujal</namePart>
        <namePart type="family">Maharjan</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Astha</namePart>
        <namePart type="family">Shrestha</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Shuvam</namePart>
        <namePart type="family">Thakur</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Rabin</namePart>
        <namePart type="family">Thapa</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2025-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Ali</namePart>
            <namePart type="family">Hürriyetoğlu</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Hristo</namePart>
            <namePart type="family">Tanev</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Surendrabikram</namePart>
            <namePart type="family">Thapa</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
            <place>
                <placeTerm type="text">Varna, Bulgaria</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The multimodal ambiguity of text-embedded images (memes), particularly those pertaining to marginalized communities, presents a significant challenge for natural language and vision processing. The subtle interaction between text, image, and cultural context makes it challenging to develop robust moderation tools. This paper tackles this challenge across four key tasks: (A) Hate Speech Detection, (B) Hate Target Classification, (C) Topical Stance Classification, and (D) Intended Humor Detection. We demonstrate that the nuances of these tasks demand a departure from a ‘one-size-fits-all’ approach. Our central contribution is a task-specific methodology, where we align model architecture with the specific challenges of each task, all built upon a common CLIP-ViT backbone. Our results illustrate the strong performance of this task-specific approach, with multiple architectures excelling at each task. For Hate Speech Detection (Task A), the Co-Attention Ensemble model achieved a top F1-score of 0.7929; for Hate Target Classification (Task B), our Hierarchical Cross-Attention Transformer achieved an F1-score of 0.5777; and for Stance (Task C) and Humor Detection (Task D), our Two-Stage Multiplicative Fusion Framework yielded leading F1-scores of 0.6070 and 0.7529, respectively. Beyond raw results, we also provide detailed error analyses, including confusion matrices, to reveal weaknesses driven by multimodal ambiguity and class imbalance. Ultimately, this work provides a blueprint for the community, establishing that optimal performance in multimodal analysis is achieved not by a single superior model, but through the customized design of specialized solutions, supported by empirical validation of key methodological choices.</abstract>
    <identifier type="citekey">maharjan-etal-2025-multimodal</identifier>
    <location>
        <url>https://aclanthology.org/2025.case-1.13/</url>
    </location>
    <part>
        <date>2025-09</date>
        <extent unit="page">
            <start>107</start>
            <end>114</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Kathmandu@CASE 2025: Task-Specific Adaptation of Multimodal Transformers for Hate, Stance, and Humor Detection
%A Maharjan, Sujal
%A Shrestha, Astha
%A Thakur, Shuvam
%A Thapa, Rabin
%Y Hürriyetoğlu, Ali
%Y Tanev, Hristo
%Y Thapa, Surendrabikram
%S Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F maharjan-etal-2025-multimodal
%X The multimodal ambiguity of text-embedded images (memes), particularly those pertaining to marginalized communities, presents a significant challenge for natural language and vision processing. The subtle interaction between text, image, and cultural context makes it challenging to develop robust moderation tools. This paper tackles this challenge across four key tasks: (A) Hate Speech Detection, (B) Hate Target Classification, (C) Topical Stance Classification, and (D) Intended Humor Detection. We demonstrate that the nuances of these tasks demand a departure from a ‘one-size-fits-all’ approach. Our central contribution is a task-specific methodology, where we align model architecture with the specific challenges of each task, all built upon a common CLIP-ViT backbone. Our results illustrate the strong performance of this task-specific approach, with multiple architectures excelling at each task. For Hate Speech Detection (Task A), the Co-Attention Ensemble model achieved a top F1-score of 0.7929; for Hate Target Classification (Task B), our Hierarchical Cross-Attention Transformer achieved an F1-score of 0.5777; and for Stance (Task C) and Humor Detection (Task D), our Two-Stage Multiplicative Fusion Framework yielded leading F1-scores of 0.6070 and 0.7529, respectively. Beyond raw results, we also provide detailed error analyses, including confusion matrices, to reveal weaknesses driven by multimodal ambiguity and class imbalance. Ultimately, this work provides a blueprint for the community, establishing that optimal performance in multimodal analysis is achieved not by a single superior model, but through the customized design of specialized solutions, supported by empirical validation of key methodological choices.
%U https://aclanthology.org/2025.case-1.13/
%P 107-114
Markdown (Informal)
[Multimodal Kathmandu@CASE 2025: Task-Specific Adaptation of Multimodal Transformers for Hate, Stance, and Humor Detection](https://aclanthology.org/2025.case-1.13/) (Maharjan et al., CASE 2025)
ACL
- Sujal Maharjan, Astha Shrestha, Shuvam Thakur, and Rabin Thapa. 2025. Multimodal Kathmandu@CASE 2025: Task-Specific Adaptation of Multimodal Transformers for Hate, Stance, and Humor Detection. In Proceedings of the 8th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Texts, pages 107–114, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.