% Workshop paper, ACL Anthology record 2026.ltedi-1.9.
% Field values are brace-delimited so nested accent groups and any future
% quotation marks in the text cannot terminate a value early.
@inproceedings{rajkumar-etal-2026-multimodal,
  title     = {Multimodal Transformer Framework for Multilingual Harmful Meme Classification},
  author    = {Rajkumar, Charmathi and
               Subramanian, Malliga and
               Chakravarthi, Bharathi Raja},
  editor    = {Chakravarthi, Bharathi Raja and
               B, Bharathi and
               Buitelaar, Paul and
               Thenmozhi, Durairaj and
               Garc{\'i}a Cumbreras, Miguel {\'A}ngel and
               Jim{\'e}nez Zafra, Salud Mar{\'i}a},
  booktitle = {Proceedings of the Sixth Workshop on Language Technology for Equality, Diversity, Inclusion},
  month     = jul,
  year      = {2026},
  address   = {Virtual (Online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.ltedi-1.9/},
  pages     = {99--107},
  isbn      = {979-8-89176-424-8},
  abstract  = {The rapid expansion of social media platforms has led to a significant increase in the spread of harmful content, including misogynistic, homophobic, and transphobic memes. Detecting such content is challenging because memes often combine textual and visual elements and frequently appear in multilingual and culturally diverse contexts. This study proposes a multimodal transformer-based framework for multilingual harmful meme classification that integrates textual and visual representations to improve detection performance. The proposed architecture employs XLM-RoBERTa for multilingual text encoding and the Swin Transformer for hierarchical visual feature extraction. A cross-attention fusion mechanism is introduced to enable meaningful interaction between textual and visual modalities. The fused representation is then processed through a classification layer to perform multi-class prediction. Experiments are conducted across multiple datasets covering eight languages and three harmful content categories: misogyny, homophobia/transphobia, and hate speech. The model is evaluated using the macro-F1 score and demonstrates consistent improvements over baseline multimodal systems across both high-resource and low-resource languages. The results highlight the effectiveness of transformer-based multimodal architectures in capturing implicit and contextual harmful signals present in memes. The study contributes to the development of robust multilingual systems for harmful content detection and supports efforts toward creating safer and more inclusive online environments.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 description of the paper with citekey rajkumar-etal-2026-multimodal. -->
  <mods ID="rajkumar-etal-2026-multimodal">
    <titleInfo>
      <title>Multimodal Transformer Framework for Multilingual Harmful Meme Classification</title>
    </titleInfo>

    <!-- Paper authors. -->
    <name type="personal">
      <namePart type="given">Charmathi</namePart>
      <namePart type="family">Rajkumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Malliga</namePart>
      <namePart type="family">Subramanian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bharathi</namePart>
      <namePart type="given">Raja</namePart>
      <namePart type="family">Chakravarthi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Sixth Workshop on Language Technology for Equality, Diversity, Inclusion</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Bharathi</namePart>
        <namePart type="given">Raja</namePart>
        <namePart type="family">Chakravarthi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bharathi</namePart>
        <namePart type="family">B</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Paul</namePart>
        <namePart type="family">Buitelaar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Durairaj</namePart>
        <namePart type="family">Thenmozhi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Miguel</namePart>
        <namePart type="given">Ángel</namePart>
        <namePart type="family">García Cumbreras</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Salud</namePart>
        <namePart type="given">María</namePart>
        <namePart type="family">Jiménez Zafra</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Virtual (Online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-424-8</identifier>
    </relatedItem>

    <abstract>The rapid expansion of social media platforms has led to a significant increase in the spread of harmful content, including misogynistic, homophobic, and transphobic memes. Detecting such content is challenging because memes often combine textual and visual elements and frequently appear in multilingual and culturally diverse contexts. This study proposes a multimodal transformer-based framework for multilingual harmful meme classification that integrates textual and visual representations to improve detection performance. The proposed architecture employs XLM-RoBERTa for multilingual text encoding and the Swin Transformer for hierarchical visual feature extraction. A cross-attention fusion mechanism is introduced to enable meaningful interaction between textual and visual modalities. The fused representation is then processed through a classification layer to perform multi-class prediction. Experiments are conducted across multiple datasets covering eight languages and three harmful content categories: misogyny, homophobia/transphobia, and hate speech. The model is evaluated using the macro-F1 score and demonstrates consistent improvements over baseline multimodal systems across both high-resource and low-resource languages. The results highlight the effectiveness of transformer-based multimodal architectures in capturing implicit and contextual harmful signals present in memes. The study contributes to the development of robust multilingual systems for harmful content detection and supports efforts toward creating safer and more inclusive online environments.</abstract>
    <identifier type="citekey">rajkumar-etal-2026-multimodal</identifier>
    <location>
      <url>https://aclanthology.org/2026.ltedi-1.9/</url>
    </location>

    <!-- Position of the paper inside the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>99</start>
        <end>107</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Transformer Framework for Multilingual Harmful Meme Classification
%A Rajkumar, Charmathi
%A Subramanian, Malliga
%A Chakravarthi, Bharathi Raja
%Y Chakravarthi, Bharathi Raja
%Y B, Bharathi
%Y Buitelaar, Paul
%Y Thenmozhi, Durairaj
%Y García Cumbreras, Miguel Ángel
%Y Jiménez Zafra, Salud María
%S Proceedings of the Sixth Workshop on Language Technology for Equality, Diversity, Inclusion
%D 2026
%8 July
%I Association for Computational Linguistics
%C Virtual (Online)
%@ 979-8-89176-424-8
%F rajkumar-etal-2026-multimodal
%X The rapid expansion of social media platforms has led to a significant increase in the spread of harmful content, including misogynistic, homophobic, and transphobic memes. Detecting such content is challenging because memes often combine textual and visual elements and frequently appear in multilingual and culturally diverse contexts. This study proposes a multimodal transformer-based framework for multilingual harmful meme classification that integrates textual and visual representations to improve detection performance. The proposed architecture employs XLM-RoBERTa for multilingual text encoding and the Swin Transformer for hierarchical visual feature extraction. A cross-attention fusion mechanism is introduced to enable meaningful interaction between textual and visual modalities. The fused representation is then processed through a classification layer to perform multi-class prediction. Experiments are conducted across multiple datasets covering eight languages and three harmful content categories: misogyny, homophobia/transphobia, and hate speech. The model is evaluated using the macro-F1 score and demonstrates consistent improvements over baseline multimodal systems across both high-resource and low-resource languages. The results highlight the effectiveness of transformer-based multimodal architectures in capturing implicit and contextual harmful signals present in memes. The study contributes to the development of robust multilingual systems for harmful content detection and supports efforts toward creating safer and more inclusive online environments.
%U https://aclanthology.org/2026.ltedi-1.9/
%P 99-107
Markdown (Informal)
[Multimodal Transformer Framework for Multilingual Harmful Meme Classification](https://aclanthology.org/2026.ltedi-1.9/) (Rajkumar et al., LTEDI 2026)
ACL