% BibTeX record for the LT-EDI 2026 workshop paper (ACL Anthology ID 2026.ltedi-1.27).
% The abstract keeps its math as LaTeX; the other export formats below carry the same text.
@inproceedings{jaishri-etal-2026-susmitha,
    title = "Susmitha@{LT}-{EDI} 2026: Detecting {LGBTQ}+ Phobia in Multilingual Memes via Joint Representation",
    author = "Jaishri, Susmitha and
      Shanmugavadivel, Kogilavani and
      Subramanian, Malliga and
      R, Mouleeshuwarapprabu",
    editor = "Chakravarthi, Bharathi Raja and
      B, Bharathi and
      Buitelaar, Paul and
      Thenmozhi, Durairaj and
      Garc{\'i}a Cumbreras, Miguel {\'A}ngel and
      Jim{\'e}nez Zafra, Salud Mar{\'i}a",
    booktitle = "Proceedings of the Sixth Workshop on Language Technology for Equality, Diversity, Inclusion",
    month = jul,
    year = "2026",
    address = "Virtual (Online)",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.ltedi-1.27/",
    pages = "222--225",
    ISBN = "979-8-89176-424-8",
    abstract = "The automated detection of LGBTQ+ phobia in social media memes is essential for fostering inclusive digital environments, yet it remains challenging due to the complex interplay of visual metaphors and multilingual text. We participated in the ``Homophobia and Transphobia Meme Classification'' shared task at LT-EDI 2026, evaluating a multimodal architecture across English, Hindi, and Chinese tracks. Our system employs a late-fusion strategy: XLM-RoBERTa encodes OCR-extracted text into a representation $h_{t} \in \mathbb{R}^{768}$, while CLIP extracts visual features $h_{v} \in \mathbb{R}^{512}$. These are concatenated into a joint vector $z = [h_{t} \oplus h_{v}] \in \mathbb{R}^{1280}$ and processed via a non-linear multilayer perceptron to capture cross-modal interactions. The system demonstrated robust performance in high-resource contexts, securing 3rd rank in the Chinese track (Macro F1: 0.7371) and 4th rank in the English track (Macro F1: 0.6121). In contrast, the Hindi track results (Macro F1: 0.1616) revealed significant challenges related to script complexity and class imbalance. These findings underscore the effectiveness of global transformer-based models for multimodal reasoning while highlighting the ongoing need for specialized linguistic refinement in low-resource and diverse script environments."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jaishri-etal-2026-susmitha">
<titleInfo>
<title>Susmitha@LT-EDI 2026: Detecting LGBTQ+ Phobia in Multilingual Memes via Joint Representation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Susmitha</namePart>
<namePart type="family">Jaishri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kogilavani</namePart>
<namePart type="family">Shanmugavadivel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malliga</namePart>
<namePart type="family">Subramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mouleeshuwarapprabu</namePart>
<namePart type="family">R</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on Language Technology for Equality, Diversity, Inclusion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="family">B</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Buitelaar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Durairaj</namePart>
<namePart type="family">Thenmozhi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miguel</namePart>
<namePart type="given">Ángel</namePart>
<namePart type="family">García Cumbreras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salud</namePart>
<namePart type="given">María</namePart>
<namePart type="family">Jiménez Zafra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Virtual (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-424-8</identifier>
</relatedItem>
<abstract>The automated detection of LGBTQ+ phobia in social media memes is essential for fostering inclusive digital environments, yet it remains challenging due to the complex interplay of visual metaphors and multilingual text. We participated in the “Homophobia and Transphobia Meme Classification” shared task at LT-EDI 2026, evaluating a multimodal architecture across English, Hindi, and Chinese tracks. Our system employs a late-fusion strategy: XLM-RoBERTa encodes OCR-extracted text into a representation $h_{t} \in \mathbb{R}^{768}$, while CLIP extracts visual features $h_{v} \in \mathbb{R}^{512}$. These are concatenated into a joint vector $z = [h_{t} \oplus h_{v}] \in \mathbb{R}^{1280}$ and processed via a non-linear multilayer perceptron to capture cross-modal interactions. The system demonstrated robust performance in high-resource contexts, securing 3rd rank in the Chinese track (Macro F1: 0.7371) and 4th rank in the English track (Macro F1: 0.6121). In contrast, the Hindi track results (Macro F1: 0.1616) revealed significant challenges related to script complexity and class imbalance. These findings underscore the effectiveness of global transformer-based models for multimodal reasoning while highlighting the ongoing need for specialized linguistic refinement in low-resource and diverse script environments.</abstract>
<identifier type="citekey">jaishri-etal-2026-susmitha</identifier>
<location>
<url>https://aclanthology.org/2026.ltedi-1.27/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>222</start>
<end>225</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Susmitha@LT-EDI 2026: Detecting LGBTQ+ Phobia in Multilingual Memes via Joint Representation
%A Jaishri, Susmitha
%A Shanmugavadivel, Kogilavani
%A Subramanian, Malliga
%A R, Mouleeshuwarapprabu
%Y Chakravarthi, Bharathi Raja
%Y B, Bharathi
%Y Buitelaar, Paul
%Y Thenmozhi, Durairaj
%Y García Cumbreras, Miguel Ángel
%Y Jiménez Zafra, Salud María
%S Proceedings of the Sixth Workshop on Language Technology for Equality, Diversity, Inclusion
%D 2026
%8 July
%I Association for Computational Linguistics
%C Virtual (Online)
%@ 979-8-89176-424-8
%F jaishri-etal-2026-susmitha
%X The automated detection of LGBTQ+ phobia in social media memes is essential for fostering inclusive digital environments, yet it remains challenging due to the complex interplay of visual metaphors and multilingual text. We participated in the “Homophobia and Transphobia Meme Classification” shared task at LT-EDI 2026, evaluating a multimodal architecture across English, Hindi, and Chinese tracks. Our system employs a late-fusion strategy: XLM-RoBERTa encodes OCR-extracted text into a representation $h_{t} \in \mathbb{R}^{768}$, while CLIP extracts visual features $h_{v} \in \mathbb{R}^{512}$. These are concatenated into a joint vector $z = [h_{t} \oplus h_{v}] \in \mathbb{R}^{1280}$ and processed via a non-linear multilayer perceptron to capture cross-modal interactions. The system demonstrated robust performance in high-resource contexts, securing 3rd rank in the Chinese track (Macro F1: 0.7371) and 4th rank in the English track (Macro F1: 0.6121). In contrast, the Hindi track results (Macro F1: 0.1616) revealed significant challenges related to script complexity and class imbalance. These findings underscore the effectiveness of global transformer-based models for multimodal reasoning while highlighting the ongoing need for specialized linguistic refinement in low-resource and diverse script environments.
%U https://aclanthology.org/2026.ltedi-1.27/
%P 222-225
Markdown (Informal)
[Susmitha@LT-EDI 2026: Detecting LGBTQ+ Phobia in Multilingual Memes via Joint Representation](https://aclanthology.org/2026.ltedi-1.27/) (Jaishri et al., LTEDI 2026)
ACL