@inproceedings{goel-etal-2025-target,
title = "Target-Augmented Shared Fusion-based Multimodal Sarcasm Explanation Generation",
author = "Goel, Palaash and
Chauhan, Dushyant Singh and
Akhtar, Md Shad",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.472/",
doi = "10.18653/v1/2025.findings-naacl.472",
pages = "8480--8493",
ISBN = "979-8-89176-195-7",
abstract = "Sarcasm is a linguistic phenomenon that intends to ridicule a target (e.g., entity, event, or person) in an inherent way. Multimodal Sarcasm Explanation (MuSE) aims at revealing the intended irony in a sarcastic post using a natural language explanation. Though important, existing systems overlooked the significance of the target of sarcasm in generating explanations. In this paper, we propose a \textbf{T}arget-a\textbf{U}gmented sha\textbf{R}ed fusion-\textbf{B}ased sarcasm explanati\textbf{O}n model, aka. TURBO. We design a novel shared-fusion mechanism to leverage the inter-modality relationships between an image and its caption. TURBO assumes the target of the sarcasm and guides the multimodal shared fusion mechanism in learning intricacies of the intended irony for explanations. We evaluate our proposed TURBO model on the MORE+ dataset. Comparison against multiple baselines and state-of-the-art models signifies the performance improvement of TURBO by an average margin of +3.3{\%}. Moreover, we explore LLMs in zero and one-shot settings for our task and observe that LLM-generated explanation, though remarkable, often fails to capture the critical nuances of the sarcasm. Furthermore, we supplement our study with extensive human mevaluation on TURBO{'}s generated explanations and find them out to be comparatively better than other systems."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="goel-etal-2025-target">
<titleInfo>
<title>Target-Augmented Shared Fusion-based Multimodal Sarcasm Explanation Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Palaash</namePart>
<namePart type="family">Goel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dushyant</namePart>
<namePart type="given">Singh</namePart>
<namePart type="family">Chauhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Shad</namePart>
<namePart type="family">Akhtar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Sarcasm is a linguistic phenomenon that intends to ridicule a target (e.g., entity, event, or person) in an inherent way. Multimodal Sarcasm Explanation (MuSE) aims at revealing the intended irony in a sarcastic post using a natural language explanation. Though important, existing systems overlooked the significance of the target of sarcasm in generating explanations. In this paper, we propose a Target-aUgmented shaRed fusion-Based sarcasm explanatiOn model, aka. TURBO. We design a novel shared-fusion mechanism to leverage the inter-modality relationships between an image and its caption. TURBO assumes the target of the sarcasm and guides the multimodal shared fusion mechanism in learning intricacies of the intended irony for explanations. We evaluate our proposed TURBO model on the MORE+ dataset. Comparison against multiple baselines and state-of-the-art models signifies the performance improvement of TURBO by an average margin of +3.3%. Moreover, we explore LLMs in zero and one-shot settings for our task and observe that LLM-generated explanations, though remarkable, often fail to capture the critical nuances of the sarcasm. Furthermore, we supplement our study with extensive human evaluation of TURBO’s generated explanations and find them to be comparatively better than those of other systems.</abstract>
<identifier type="citekey">goel-etal-2025-target</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.472</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.472/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>8480</start>
<end>8493</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Target-Augmented Shared Fusion-based Multimodal Sarcasm Explanation Generation
%A Goel, Palaash
%A Chauhan, Dushyant Singh
%A Akhtar, Md Shad
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F goel-etal-2025-target
%X Sarcasm is a linguistic phenomenon that intends to ridicule a target (e.g., entity, event, or person) in an inherent way. Multimodal Sarcasm Explanation (MuSE) aims at revealing the intended irony in a sarcastic post using a natural language explanation. Though important, existing systems overlooked the significance of the target of sarcasm in generating explanations. In this paper, we propose a Target-aUgmented shaRed fusion-Based sarcasm explanatiOn model, aka. TURBO. We design a novel shared-fusion mechanism to leverage the inter-modality relationships between an image and its caption. TURBO assumes the target of the sarcasm and guides the multimodal shared fusion mechanism in learning intricacies of the intended irony for explanations. We evaluate our proposed TURBO model on the MORE+ dataset. Comparison against multiple baselines and state-of-the-art models signifies the performance improvement of TURBO by an average margin of +3.3%. Moreover, we explore LLMs in zero and one-shot settings for our task and observe that LLM-generated explanations, though remarkable, often fail to capture the critical nuances of the sarcasm. Furthermore, we supplement our study with extensive human evaluation of TURBO’s generated explanations and find them to be comparatively better than those of other systems.
%R 10.18653/v1/2025.findings-naacl.472
%U https://aclanthology.org/2025.findings-naacl.472/
%U https://doi.org/10.18653/v1/2025.findings-naacl.472
%P 8480-8493
Markdown (Informal)
[Target-Augmented Shared Fusion-based Multimodal Sarcasm Explanation Generation](https://aclanthology.org/2025.findings-naacl.472/) (Goel et al., Findings 2025)
ACL
Palaash Goel, Dushyant Singh Chauhan, and Md Shad Akhtar. 2025. Target-Augmented Shared Fusion-based Multimodal Sarcasm Explanation Generation. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 8480–8493, Albuquerque, New Mexico. Association for Computational Linguistics.