% ACL Anthology export for one paper: the BibTeX entry comes first; MODS XML,
% EndNote, and plain-text renderings of the same record follow it.
@inproceedings{mahato-etal-2026-llms,
    title     = {{LLMs} in Sarcasm Detection? It's elementary! (Or is it?)},
    author    = {Mahato, Priyanshu and
                 Mishra, Aniket Santosh and
                 Ghosh, Kripabandhu},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.acl-long.1346/},
    pages     = {29169--29199},
    isbn      = {979-8-89176-390-6},
    abstract  = {While Large Language Models (LLMs) are frequently cited for their sophisticated pragmatic reasoning (CITATION), recent progress in sarcasm detection increasingly relies on synthetic benchmarks (CITATION). This study exposes a catastrophic generalization gap in this paradigm: we observe that models achieve near-perfect accuracy on synthetic data but collapse to random guessing on organic human speech. By triangulating hidden state geometry, entropy analysis, and causal interventions, we demonstrate that this disparity stems from shortcut learning (CITATION){---}models exploit the low-entropy statistical signatures of generated text while remaining ``semantically blind'' to the pragmatic cues essential for irony. Our findings indicate that high performance on synthetic leaderboards reflects forensic pattern matching rather than the genuine linguistic intelligence assumed in prior work, creating a statistical mirage of competence.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="mahato-etal-2026-llms">
    <titleInfo>
      <title>LLMs in Sarcasm Detection? It’s elementary! (Or is it?)</title>
    </titleInfo>

    <!-- Paper authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Priyanshu</namePart>
      <namePart type="family">Mahato</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aniket</namePart>
      <namePart type="given">Santosh</namePart>
      <namePart type="family">Mishra</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kripabandhu</namePart>
      <namePart type="family">Ghosh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>

    <abstract>While Large Language Models (LLMs) are frequently cited for their sophisticated pragmatic reasoning (CITATION), recent progress in sarcasm detection increasingly relies on synthetic benchmarks (CITATION). This study exposes a catastrophic generalization gap in this paradigm: we observe that models achieve near-perfect accuracy on synthetic data but collapse to random guessing on organic human speech. By triangulating hidden state geometry, entropy analysis, and causal interventions, we demonstrate that this disparity stems from shortcut learning (CITATION)—models exploit the low-entropy statistical signatures of generated text while remaining “semantically blind” to the pragmatic cues essential for irony. Our findings indicate that high performance on synthetic leaderboards reflects forensic pattern matching rather than the genuine linguistic intelligence assumed in prior work, creating a statistical mirage of competence.</abstract>
    <identifier type="citekey">mahato-etal-2026-llms</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.1346/</url>
    </location>

    <!-- Position of the paper inside the host volume: issue date and page extent. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>29169</start>
        <end>29199</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T LLMs in Sarcasm Detection? It’s elementary! (Or is it?)
%A Mahato, Priyanshu
%A Mishra, Aniket Santosh
%A Ghosh, Kripabandhu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F mahato-etal-2026-llms
%X While Large Language Models (LLMs) are frequently cited for their sophisticated pragmatic reasoning (CITATION), recent progress in sarcasm detection increasingly relies on synthetic benchmarks (CITATION). This study exposes a catastrophic generalization gap in this paradigm: we observe that models achieve near-perfect accuracy on synthetic data but collapse to random guessing on organic human speech. By triangulating hidden state geometry, entropy analysis, and causal interventions, we demonstrate that this disparity stems from shortcut learning (CITATION)—models exploit the low-entropy statistical signatures of generated text while remaining “semantically blind” to the pragmatic cues essential for irony. Our findings indicate that high performance on synthetic leaderboards reflects forensic pattern matching rather than the genuine linguistic intelligence assumed in prior work, creating a statistical mirage of competence.
%U https://aclanthology.org/2026.acl-long.1346/
%P 29169-29199
Markdown (Informal)
[LLMs in Sarcasm Detection? It’s elementary! (Or is it?)](https://aclanthology.org/2026.acl-long.1346/) (Mahato et al., ACL 2026)
ACL
- Priyanshu Mahato, Aniket Santosh Mishra, and Kripabandhu Ghosh. 2026. LLMs in Sarcasm Detection? It’s elementary! (Or is it?). In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 29169–29199, San Diego, California, United States. Association for Computational Linguistics.