% BibTeX record for a paper in the Findings of ACL 2026 proceedings.
% Proper nouns and acronyms in booktitle are brace-protected as whole words
% so that styles which recase titles keep their capitalisation.
@inproceedings{kim-etal-2026-read,
  title     = {Read the Room, Read the Image: Understanding Indirect Speech Acts in Multimodal Visual Contexts},
  author    = {Kim, Jaehee and
               Chung, Ji Hoon and
               Park, Seoyoon and
               Kim, Unsol and
               Park, Kyungwon and
               Kim, JiHak and
               Chen, Yi-Jun and
               Kim, Hansaem},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1556/},
  pages     = {31109--31124},
  isbn      = {979-8-89176-395-1},
  abstract  = {Indirect speech acts (ISAs) require pragmatic reasoning over context, as directive intent cannot be inferred from surface form alone. Prior text-based studies and existing multimodal benchmarks largely overlook this requirement, focusing instead on explicitly encoded context or perceptual recognition, and thus underexplore context-dependent pragmatic understanding{---}particularly in high-context languages such as Korean. We introduce READI, a multimodal benchmark for evaluating ISA understanding through integrated reasoning over visual context and dialogue. READI models graded indirectness grounded in pragmatic theory and formulates the task as vision-based pragmatic question answering (V-PQA), supporting cross-lingual evaluation in English and Korean. Experiments show that even state-of-the-art multimodal models struggle with visually grounded indirect speech acts, with performance declining as indirectness increases, underscoring the need for benchmarks that explicitly target contextual pragmatic reasoning.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey kim-etal-2026-read: the paper itself, with its host proceedings described under relatedItem. -->
  <mods ID="kim-etal-2026-read">
    <titleInfo>
      <title>Read the Room, Read the Image: Understanding Indirect Speech Acts in Multimodal Visual Contexts</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jaehee</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ji</namePart>
      <namePart type="given">Hoon</namePart>
      <namePart type="family">Chung</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Seoyoon</namePart>
      <namePart type="family">Park</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Unsol</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kyungwon</namePart>
      <namePart type="family">Park</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">JiHak</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yi-Jun</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hansaem</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Indirect speech acts (ISAs) require pragmatic reasoning over context, as directive intent cannot be inferred from surface form alone. Prior text-based studies and existing multimodal benchmarks largely overlook this requirement, focusing instead on explicitly encoded context or perceptual recognition, and thus underexplore context-dependent pragmatic understanding—particularly in high-context languages such as Korean. We introduce READI, a multimodal benchmark for evaluating ISA understanding through integrated reasoning over visual context and dialogue. READI models graded indirectness grounded in pragmatic theory and formulates the task as vision-based pragmatic question answering (V-PQA), supporting cross-lingual evaluation in English and Korean. Experiments show that even state-of-the-art multimodal models struggle with visually grounded indirect speech acts, with performance declining as indirectness increases, underscoring the need for benchmarks that explicitly target contextual pragmatic reasoning.</abstract>
    <identifier type="citekey">kim-etal-2026-read</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1556/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>31109</start>
        <end>31124</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Read the Room, Read the Image: Understanding Indirect Speech Acts in Multimodal Visual Contexts
%A Kim, Jaehee
%A Chung, Ji Hoon
%A Park, Seoyoon
%A Kim, Unsol
%A Park, Kyungwon
%A Kim, JiHak
%A Chen, Yi-Jun
%A Kim, Hansaem
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F kim-etal-2026-read
%X Indirect speech acts (ISAs) require pragmatic reasoning over context, as directive intent cannot be inferred from surface form alone. Prior text-based studies and existing multimodal benchmarks largely overlook this requirement, focusing instead on explicitly encoded context or perceptual recognition, and thus underexplore context-dependent pragmatic understanding—particularly in high-context languages such as Korean. We introduce READI, a multimodal benchmark for evaluating ISA understanding through integrated reasoning over visual context and dialogue. READI models graded indirectness grounded in pragmatic theory and formulates the task as vision-based pragmatic question answering (V-PQA), supporting cross-lingual evaluation in English and Korean. Experiments show that even state-of-the-art multimodal models struggle with visually grounded indirect speech acts, with performance declining as indirectness increases, underscoring the need for benchmarks that explicitly target contextual pragmatic reasoning.
%U https://aclanthology.org/2026.findings-acl.1556/
%P 31109-31124
Markdown (Informal)
[Read the Room, Read the Image: Understanding Indirect Speech Acts in Multimodal Visual Contexts](https://aclanthology.org/2026.findings-acl.1556/) (Kim et al., Findings 2026)
ACL
- Jaehee Kim, Ji Hoon Chung, Seoyoon Park, Unsol Kim, Kyungwon Park, JiHak Kim, Yi-Jun Chen, and Hansaem Kim. 2026. Read the Room, Read the Image: Understanding Indirect Speech Acts in Multimodal Visual Contexts. In Findings of the Association for Computational Linguistics: ACL 2026, pages 31109–31124, San Diego, California, United States. Association for Computational Linguistics.