% ACL 2024 short paper (Volume 2: Short Papers). The url slug is the ACL
% Anthology paper ID and matches the DOI suffix (2024.acl-short.26).
@inproceedings{yerukola-etal-2024-pope,
    title = "Is the {Pope} {Catholic}? Yes, the {Pope} is {Catholic}. Generative Evaluation of Non-Literal Intent Resolution in {LLM}s",
    author = "Yerukola, Akhila  and
      Vaduguru, Saujas  and
      Fried, Daniel  and
      Sap, Maarten",
    editor = "Ku, Lun-Wei  and
      Martins, Andre  and
      Srikumar, Vivek",
    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.acl-short.26/",
    doi = "10.18653/v1/2024.acl-short.26",
    pages = "265--275",
    abstract = "Humans often express their communicative intents indirectly or non-literally, which requires their interlocutors{---}human or AI{---}to understand beyond the literal meaning of words. While most existing work has focused on discriminative evaluations, we present a new approach to generatively evaluate large language models' (LLMs') intention understanding by examining their responses to non-literal utterances. Ideally, an LLM should respond in line with the true intention of a non-literal utterance, not its literal interpretation. Our findings show that LLMs struggle to generate contextually relevant responses to non-literal language. We also find that providing oracle intentions substantially improves response appropriateness, but using chain-of-thought to make models spell out intentions before responding improves much less. These findings suggest that LLMs are not yet pragmatic interlocutors, and that explicitly modeling intention could improve LLM responses to non-literal language."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yerukola-etal-2024-pope">
<titleInfo>
<title>Is the Pope Catholic? Yes, the Pope is Catholic. Generative Evaluation of Non-Literal Intent Resolution in LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Akhila</namePart>
<namePart type="family">Yerukola</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saujas</namePart>
<namePart type="family">Vaduguru</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Fried</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maarten</namePart>
<namePart type="family">Sap</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Humans often express their communicative intents indirectly or non-literally, which requires their interlocutors—human or AI—to understand beyond the literal meaning of words. While most existing work has focused on discriminative evaluations, we present a new approach to generatively evaluate large language models’ (LLMs’) intention understanding by examining their responses to non-literal utterances. Ideally, an LLM should respond in line with the true intention of a non-literal utterance, not its literal interpretation. Our findings show that LLMs struggle to generate contextually relevant responses to non-literal language. We also find that providing oracle intentions substantially improves response appropriateness, but using chain-of-thought to make models spell out intentions before responding improves much less. These findings suggest that LLMs are not yet pragmatic interlocutors, and that explicitly modeling intention could improve LLM responses to non-literal language.</abstract>
<identifier type="citekey">yerukola-etal-2024-pope</identifier>
<identifier type="doi">10.18653/v1/2024.acl-short.26</identifier>
<location>
<url>https://aclanthology.org/2024.acl-short.26/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>265</start>
<end>275</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Is the Pope Catholic? Yes, the Pope is Catholic. Generative Evaluation of Non-Literal Intent Resolution in LLMs
%A Yerukola, Akhila
%A Vaduguru, Saujas
%A Fried, Daniel
%A Sap, Maarten
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F yerukola-etal-2024-pope
%X Humans often express their communicative intents indirectly or non-literally, which requires their interlocutors—human or AI—to understand beyond the literal meaning of words. While most existing work has focused on discriminative evaluations, we present a new approach to generatively evaluate large language models’ (LLMs’) intention understanding by examining their responses to non-literal utterances. Ideally, an LLM should respond in line with the true intention of a non-literal utterance, not its literal interpretation. Our findings show that LLMs struggle to generate contextually relevant responses to non-literal language. We also find that providing oracle intentions substantially improves response appropriateness, but using chain-of-thought to make models spell out intentions before responding improves much less. These findings suggest that LLMs are not yet pragmatic interlocutors, and that explicitly modeling intention could improve LLM responses to non-literal language.
%R 10.18653/v1/2024.acl-short.26
%U https://aclanthology.org/2024.acl-short.26/
%U https://doi.org/10.18653/v1/2024.acl-short.26
%P 265-275
Markdown (Informal)
[Is the Pope Catholic? Yes, the Pope is Catholic. Generative Evaluation of Non-Literal Intent Resolution in LLMs](https://aclanthology.org/2024.acl-short.26/) (Yerukola et al., ACL 2024)
ACL