@inproceedings{wu-etal-2024-rethinking,
title = "Rethinking Pragmatics in Large Language Models: Towards Open-Ended Evaluation and Preference Tuning",
author = "Wu, Shengguang and
Yang, Shusheng and
Chen, Zhenglun and
Su, Qi",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1258",
pages = "22583--22599",
abstract = "This study addresses the challenges of assessing and enhancing social-pragmatic inference in large language models (LLMs). We first highlight the inadequacy of current accuracy-based multiple choice question answering (MCQA) formats in assessing social-pragmatic reasoning, and propose the direct evaluation of models{'} free-form responses as measure, which correlates better with human judgment. Furthermore, we explore methods to improve pragmatic abilities in LLMs, advocating for preference optimization (PO) over supervised finetuning (SFT), given the absence of a definitive {``}gold{''} answer in social contexts. Our results show that preferential tuning consistently outperforms SFT across pragmatic phenomena and offers a near-free launch in pragmatic abilities without compromising general capabilities. Lastly, we examine the internal structure of LLMs, revealing that the significant boost in pragmatic reasoning is tied to deeper layer representations, analogous to human high-level thinking. Our experiments span a variety of pragmatic and social reasoning datasets, as well as an image referential game requiring a multimodal theory of mind (ToM). With our refined paradigms for evaluating and enhancing pragmatic inference, this paper offers key insights into building more socially aware language models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-etal-2024-rethinking">
<titleInfo>
<title>Rethinking Pragmatics in Large Language Models: Towards Open-Ended Evaluation and Preference Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shengguang</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shusheng</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenglun</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study addresses the challenges of assessing and enhancing social-pragmatic inference in large language models (LLMs). We first highlight the inadequacy of current accuracy-based multiple choice question answering (MCQA) formats in assessing social-pragmatic reasoning, and propose the direct evaluation of models’ free-form responses as measure, which correlates better with human judgment. Furthermore, we explore methods to improve pragmatic abilities in LLMs, advocating for preference optimization (PO) over supervised finetuning (SFT), given the absence of a definitive “gold” answer in social contexts. Our results show that preferential tuning consistently outperforms SFT across pragmatic phenomena and offers a near-free launch in pragmatic abilities without compromising general capabilities. Lastly, we examine the internal structure of LLMs, revealing that the significant boost in pragmatic reasoning is tied to deeper layer representations, analogous to human high-level thinking. Our experiments span a variety of pragmatic and social reasoning datasets, as well as an image referential game requiring a multimodal theory of mind (ToM). With our refined paradigms for evaluating and enhancing pragmatic inference, this paper offers key insights into building more socially aware language models.</abstract>
<identifier type="citekey">wu-etal-2024-rethinking</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1258</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>22583</start>
<end>22599</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Rethinking Pragmatics in Large Language Models: Towards Open-Ended Evaluation and Preference Tuning
%A Wu, Shengguang
%A Yang, Shusheng
%A Chen, Zhenglun
%A Su, Qi
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F wu-etal-2024-rethinking
%X This study addresses the challenges of assessing and enhancing social-pragmatic inference in large language models (LLMs). We first highlight the inadequacy of current accuracy-based multiple choice question answering (MCQA) formats in assessing social-pragmatic reasoning, and propose the direct evaluation of models’ free-form responses as measure, which correlates better with human judgment. Furthermore, we explore methods to improve pragmatic abilities in LLMs, advocating for preference optimization (PO) over supervised finetuning (SFT), given the absence of a definitive “gold” answer in social contexts. Our results show that preferential tuning consistently outperforms SFT across pragmatic phenomena and offers a near-free launch in pragmatic abilities without compromising general capabilities. Lastly, we examine the internal structure of LLMs, revealing that the significant boost in pragmatic reasoning is tied to deeper layer representations, analogous to human high-level thinking. Our experiments span a variety of pragmatic and social reasoning datasets, as well as an image referential game requiring a multimodal theory of mind (ToM). With our refined paradigms for evaluating and enhancing pragmatic inference, this paper offers key insights into building more socially aware language models.
%U https://aclanthology.org/2024.emnlp-main.1258
%P 22583-22599
Markdown (Informal)
[Rethinking Pragmatics in Large Language Models: Towards Open-Ended Evaluation and Preference Tuning](https://aclanthology.org/2024.emnlp-main.1258) (Wu et al., EMNLP 2024)
ACL