@inproceedings{zhou-etal-2025-rel,
title = "{REL}-{A}.{I}.: An Interaction-Centered Approach To Measuring Human-{LM} Reliance",
author = "Zhou, Kaitlyn and
Hwang, Jena D. and
Ren, Xiang and
Dziri, Nouha and
Jurafsky, Dan and
Sap, Maarten",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.556/",
doi = "10.18653/v1/2025.naacl-long.556",
pages = "11148--11167",
ISBN = "979-8-89176-189-6",
abstract = "The ability to communicate uncertainty and knowledge limitations is crucial for the safety of large language models (LLMs). Current evaluations of these abilities typically examine the correspondence between model accuracy and its internal probabilities or linguistic outputs. However, evaluation of the uncertainty of LLM communication should also focus on the behaviors of their human interlocutors: how much do users rely on what the LLM says? We introduce an interaction-centered evaluation approach called Rel-A.I. (pronounced ``rely'') that quantifies whether and how humans rely on LLMs' responses, complementing existing calibration evaluations. Through nine user studies with 450 participants, we investigate three crucial aspects that influence user reliance. We show that emphatic expressions of politeness (e.g., ``I{'}m happy to help!'') that precede LLM answers will cause participants to perceive these models as more competent, and in turn, rely 30{\%} more on their generations. Additionally, the context of the interaction, such as the knowledge domain and nature of previous interactions with the LLM, substantially influences user reliance (e.g., users will rely 10{\%} more on LLMs when responding to questions involving calculations). Our results show that calibration and language quality alone are insufficient in informing which LLMs are safely calibrated, and illustrate the need to consider features of the interactional context."
}