% Conference paper in the ACL 2026 proceedings (Volume 1: Long Papers); cite key: wu-etal-2026-user.
@inproceedings{wu-etal-2026-user,
    title = "User Perceptions vs. Proxy {LLM} Judges: Privacy and Helpfulness in {LLM} Responses to Privacy-Sensitive Scenarios",
    author = "Wu, Xiaoyuan and
      Kaushik, Roshni and
      Li, Wenkai and
      Bauer, Lujo and
      Onoue, Koichi",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.acl-long.1645/",
    pages = "35562--35579",
    isbn = "979-8-89176-390-6",
    abstract = "Large language models (LLMs) are rapidly being adopted for tasks like drafting emails, summarizing meetings, and answering health questions. In these settings, users may need to share private information (e.g., contact details, health records). To evaluate LLMs' ability to identify and redact such information, prior work introduced real-life, scenario-based benchmarks (e.g., ConfAIde, PrivacyLens) and found that LLMs can leak private information in complex scenarios. However, these evaluations relied on proxy LLMs to judge the helpfulness and privacy-preservation quality of LLM responses, rather than directly measuring users' perceptions. To understand how users perceive the helpfulness and privacy-preservation quality of LLM responses to privacy-sensitive scenarios, we conducted a user study ($n=94$) using 90 PrivacyLens scenarios. We found that users had low agreement with each other when evaluating identical LLM responses. In contrast, five proxy LLMs reached high agreement, yet each proxy LLM had low correlation with users' evaluations. These results indicate that proxy LLMs cannot accurately estimate users' wide range of perceptions of utility and privacy in privacy-sensitive scenarios. We discuss the need for more user-centered studies to measure LLMs' ability to help users while preserving privacy, and for improving alignment between LLMs and users in estimating perceived privacy and utility."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-etal-2026-user">
<titleInfo>
<title>User Perceptions vs. Proxy LLM Judges: Privacy and Helpfulness in LLM Responses to Privacy-Sensitive Scenarios</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiaoyuan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roshni</namePart>
<namePart type="family">Kaushik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenkai</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lujo</namePart>
<namePart type="family">Bauer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichi</namePart>
<namePart type="family">Onoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Large language models (LLMs) are rapidly being adopted for tasks like drafting emails, summarizing meetings, and answering health questions. In these settings, users may need to share private information (e.g., contact details, health records). To evaluate LLMs’ ability to identify and redact such information, prior work introduced real-life, scenario-based benchmarks (e.g., ConfAIde, PrivacyLens) and found that LLMs can leak private information in complex scenarios. However, these evaluations relied on proxy LLMs to judge the helpfulness and privacy-preservation quality of LLM responses, rather than directly measuring users’ perceptions. To understand how users perceive the helpfulness and privacy-preservation quality of LLM responses to privacy-sensitive scenarios, we conducted a user study (n=94) using 90 PrivacyLens scenarios. We found that users had low agreement with each other when evaluating identical LLM responses. In contrast, five proxy LLMs reached high agreement, yet each proxy LLM had low correlation with users’ evaluations. These results indicate that proxy LLMs cannot accurately estimate users’ wide range of perceptions of utility and privacy in privacy-sensitive scenarios. We discuss the need for more user-centered studies to measure LLMs’ ability to help users while preserving privacy, and for improving alignment between LLMs and users in estimating perceived privacy and utility.</abstract>
<identifier type="citekey">wu-etal-2026-user</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.1645/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>35562</start>
<end>35579</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T User Perceptions vs. Proxy LLM Judges: Privacy and Helpfulness in LLM Responses to Privacy-Sensitive Scenarios
%A Wu, Xiaoyuan
%A Kaushik, Roshni
%A Li, Wenkai
%A Bauer, Lujo
%A Onoue, Koichi
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F wu-etal-2026-user
%X Large language models (LLMs) are rapidly being adopted for tasks like drafting emails, summarizing meetings, and answering health questions. In these settings, users may need to share private information (e.g., contact details, health records). To evaluate LLMs’ ability to identify and redact such information, prior work introduced real-life, scenario-based benchmarks (e.g., ConfAIde, PrivacyLens) and found that LLMs can leak private information in complex scenarios. However, these evaluations relied on proxy LLMs to judge the helpfulness and privacy-preservation quality of LLM responses, rather than directly measuring users’ perceptions. To understand how users perceive the helpfulness and privacy-preservation quality of LLM responses to privacy-sensitive scenarios, we conducted a user study (n=94) using 90 PrivacyLens scenarios. We found that users had low agreement with each other when evaluating identical LLM responses. In contrast, five proxy LLMs reached high agreement, yet each proxy LLM had low correlation with users’ evaluations. These results indicate that proxy LLMs cannot accurately estimate users’ wide range of perceptions of utility and privacy in privacy-sensitive scenarios. We discuss the need for more user-centered studies to measure LLMs’ ability to help users while preserving privacy, and for improving alignment between LLMs and users in estimating perceived privacy and utility.
%U https://aclanthology.org/2026.acl-long.1645/
%P 35562-35579
Markdown (Informal)
[User Perceptions vs. Proxy LLM Judges: Privacy and Helpfulness in LLM Responses to Privacy-Sensitive Scenarios](https://aclanthology.org/2026.acl-long.1645/) (Wu et al., ACL 2026)
ACL