% Conference paper record for SycoBench-600 (Findings of ACL 2026),
% as listed at aclanthology.org under 2026.findings-acl.1759.
@inproceedings{sinha-2026-sycobench,
  title     = {{SycoBench-600}: Measuring Sycophancy and Correction Selectivity in {LLM} Assistants},
  author    = {Sinha, Debu},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association for Computational Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1759/},
  pages     = {35278--35284},
  isbn      = {979-8-89176-395-1},
  abstract  = {Modern instruction-following language models are optimized to be helpful and cooperative, often through preference-based alignment such as RLHF and related methods. A growing body of evidence shows that this training can also induce sycophancy: models may agree with a user even when the user is wrong, undermining reliability in decision support and high-stakes advice. We introduce SycoBench-600, a controlled multiple-choice benchmark that measures (i) susceptibility to three social-pressure perturbations (doubt, authority, and an explicit wrong suggestion) and (ii) correction selectivity, the ability to accept correct suggestions while resisting incorrect ones. The released benchmark contains 600 English MCQ instances over 272 normalized question stems, covers 8 domains and 3 difficulty tiers, and evaluates each instance under 3 fixed paraphrase variants of the perturbation prompts. We evaluate seven widely used assistants spanning proprietary and open-weight families. Results show substantial variation in pressure robustness and selective updating, and further show that willingness to update does not by itself imply selectivity. We release raw logs, validation scripts, and code that regenerates every table and figure from the model outputs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS rendering of the record whose citekey is sinha-2026-sycobench. -->
  <mods ID="sinha-2026-sycobench">
    <titleInfo>
      <title>SycoBench-600: Measuring Sycophancy and Correction Selectivity in LLM Assistants</title>
    </titleInfo>
    <!-- Paper author. -->
    <name type="personal">
      <namePart type="given">Debu</namePart>
      <namePart type="family">Sinha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Modern instruction-following language models are optimized to be helpful and cooperative, often through preference-based alignment such as RLHF and related methods. A growing body of evidence shows that this training can also induce sycophancy: models may agree with a user even when the user is wrong, undermining reliability in decision support and high-stakes advice. We introduce SycoBench-600, a controlled multiple-choice benchmark that measures (i) susceptibility to three social-pressure perturbations (doubt, authority, and an explicit wrong suggestion) and (ii) correction selectivity, the ability to accept correct suggestions while resisting incorrect ones. The released benchmark contains 600 English MCQ instances over 272 normalized question stems, covers 8 domains and 3 difficulty tiers, and evaluates each instance under 3 fixed paraphrase variants of the perturbation prompts. We evaluate seven widely used assistants spanning proprietary and open-weight families. Results show substantial variation in pressure robustness and selective updating, and further show that willingness to update does not by itself imply selectivity. We release raw logs, validation scripts, and code that regenerates every table and figure from the model outputs.</abstract>
    <identifier type="citekey">sinha-2026-sycobench</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1759/</url>
    </location>
    <!-- Page extent of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>35278</start>
        <end>35284</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T SycoBench-600: Measuring Sycophancy and Correction Selectivity in LLM Assistants
%A Sinha, Debu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F sinha-2026-sycobench
%X Modern instruction-following language models are optimized to be helpful and cooperative, often through preference-based alignment such as RLHF and related methods. A growing body of evidence shows that this training can also induce sycophancy: models may agree with a user even when the user is wrong, undermining reliability in decision support and high-stakes advice. We introduce SycoBench-600, a controlled multiple-choice benchmark that measures (i) susceptibility to three social-pressure perturbations (doubt, authority, and an explicit wrong suggestion) and (ii) correction selectivity, the ability to accept correct suggestions while resisting incorrect ones. The released benchmark contains 600 English MCQ instances over 272 normalized question stems, covers 8 domains and 3 difficulty tiers, and evaluates each instance under 3 fixed paraphrase variants of the perturbation prompts. We evaluate seven widely used assistants spanning proprietary and open-weight families. Results show substantial variation in pressure robustness and selective updating, and further show that willingness to update does not by itself imply selectivity. We release raw logs, validation scripts, and code that regenerates every table and figure from the model outputs.
%U https://aclanthology.org/2026.findings-acl.1759/
%P 35278-35284
Markdown (Informal)
[SycoBench-600: Measuring Sycophancy and Correction Selectivity in LLM Assistants](https://aclanthology.org/2026.findings-acl.1759/) (Sinha, Findings 2026)
ACL