% Conference-paper record (ACL Anthology id 2025.naacl-long.412). Text outside an entry is ignored by BibTeX.
@inproceedings{stengel-eskin-etal-2025-teaching,
  title     = {Teaching Models to Balance Resisting and Accepting Persuasion},
  author    = {Stengel-Eskin, Elias and Hase, Peter and Bansal, Mohit},
  editor    = {Chiruzzo, Luis and Ritter, Alan and Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-long.412/},
  doi       = {10.18653/v1/2025.naacl-long.412},
  pages     = {8108--8122},
  isbn      = {979-8-89176-189-6},
  abstract  = {Large language models (LLMs) are susceptible to persuasion, which can pose risks when models are faced with an adversarial interlocutor. We take a first step towards defending models against persuasion while also arguing that defense against adversarial (i.e. *negative*) persuasion is only half of the equation: models should also be able to accept beneficial (i.e. *positive*) persuasion to improve their answers. We show that optimizing models for only one side results in poor performance on the other. In order to balance positive and negative persuasion, we introduce **P**ersuasion-**B**alanced **T**raining (or **PBT**), which leverages multi-agent recursive dialogue trees to create data and trains models via preference optimization to accept persuasion *when appropriate*. PBT allows us to use data generated from dialogues between smaller 7-8B models for training much larger 70B models. Moreover, PBT consistently improves resistance to misinformation and resilience to being challenged while also resulting in the best overall performance on holistic data containing both positive and negative persuasion. Crucially, we show that PBT models are better teammates in multi-agent debates across two domains (trivia and commonsense QA). We find that without PBT, pairs of stronger and weaker models have unstable performance, with the order in which the models present their answers determining whether the team obtains the stronger or weaker model{'}s performance. PBT leads to better and more stable results and less order dependence, with the stronger model consistently pulling the weaker one up.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record describing the paper; the host relatedItem describes the proceedings volume. -->
  <mods ID="stengel-eskin-etal-2025-teaching">
    <titleInfo>
      <title>Teaching Models to Balance Resisting and Accepting Persuasion</title>
    </titleInfo>
    <!-- Paper authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Elias</namePart>
      <namePart type="family">Stengel-Eskin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Peter</namePart>
      <namePart type="family">Hase</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohit</namePart>
      <namePart type="family">Bansal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Containing publication: proceedings title, editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-189-6</identifier>
    </relatedItem>
    <abstract>Large language models (LLMs) are susceptible to persuasion, which can pose risks when models are faced with an adversarial interlocutor. We take a first step towards defending models against persuasion while also arguing that defense against adversarial (i.e. *negative*) persuasion is only half of the equation: models should also be able to accept beneficial (i.e. *positive*) persuasion to improve their answers. We show that optimizing models for only one side results in poor performance on the other. In order to balance positive and negative persuasion, we introduce **P**ersuasion-**B**alanced **T**raining (or **PBT**), which leverages multi-agent recursive dialogue trees to create data and trains models via preference optimization to accept persuasion *when appropriate*. PBT allows us to use data generated from dialogues between smaller 7-8B models for training much larger 70B models. Moreover, PBT consistently improves resistance to misinformation and resilience to being challenged while also resulting in the best overall performance on holistic data containing both positive and negative persuasion. Crucially, we show that PBT models are better teammates in multi-agent debates across two domains (trivia and commonsense QA). We find that without PBT, pairs of stronger and weaker models have unstable performance, with the order in which the models present their answers determining whether the team obtains the stronger or weaker model’s performance. PBT leads to better and more stable results and less order dependence, with the stronger model consistently pulling the weaker one up.</abstract>
    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">stengel-eskin-etal-2025-teaching</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-long.412</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-long.412/</url>
    </location>
    <!-- Page range of the paper. -->
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>8108</start>
        <end>8122</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Teaching Models to Balance Resisting and Accepting Persuasion
%A Stengel-Eskin, Elias
%A Hase, Peter
%A Bansal, Mohit
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F stengel-eskin-etal-2025-teaching
%X Large language models (LLMs) are susceptible to persuasion, which can pose risks when models are faced with an adversarial interlocutor. We take a first step towards defending models against persuasion while also arguing that defense against adversarial (i.e. *negative*) persuasion is only half of the equation: models should also be able to accept beneficial (i.e. *positive*) persuasion to improve their answers. We show that optimizing models for only one side results in poor performance on the other. In order to balance positive and negative persuasion, we introduce **P**ersuasion-**B**alanced **T**raining (or **PBT**), which leverages multi-agent recursive dialogue trees to create data and trains models via preference optimization to accept persuasion *when appropriate*. PBT allows us to use data generated from dialogues between smaller 7-8B models for training much larger 70B models. Moreover, PBT consistently improves resistance to misinformation and resilience to being challenged while also resulting in the best overall performance on holistic data containing both positive and negative persuasion. Crucially, we show that PBT models are better teammates in multi-agent debates across two domains (trivia and commonsense QA). We find that without PBT, pairs of stronger and weaker models have unstable performance, with the order in which the models present their answers determining whether the team obtains the stronger or weaker model’s performance. PBT leads to better and more stable results and less order dependence, with the stronger model consistently pulling the weaker one up.
%R 10.18653/v1/2025.naacl-long.412
%U https://aclanthology.org/2025.naacl-long.412/
%U https://doi.org/10.18653/v1/2025.naacl-long.412
%P 8108-8122
Markdown (Informal)
[Teaching Models to Balance Resisting and Accepting Persuasion](https://aclanthology.org/2025.naacl-long.412/) (Stengel-Eskin et al., NAACL 2025)
ACL
- Elias Stengel-Eskin, Peter Hase, and Mohit Bansal. 2025. Teaching Models to Balance Resisting and Accepting Persuasion. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 8108–8122, Albuquerque, New Mexico. Association for Computational Linguistics.