% Workshop paper in the TrustNLP 2026 proceedings (ACL Anthology record, see url).
% Case-sensitive words in title/booktitle are brace-protected as whole words so
% that sentence-casing styles keep them intact.
% NOTE(review): the editor surname "Galystan" is reproduced as found; it may be a
% misspelling of "Galstyan" -- confirm against the proceedings front matter.
@inproceedings{contro-etal-2026-chatbotmanip,
  title     = {{ChatbotManip}: a Dataset to Facilitate Evaluation and Oversight of Manipulative Chatbot Behaviour},
  author    = {Contro, Jack Luigi Henry and
               Deol, Simrat and
               Brandao, Martim and
               He, Yulan},
  editor    = {Chang, Kai-Wei and
               Mehrabi, Ninareh and
               Krishna, Satyapriya and
               Das, Anubrata and
               Dhamala, Jwala and
               Cao, Yang Trista and
               Kumarage, Tharindu and
               Ramakrishna, Anil and
               Christodoulopoulos, Christos and
               Wan, Yixin and
               Galystan, Aram and
               Kumar, Anoop and
               Gupta, Rahul},
  booktitle = {Proceedings of the 6th Workshop on Trustworthy {NLP} ({TrustNLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.trustnlp-main.7/},
  pages     = {92--107},
  isbn      = {979-8-89176-418-7},
  abstract  = {This paper introduces ChatbotManip, a novel dataset for studying manipulation in Chatbots. It contains simulated generated conversations between a chatbot and a (simulated) user, where the chatbot is explicitly asked to showcase manipulation tactics, persuade the user towards some goal, or simply be helpful. We consider a diverse set of chatbot manipulation contexts, from consumer and personal advice to citizen advice and controversial proposition argumentation. Each conversation is annotated by human annotators for both general manipulation and specific manipulation tactics. Our research reveals three key findings. First, Large Language Models (LLMs) can be manipulative when explicitly instructed, with annotators identifying manipulation in approximately 84{\%} of such conversations. Second, even when only instructed to be ``persuasive'' without explicit manipulation prompts, LLMs frequently default to controversial manipulative strategies, particularly Gaslighting and Fear Enhancement. Third, zero-shot larger models such as Gemini 2.5 pro have the best performance in detecting manipulation (of the models tested), with more work required to fine-tune smaller open source models for real-world on-device oversight. Our work provides important insights for AI safety research and highlights the need of addressing manipulation risks as LLMs are increasingly deployed in consumer-facing applications.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record for the paper; its ID equals the citekey identifier further down. -->
  <mods ID="contro-etal-2026-chatbotmanip">

    <!-- Paper title -->
    <titleInfo>
      <title>ChatbotManip: a Dataset to Facilitate Evaluation and Oversight of Manipulative Chatbot Behaviour</title>
    </titleInfo>

    <!-- Paper authors (role term "author") -->
    <name type="personal">
      <namePart type="given">Jack</namePart>
      <namePart type="given">Luigi</namePart>
      <namePart type="given">Henry</namePart>
      <namePart type="family">Contro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Simrat</namePart>
      <namePart type="family">Deol</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Martim</namePart>
      <namePart type="family">Brandao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yulan</namePart>
      <namePart type="family">He</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year-month) -->
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)</title>
      </titleInfo>

      <!-- Volume editors (role term "editor") -->
      <name type="personal">
        <namePart type="given">Kai-Wei</namePart>
        <namePart type="family">Chang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ninareh</namePart>
        <namePart type="family">Mehrabi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Satyapriya</namePart>
        <namePart type="family">Krishna</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anubrata</namePart>
        <namePart type="family">Das</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jwala</namePart>
        <namePart type="family">Dhamala</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="given">Trista</namePart>
        <namePart type="family">Cao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tharindu</namePart>
        <namePart type="family">Kumarage</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anil</namePart>
        <namePart type="family">Ramakrishna</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yixin</namePart>
        <namePart type="family">Wan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <!-- NOTE(review): family name kept as found; possibly a misspelling of "Galstyan" - confirm against the proceedings front matter. -->
      <name type="personal">
        <namePart type="given">Aram</namePart>
        <namePart type="family">Galystan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anoop</namePart>
        <namePart type="family">Kumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rahul</namePart>
        <namePart type="family">Gupta</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>

      <!-- Publisher and place recorded for the host volume -->
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-418-7</identifier>
    </relatedItem>

    <!-- Abstract text, stored as plain Unicode (no LaTeX escapes) -->
    <abstract>This paper introduces ChatbotManip, a novel dataset for studying manipulation in Chatbots. It contains simulated generated conversations between a chatbot and a (simulated) user, where the chatbot is explicitly asked to showcase manipulation tactics, persuade the user towards some goal, or simply be helpful. We consider a diverse set of chatbot manipulation contexts, from consumer and personal advice to citizen advice and controversial proposition argumentation. Each conversation is annotated by human annotators for both general manipulation and specific manipulation tactics. Our research reveals three key findings. First, Large Language Models (LLMs) can be manipulative when explicitly instructed, with annotators identifying manipulation in approximately 84% of such conversations. Second, even when only instructed to be “persuasive” without explicit manipulation prompts, LLMs frequently default to controversial manipulative strategies, particularly Gaslighting and Fear Enhancement. Third, zero-shot larger models such as Gemini 2.5 pro have the best performance in detecting manipulation (of the models tested), with more work required to fine-tune smaller open source models for real-world on-device oversight. Our work provides important insights for AI safety research and highlights the need of addressing manipulation risks as LLMs are increasingly deployed in consumer-facing applications.</abstract>

    <!-- Citation key and landing-page URL -->
    <identifier type="citekey">contro-etal-2026-chatbotmanip</identifier>
    <location>
      <url>https://aclanthology.org/2026.trustnlp-main.7/</url>
    </location>

    <!-- Date and page extent of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>92</start>
        <end>107</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ChatbotManip: a Dataset to Facilitate Evaluation and Oversight of Manipulative Chatbot Behaviour
%A Contro, Jack Luigi Henry
%A Deol, Simrat
%A Brandao, Martim
%A He, Yulan
%Y Chang, Kai-Wei
%Y Mehrabi, Ninareh
%Y Krishna, Satyapriya
%Y Das, Anubrata
%Y Dhamala, Jwala
%Y Cao, Yang Trista
%Y Kumarage, Tharindu
%Y Ramakrishna, Anil
%Y Christodoulopoulos, Christos
%Y Wan, Yixin
%Y Galystan, Aram
%Y Kumar, Anoop
%Y Gupta, Rahul
%S Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-418-7
%F contro-etal-2026-chatbotmanip
%X This paper introduces ChatbotManip, a novel dataset for studying manipulation in Chatbots. It contains simulated generated conversations between a chatbot and a (simulated) user, where the chatbot is explicitly asked to showcase manipulation tactics, persuade the user towards some goal, or simply be helpful. We consider a diverse set of chatbot manipulation contexts, from consumer and personal advice to citizen advice and controversial proposition argumentation. Each conversation is annotated by human annotators for both general manipulation and specific manipulation tactics. Our research reveals three key findings. First, Large Language Models (LLMs) can be manipulative when explicitly instructed, with annotators identifying manipulation in approximately 84% of such conversations. Second, even when only instructed to be “persuasive” without explicit manipulation prompts, LLMs frequently default to controversial manipulative strategies, particularly Gaslighting and Fear Enhancement. Third, zero-shot larger models such as Gemini 2.5 pro have the best performance in detecting manipulation (of the models tested), with more work required to fine-tune smaller open source models for real-world on-device oversight. Our work provides important insights for AI safety research and highlights the need of addressing manipulation risks as LLMs are increasingly deployed in consumer-facing applications.
%U https://aclanthology.org/2026.trustnlp-main.7/
%P 92-107
Markdown (Informal)
[ChatbotManip: a Dataset to Facilitate Evaluation and Oversight of Manipulative Chatbot Behaviour](https://aclanthology.org/2026.trustnlp-main.7/) (Contro et al., TrustNLP 2026)
ACL