@inproceedings{li-etal-2026-measuring,
title = "Measuring Large Language Models' Adversarial Behavior in Social Deduction Games",
author = "Li, Marissa Zhao and
Shivakumar, Esha and
Wang, Peiran and
Li, Ying and
Tian, Yuan",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.2043/",
pages = "41099--41115",
ISBN = "979-8-89176-395-1",
abstract = "As large language models are increasingly adopted and trusted in real-world applications, understanding their behavior beyond single-turn prompting has become critical. Existing safety evaluations primarily focus on refusal-based methods that test whether models avoid responding to inappropriate or violent requests, leaving open questions about how models behave in interactive social settings. In this paper, we observe the adversarial behavior of LLM models through a multi-agent simulation across five diverse social deduction conversational games, acting as testbeds that provide social pressures and survival stress based on game design without explicit prompt injections. From these interactions, we construct a closed behavioral taxonomy derived through open card sorting, applied uniformly across models using a meta-LLM for behavior labeling. This approach displays that models exhibit distinct behavioral profiles and that models' different ways of structured deliberation influence both social stability and competitive success."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2026-measuring">
<titleInfo>
<title>Measuring Large Language Models’ Adversarial Behavior in Social Deduction Games</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marissa</namePart>
<namePart type="given">Zhao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Esha</namePart>
<namePart type="family">Shivakumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peiran</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ying</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuan</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>As large language models are increasingly adopted and trusted in real-world applications, understanding their behavior beyond single-turn prompting has become critical. Existing safety evaluations primarily focus on refusal-based methods that test whether models avoid responding to inappropriate or violent requests, leaving open questions about how models behave in interactive social settings. In this paper, we observe the adversarial behavior of LLM models through a multi-agent simulation across five diverse social deduction conversational games, acting as testbeds that provide social pressures and survival stress based on game design without explicit prompt injections. From these interactions, we construct a closed behavioral taxonomy derived through open card sorting, applied uniformly across models using a meta-LLM for behavior labeling. This approach displays that models exhibit distinct behavioral profiles and that models’ different ways of structured deliberation influence both social stability and competitive success.</abstract>
<identifier type="citekey">li-etal-2026-measuring</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.2043/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>41099</start>
<end>41115</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Measuring Large Language Models’ Adversarial Behavior in Social Deduction Games
%A Li, Marissa Zhao
%A Shivakumar, Esha
%A Wang, Peiran
%A Li, Ying
%A Tian, Yuan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F li-etal-2026-measuring
%X As large language models are increasingly adopted and trusted in real-world applications, understanding their behavior beyond single-turn prompting has become critical. Existing safety evaluations primarily focus on refusal-based methods that test whether models avoid responding to inappropriate or violent requests, leaving open questions about how models behave in interactive social settings. In this paper, we observe the adversarial behavior of LLM models through a multi-agent simulation across five diverse social deduction conversational games, acting as testbeds that provide social pressures and survival stress based on game design without explicit prompt injections. From these interactions, we construct a closed behavioral taxonomy derived through open card sorting, applied uniformly across models using a meta-LLM for behavior labeling. This approach displays that models exhibit distinct behavioral profiles and that models’ different ways of structured deliberation influence both social stability and competitive success.
%U https://aclanthology.org/2026.findings-acl.2043/
%P 41099-41115
Markdown (Informal)
[Measuring Large Language Models’ Adversarial Behavior in Social Deduction Games](https://aclanthology.org/2026.findings-acl.2043/) (Li et al., Findings 2026)
ACL