% BibTeX record for the paper below (citation key: khan-etal-2026-adaptive).
% Case-sensitive words in title/booktitle are protected with whole-word braces.
% The abstract is stored as plain text, without inline font commands.
@inproceedings{khan-etal-2026-adaptive,
  title     = {The Adaptive Interrogator: Detecting {Trojan} {LLMs} in Multi-Agent Systems via Evolved Conversational Strategies},
  author    = {Khan, Rana Muhammad Shahroz and
               Zhang, Ruichen and
               Tan, Zhen and
               Fleming, Charles and
               Chen, Tianlong},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1348/},
  pages     = {27029--27044},
  isbn      = {979-8-89176-395-1},
  abstract  = {While Large Language Model (LLM) safety has focused on single-agent, white-box settings, the adoption of Multi-Agent Systems (MAS) creates a critical blind spot: supply chain vulnerabilities in MAS ecosystems. These systems often rely on third-party agents accessed via black-box APIs, creating risks where attackers can embed hidden triggers to manipulate collective reasoning or outputs. Because internal weights are inaccessible, traditional white-box defenses fail to detect these threats. Consequently, a critical gap exists in auditing these systems for ``Trojan'' agents, i.e., malicious models that behave normally until triggered by specific, often multi-turn, conversational contexts. To bridge this gap, we introduce the Conversational Trojan Unmasking System (CTUS), a black-box auditing framework that leverages an Evolutionary Algorithm (EA) to autonomously expose hidden threats. Drawing on social deduction mechanics, CTUS deploys a ``Judge'' agent to evolve conversational probes that provoke Trojan agents into revealing their malicious nature without alerting benign peers. We validate CTUS across diverse architectures (Llama-2/3, Gemma, Mistral) and attack vectors (word, syntax, semantic, RLHF). Our results demonstrate that CTUS achieves superior detection rates (up to 100\% in specific configurations). Furthermore, we conduct rigorous analyses to confirm the framework's robustness, exhibiting negligible false positives on benign systems and stability across system configurations, establishing CTUS as a scalable safeguard for the multi-agent landscape.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- One MODS record; its ID equals the citekey identifier further down. -->
  <mods ID="khan-etal-2026-adaptive">
    <titleInfo>
      <title>The Adaptive Interrogator: Detecting Trojan LLMs in Multi-Agent Systems via Evolved Conversational Strategies</title>
    </titleInfo>

    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Rana</namePart>
      <namePart type="given">Muhammad</namePart>
      <namePart type="given">Shahroz</namePart>
      <namePart type="family">Khan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ruichen</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhen</namePart>
      <namePart type="family">Tan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Charles</namePart>
      <namePart type="family">Fleming</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tianlong</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the volume this paper appears in, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <!-- Abstract is kept on a single line so its text content is unchanged. -->
    <abstract>While Large Language Model (LLM) safety has focused on single-agent, white-box settings, the adoption of Multi-Agent Systems (MAS) creates a critical blind spot: supply chain vulnerabilities in MAS ecosystems. These systems often rely on third-party agents accessed via black-box APIs, creating risks where attackers can embed hidden triggers to manipulate collective reasoning or outputs. Because internal weights are inaccessible, traditional white-box defenses fail to detect these threats. Consequently, a critical gap exists in auditing these systems for “Trojan” agents, i.e., malicious models that behave normally until triggered by specific, often multi-turn, conversational contexts. To bridge this gap, we introduce the Conversational Trojan Unmasking System (CTUS), a black-box auditing framework that leverages an Evolutionary Algorithm (EA) to autonomously expose hidden threats. Drawing on social deduction mechanics, CTUS deploys a “Judge” agent to evolve conversational probes that provoke Trojan agents into revealing their malicious nature without alerting benign peers. We validate CTUS across diverse architectures (Llama-2/3, Gemma, Mistral) and attack vectors (word, syntax, semantic, RLHF). Our results demonstrate that CTUS achieves superior detection rates (up to 100% in specific configurations). Furthermore, we conduct rigorous analyses to confirm the framework’s robustness, exhibiting negligible false positives on benign systems and stability across system configurations, establishing CTUS as a scalable safeguard for the multi-agent landscape.</abstract>

    <identifier type="citekey">khan-etal-2026-adaptive</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1348/</url>
    </location>

    <!-- Position of the paper within the host volume: issue date and page range. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>27029</start>
        <end>27044</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Adaptive Interrogator: Detecting Trojan LLMs in Multi-Agent Systems via Evolved Conversational Strategies
%A Khan, Rana Muhammad Shahroz
%A Zhang, Ruichen
%A Tan, Zhen
%A Fleming, Charles
%A Chen, Tianlong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F khan-etal-2026-adaptive
%X While Large Language Model (LLM) safety has focused on single-agent, white-box settings, the adoption of Multi-Agent Systems (MAS) creates a critical blind spot: supply chain vulnerabilities in MAS ecosystems. These systems often rely on third-party agents accessed via black-box APIs, creating risks where attackers can embed hidden triggers to manipulate collective reasoning or outputs. Because internal weights are inaccessible, traditional white-box defenses fail to detect these threats. Consequently, a critical gap exists in auditing these systems for “Trojan” agents, i.e., malicious models that behave normally until triggered by specific, often multi-turn, conversational contexts. To bridge this gap, we introduce the Conversational Trojan Unmasking System (CTUS), a black-box auditing framework that leverages an Evolutionary Algorithm (EA) to autonomously expose hidden threats. Drawing on social deduction mechanics, CTUS deploys a “Judge” agent to evolve conversational probes that provoke Trojan agents into revealing their malicious nature without alerting benign peers. We validate CTUS across diverse architectures (Llama-2/3, Gemma, Mistral) and attack vectors (word, syntax, semantic, RLHF). Our results demonstrate that CTUS achieves superior detection rates (up to 100% in specific configurations). Furthermore, we conduct rigorous analyses to confirm the framework’s robustness, exhibiting negligible false positives on benign systems and stability across system configurations, establishing CTUS as a scalable safeguard for the multi-agent landscape.
%U https://aclanthology.org/2026.findings-acl.1348/
%P 27029-27044
Markdown (Informal)
[The Adaptive Interrogator: Detecting Trojan LLMs in Multi-Agent Systems via Evolved Conversational Strategies](https://aclanthology.org/2026.findings-acl.1348/) (Khan et al., Findings 2026)
ACL