@inproceedings{lee-etal-2026-entrust,
title = "Can We Entrust Justice to {AI}?: How Persona Traps Contaminate Reasoning in Criminal Investigation",
author = "Lee, Jaewook and
Kang, Myeong-Cheol and
Shin, Jong-hun",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.843/",
pages = "17083--17103",
ISBN = "979-8-89176-395-1",
abstract = "If large language models (LLMs) are deployed to analyze evidence and evaluate suspects in criminal investigations, are they free from the very trap that has led countless human investigators to misjudgment{---}implicit bias swayed by information irrelevant to the essence of the case? To answer this question, this study systematically injected personas (gender, race, relationship) into neutralized murder mystery scenarios and examined the reasoning stability of LLMs. Experimental results revealed that implicit bias propagation was observed across all models. The phenomenon where models outwardly state ``that information is irrelevant to the judgment'' while their actual conclusions are already influenced by the injected persona was universally observed. Interestingly, model scale alone did not guarantee stability: while the largest model achieved the lowest instability, several smaller models outperformed much larger ones. The most notable finding concerns the differential vulnerability across persona types: while race and gender were processed relatively stably, relationship information{---}particularly hostile relationships{---}induced significantly higher reasoning contamination. More concerning is the fact that even when conclusions were correctly maintained, the reasoning process itself was extensively contaminated. These findings suggest that current alignment techniques have created a blind spot by focusing on identity-based bias while neglecting relationship-based bias, and propose that stability evaluation should encompass not only outputs but also reasoning processes."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lee-etal-2026-entrust">
<titleInfo>
<title>Can We Entrust Justice to AI?: How Persona Traps Contaminate Reasoning in Criminal Investigation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jaewook</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Myeong-Cheol</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jong-hun</namePart>
<namePart type="family">Shin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>If large language models (LLMs) are deployed to analyze evidence and evaluate suspects in criminal investigations, are they free from the very trap that has led countless human investigators to misjudgment—implicit bias swayed by information irrelevant to the essence of the case? To answer this question, this study systematically injected personas (gender, race, relationship) into neutralized murder mystery scenarios and examined the reasoning stability of LLMs. Experimental results revealed that implicit bias propagation was observed across all models. The phenomenon where models outwardly state “that information is irrelevant to the judgment” while their actual conclusions are already influenced by the injected persona was universally observed. Interestingly, model scale alone did not guarantee stability: while the largest model achieved the lowest instability, several smaller models outperformed much larger ones. The most notable finding concerns the differential vulnerability across persona types: while race and gender were processed relatively stably, relationship information—particularly hostile relationships—induced significantly higher reasoning contamination. More concerning is the fact that even when conclusions were correctly maintained, the reasoning process itself was extensively contaminated. These findings suggest that current alignment techniques have created a blind spot by focusing on identity-based bias while neglecting relationship-based bias, and propose that stability evaluation should encompass not only outputs but also reasoning processes.</abstract>
<identifier type="citekey">lee-etal-2026-entrust</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.843/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>17083</start>
<end>17103</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can We Entrust Justice to AI?: How Persona Traps Contaminate Reasoning in Criminal Investigation
%A Lee, Jaewook
%A Kang, Myeong-Cheol
%A Shin, Jong-hun
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F lee-etal-2026-entrust
%X If large language models (LLMs) are deployed to analyze evidence and evaluate suspects in criminal investigations, are they free from the very trap that has led countless human investigators to misjudgment—implicit bias swayed by information irrelevant to the essence of the case? To answer this question, this study systematically injected personas (gender, race, relationship) into neutralized murder mystery scenarios and examined the reasoning stability of LLMs. Experimental results revealed that implicit bias propagation was observed across all models. The phenomenon where models outwardly state “that information is irrelevant to the judgment” while their actual conclusions are already influenced by the injected persona was universally observed. Interestingly, model scale alone did not guarantee stability: while the largest model achieved the lowest instability, several smaller models outperformed much larger ones. The most notable finding concerns the differential vulnerability across persona types: while race and gender were processed relatively stably, relationship information—particularly hostile relationships—induced significantly higher reasoning contamination. More concerning is the fact that even when conclusions were correctly maintained, the reasoning process itself was extensively contaminated. These findings suggest that current alignment techniques have created a blind spot by focusing on identity-based bias while neglecting relationship-based bias, and propose that stability evaluation should encompass not only outputs but also reasoning processes.
%U https://aclanthology.org/2026.findings-acl.843/
%P 17083-17103
Markdown (Informal)
[Can We Entrust Justice to AI?: How Persona Traps Contaminate Reasoning in Criminal Investigation](https://aclanthology.org/2026.findings-acl.843/) (Lee et al., Findings 2026)
ACL