@inproceedings{kim-cho-2025-goodliar,
title = "{GOODLIAR}: A Reinforcement Learning-Based Deceptive Agent for Disrupting {LLM} Beliefs on Foundational Principles",
author = "Kim, Soo Kyung and
Cho, Hyunsoo",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.160/",
doi = "10.18653/v1/2025.findings-acl.160",
pages = "3076--3101",
ISBN = "979-8-89176-256-5",
abstract = "Large Language Models (LLMs) often succumb to adversarial prompts, a phenomenon popularly known as ``jailbreaking.'' While jailbreaking primarily targets short-term noncompliance with predefined policies, we argue that a deeper vulnerability lies in altering an LLM{'}s \textit{fundamental axiomatic beliefs}, such as mathematical or philosophical truths. In this work, we introduce GoodLiar, a reinforcement learning (RL)-based framework that generates deceptive contexts to systematically \textit{rewrite} an LLM{'}s core logical or philosophical understandings. By incentivizing an RL agent to produce persuasive and coherent arguments, GoodLiar aims to induce \textit{persistent} belief shifts, rather than merely influencing immediate judgments of factual truthfulness. {\%}rather than one-off policy breaches. Our approach introduces \textit{DA-ILQL}, a novel offline RL method that extends ILQL by integrating on-policy data and language exploration to enhance the language discovery and optimization. Through extensive evaluations on multiple LLMs, we show that deceptive contexts discovered by GoodLiar consistently outperform simple multi-turn prompting methods."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kim-cho-2025-goodliar">
    <titleInfo>
      <title>GOODLIAR: A Reinforcement Learning-Based Deceptive Agent for Disrupting LLM Beliefs on Foundational Principles</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Soo</namePart>
      <namePart type="given">Kyung</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hyunsoo</namePart>
      <namePart type="family">Cho</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>
    <abstract>Large Language Models (LLMs) often succumb to adversarial prompts, a phenomenon popularly known as “jailbreaking.” While jailbreaking primarily targets short-term noncompliance with predefined policies, we argue that a deeper vulnerability lies in altering an LLM’s fundamental axiomatic beliefs, such as mathematical or philosophical truths. In this work, we introduce GoodLiar, a reinforcement learning (RL)-based framework that generates deceptive contexts to systematically rewrite an LLM’s core logical or philosophical understandings. By incentivizing an RL agent to produce persuasive and coherent arguments, GoodLiar aims to induce persistent belief shifts, rather than merely influencing immediate judgments of factual truthfulness. Our approach introduces DA-ILQL, a novel offline RL method that extends ILQL by integrating on-policy data and language exploration to enhance language discovery and optimization. Through extensive evaluations on multiple LLMs, we show that deceptive contexts discovered by GoodLiar consistently outperform simple multi-turn prompting methods.</abstract>
<identifier type="citekey">kim-cho-2025-goodliar</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.160</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.160/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>3076</start>
<end>3101</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GOODLIAR: A Reinforcement Learning-Based Deceptive Agent for Disrupting LLM Beliefs on Foundational Principles
%A Kim, Soo Kyung
%A Cho, Hyunsoo
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F kim-cho-2025-goodliar
%X Large Language Models (LLMs) often succumb to adversarial prompts, a phenomenon popularly known as “jailbreaking.” While jailbreaking primarily targets short-term noncompliance with predefined policies, we argue that a deeper vulnerability lies in altering an LLM’s fundamental axiomatic beliefs, such as mathematical or philosophical truths. In this work, we introduce GoodLiar, a reinforcement learning (RL)-based framework that generates deceptive contexts to systematically rewrite an LLM’s core logical or philosophical understandings. By incentivizing an RL agent to produce persuasive and coherent arguments, GoodLiar aims to induce persistent belief shifts, rather than merely influencing immediate judgments of factual truthfulness. Our approach introduces DA-ILQL, a novel offline RL method that extends ILQL by integrating on-policy data and language exploration to enhance language discovery and optimization. Through extensive evaluations on multiple LLMs, we show that deceptive contexts discovered by GoodLiar consistently outperform simple multi-turn prompting methods.
%R 10.18653/v1/2025.findings-acl.160
%U https://aclanthology.org/2025.findings-acl.160/
%U https://doi.org/10.18653/v1/2025.findings-acl.160
%P 3076-3101
Markdown (Informal)
[GOODLIAR: A Reinforcement Learning-Based Deceptive Agent for Disrupting LLM Beliefs on Foundational Principles](https://aclanthology.org/2025.findings-acl.160/) (Kim & Cho, Findings 2025)
ACL
Soo Kyung Kim and Hyunsoo Cho. 2025. GOODLIAR: A Reinforcement Learning-Based Deceptive Agent for Disrupting LLM Beliefs on Foundational Principles. In Findings of the Association for Computational Linguistics: ACL 2025, pages 3076–3101, Vienna, Austria. Association for Computational Linguistics.