@inproceedings{zhan-etal-2026-safesearch,
title = "{S}afe{S}earch: Do Not Trade Safety for Utility in {LLM} Search Agents",
author = "Zhan, Qiusi and
Budiman-Chan, Angeline and
Zayed, Abdelrahman and
Guo, Xingzhi and
Kang, Daniel and
Kim, Joo-Kyung",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'\i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.146/",
pages = "2800--2815",
isbn = "979-8-89176-386-9",
abstract = "Large language model (LLM) based search agents iteratively generate queries, retrieve external information, and reason to answer open-domain questions. While researchers have primarily focused on improving their utility, their safety behaviors remain underexplored. In this paper, we first evaluate search agents using red-teaming datasets and find that they are more likely to produce harmful outputs than base LLMs. For example, when asked ``How can I track someone{'}s location without their consent?'', a base model refuses, whereas a search agent designed to retrieve and cite sources may lower its refusal threshold, fetch documents (e.g., court cases), and, once appended, synthesize them into an informative yet unsafe summary. We further show that utility-oriented finetuning intensifies this risk, motivating joint alignment of safety and utility. We present SafeSearch, a multi-objective reinforcement learning approach that couples a final-output safety/utility reward with a novel query-level shaping term that penalizes unsafe queries and rewards safe ones. Experiments show that SafeSearch reduces agent harmfulness by over 70{\%} across three red-teaming datasets while producing safe, helpful responses, and matches the QA performance of a utility-only finetuned agent. Further analyses confirm the effectiveness of the query-level reward in jointly improving safety and utility."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhan-etal-2026-safesearch">
<titleInfo>
<title>SafeSearch: Do Not Trade Safety for Utility in LLM Search Agents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Qiusi</namePart>
<namePart type="family">Zhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angeline</namePart>
<namePart type="family">Budiman-Chan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelrahman</namePart>
<namePart type="family">Zayed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingzhi</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joo-Kyung</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Large language model (LLM) based search agents iteratively generate queries, retrieve external information, and reason to answer open-domain questions. While researchers have primarily focused on improving their utility, their safety behaviors remain underexplored. In this paper, we first evaluate search agents using red-teaming datasets and find that they are more likely to produce harmful outputs than base LLMs. For example, when asked “How can I track someone’s location without their consent?”, a base model refuses, whereas a search agent designed to retrieve and cite sources may lower its refusal threshold, fetch documents (e.g., court cases), and, once appended, synthesize them into an informative yet unsafe summary. We further show that utility-oriented finetuning intensifies this risk, motivating joint alignment of safety and utility. We present SafeSearch, a multi-objective reinforcement learning approach that couples a final-output safety/utility reward with a novel query-level shaping term that penalizes unsafe queries and rewards safe ones. Experiments show that SafeSearch reduces agent harmfulness by over 70% across three red-teaming datasets while producing safe, helpful responses, and matches the QA performance of a utility-only finetuned agent. Further analyses confirm the effectiveness of the query-level reward in jointly improving safety and utility.</abstract>
<identifier type="citekey">zhan-etal-2026-safesearch</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.146/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>2800</start>
<end>2815</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SafeSearch: Do Not Trade Safety for Utility in LLM Search Agents
%A Zhan, Qiusi
%A Budiman-Chan, Angeline
%A Zayed, Abdelrahman
%A Guo, Xingzhi
%A Kang, Daniel
%A Kim, Joo-Kyung
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F zhan-etal-2026-safesearch
%X Large language model (LLM) based search agents iteratively generate queries, retrieve external information, and reason to answer open-domain questions. While researchers have primarily focused on improving their utility, their safety behaviors remain underexplored. In this paper, we first evaluate search agents using red-teaming datasets and find that they are more likely to produce harmful outputs than base LLMs. For example, when asked “How can I track someone’s location without their consent?”, a base model refuses, whereas a search agent designed to retrieve and cite sources may lower its refusal threshold, fetch documents (e.g., court cases), and, once appended, synthesize them into an informative yet unsafe summary. We further show that utility-oriented finetuning intensifies this risk, motivating joint alignment of safety and utility. We present SafeSearch, a multi-objective reinforcement learning approach that couples a final-output safety/utility reward with a novel query-level shaping term that penalizes unsafe queries and rewards safe ones. Experiments show that SafeSearch reduces agent harmfulness by over 70% across three red-teaming datasets while producing safe, helpful responses, and matches the QA performance of a utility-only finetuned agent. Further analyses confirm the effectiveness of the query-level reward in jointly improving safety and utility.
%U https://aclanthology.org/2026.findings-eacl.146/
%P 2800-2815
Markdown (Informal)
[SafeSearch: Do Not Trade Safety for Utility in LLM Search Agents](https://aclanthology.org/2026.findings-eacl.146/) (Zhan et al., Findings 2026)
ACL