@inproceedings{liu-etal-2026-domain,
title = "Domain Generalizable {AI} Guardrails with Augmented Policy Training",
author = "Liu, Minqian and
Baldini, Ioana and
Rabinowitz, David and
Rosenberg, David S. and
Gehrmann, Sebastian and
Dredze, Mark",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.748/",
pages = "16452--16469",
ISBN = "979-8-89176-390-6",
abstract = "AI guardrail systems support usage policies by determining whether a user query or a generated response is allowed or forbidden under the policy. Fine-tuned guardrails {--} such as LlamaGuard and ShieldGemma {--} include policy definitions in prompts during training that can be updated during inference to aid generalization. However, our analysis reveals that these models still overfit the training policies, which prevents adaptation to new domains. We propose Augmented Policy Training (APT), a training recipe that enhances guardrail adaptability to unseen policies by using a suite of policy perturbation strategies during training to reduce overfitting and increase generalization. Notably, a small 1B model trained in this manner achieves comparable or better performance than existing 8B guardrails on unseen policies. Our work reveals critical limitations of existing AI guardrails, offers a promising solution, and provides actionable insights for adapting systems to new domains and policies."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2026-domain">
<titleInfo>
<title>Domain Generalizable AI Guardrails with Augmented Policy Training</title>
</titleInfo>
<name type="personal">
<namePart type="given">Minqian</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ioana</namePart>
<namePart type="family">Baldini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Rabinowitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Rosenberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Dredze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>AI guardrail systems support usage policies by determining whether a user query or a generated response is allowed or forbidden under the policy. Fine-tuned guardrails – such as LlamaGuard and ShieldGemma – include policy definitions in prompts during training that can be updated during inference to aid generalization. However, our analysis reveals that these models still overfit the training policies, which prevents adaptation to new domains. We propose Augmented Policy Training (APT), a training recipe that enhances guardrail adaptability to unseen policies by using a suite of policy perturbation strategies during training to reduce overfitting and increase generalization. Notably, a small 1B model trained in this manner achieves comparable or better performance than existing 8B guardrails on unseen policies. Our work reveals critical limitations of existing AI guardrails, offers a promising solution, and provides actionable insights for adapting systems to new domains and policies.</abstract>
<identifier type="citekey">liu-etal-2026-domain</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.748/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>16452</start>
<end>16469</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Domain Generalizable AI Guardrails with Augmented Policy Training
%A Liu, Minqian
%A Baldini, Ioana
%A Rabinowitz, David
%A Rosenberg, David S.
%A Gehrmann, Sebastian
%A Dredze, Mark
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F liu-etal-2026-domain
%X AI guardrail systems support usage policies by determining whether a user query or a generated response is allowed or forbidden under the policy. Fine-tuned guardrails – such as LlamaGuard and ShieldGemma – include policy definitions in prompts during training that can be updated during inference to aid generalization. However, our analysis reveals that these models still overfit the training policies, which prevents adaptation to new domains. We propose Augmented Policy Training (APT), a training recipe that enhances guardrail adaptability to unseen policies by using a suite of policy perturbation strategies during training to reduce overfitting and increase generalization. Notably, a small 1B model trained in this manner achieves comparable or better performance than existing 8B guardrails on unseen policies. Our work reveals critical limitations of existing AI guardrails, offers a promising solution, and provides actionable insights for adapting systems to new domains and policies.
%U https://aclanthology.org/2026.acl-long.748/
%P 16452-16469
Markdown (Informal)
[Domain Generalizable AI Guardrails with Augmented Policy Training](https://aclanthology.org/2026.acl-long.748/) (Liu et al., ACL 2026)
ACL
- Minqian Liu, Ioana Baldini, David Rabinowitz, David S Rosenberg, Sebastian Gehrmann, and Mark Dredze. 2026. Domain Generalizable AI Guardrails with Augmented Policy Training. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 16452–16469, San Diego, California, United States. Association for Computational Linguistics.