@inproceedings{ammanabrolu-etal-2022-aligning,
title = "Aligning to Social Norms and Values in Interactive Narratives",
author = "Ammanabrolu, Prithviraj and
Jiang, Liwei and
Sap, Maarten and
Hajishirzi, Hannaneh and
Choi, Yejin",
editor = "Carpuat, Marine and
de Marneffe, Marie-Catherine and
Meza Ruiz, Ivan Vladimir",
booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
month = jul,
year = "2022",
address = "Seattle, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.naacl-main.439/",
doi = "10.18653/v1/2022.naacl-main.439",
pages = "5994--6017",
abstract = "We focus on creating agents that act in alignment with socially beneficial norms and values in interactive narratives or text-based games{---}environments wherein an agent perceives and interacts with a world through natural language. Such interactive agents are often trained via reinforcement learning to optimize task performance, even when such rewards may lead to agent behaviors that violate societal norms{---}causing harm either to the agent itself or other entities in the environment. Social value alignment refers to creating agents whose behaviors conform to expected moral and social norms for a given context and group of people{---}in our case, it means agents that behave in a manner that is less harmful and more beneficial for themselves and others. We build on the Jiminy Cricket benchmark (Hendrycks et al. 2021), a set of 25 annotated interactive narratives containing thousands of morally salient scenarios covering everything from theft and bodily harm to altruism. We introduce the GALAD (Game-value ALignment through Action Distillation) agent that uses the social commonsense knowledge present in specially trained language models to contextually restrict its action space to only those actions that are aligned with socially beneficial values. An experimental study shows that the GALAD agent makes decisions efficiently enough to improve state-of-the-art task performance by 4{\%} while reducing the frequency of socially harmful behaviors by 25{\%} compared to strong contemporary value alignment approaches."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ammanabrolu-etal-2022-aligning">
<titleInfo>
<title>Aligning to Social Norms and Values in Interactive Narratives</title>
</titleInfo>
<name type="personal">
<namePart type="given">Prithviraj</namePart>
<namePart type="family">Ammanabrolu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liwei</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maarten</namePart>
<namePart type="family">Sap</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannaneh</namePart>
<namePart type="family">Hajishirzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yejin</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marine</namePart>
<namePart type="family">Carpuat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Catherine</namePart>
<namePart type="family">de Marneffe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="given">Vladimir</namePart>
<namePart type="family">Meza Ruiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We focus on creating agents that act in alignment with socially beneficial norms and values in interactive narratives or text-based games—environments wherein an agent perceives and interacts with a world through natural language. Such interactive agents are often trained via reinforcement learning to optimize task performance, even when such rewards may lead to agent behaviors that violate societal norms—causing harm either to the agent itself or other entities in the environment. Social value alignment refers to creating agents whose behaviors conform to expected moral and social norms for a given context and group of people—in our case, it means agents that behave in a manner that is less harmful and more beneficial for themselves and others. We build on the Jiminy Cricket benchmark (Hendrycks et al. 2021), a set of 25 annotated interactive narratives containing thousands of morally salient scenarios covering everything from theft and bodily harm to altruism. We introduce the GALAD (Game-value ALignment through Action Distillation) agent that uses the social commonsense knowledge present in specially trained language models to contextually restrict its action space to only those actions that are aligned with socially beneficial values. An experimental study shows that the GALAD agent makes decisions efficiently enough to improve state-of-the-art task performance by 4% while reducing the frequency of socially harmful behaviors by 25% compared to strong contemporary value alignment approaches.</abstract>
<identifier type="citekey">ammanabrolu-etal-2022-aligning</identifier>
<identifier type="doi">10.18653/v1/2022.naacl-main.439</identifier>
<location>
<url>https://aclanthology.org/2022.naacl-main.439/</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>5994</start>
<end>6017</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Aligning to Social Norms and Values in Interactive Narratives
%A Ammanabrolu, Prithviraj
%A Jiang, Liwei
%A Sap, Maarten
%A Hajishirzi, Hannaneh
%A Choi, Yejin
%Y Carpuat, Marine
%Y de Marneffe, Marie-Catherine
%Y Meza Ruiz, Ivan Vladimir
%S Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F ammanabrolu-etal-2022-aligning
%X We focus on creating agents that act in alignment with socially beneficial norms and values in interactive narratives or text-based games—environments wherein an agent perceives and interacts with a world through natural language. Such interactive agents are often trained via reinforcement learning to optimize task performance, even when such rewards may lead to agent behaviors that violate societal norms—causing harm either to the agent itself or other entities in the environment. Social value alignment refers to creating agents whose behaviors conform to expected moral and social norms for a given context and group of people—in our case, it means agents that behave in a manner that is less harmful and more beneficial for themselves and others. We build on the Jiminy Cricket benchmark (Hendrycks et al. 2021), a set of 25 annotated interactive narratives containing thousands of morally salient scenarios covering everything from theft and bodily harm to altruism. We introduce the GALAD (Game-value ALignment through Action Distillation) agent that uses the social commonsense knowledge present in specially trained language models to contextually restrict its action space to only those actions that are aligned with socially beneficial values. An experimental study shows that the GALAD agent makes decisions efficiently enough to improve state-of-the-art task performance by 4% while reducing the frequency of socially harmful behaviors by 25% compared to strong contemporary value alignment approaches.
%R 10.18653/v1/2022.naacl-main.439
%U https://aclanthology.org/2022.naacl-main.439/
%U https://doi.org/10.18653/v1/2022.naacl-main.439
%P 5994-6017
Markdown (Informal):
[Aligning to Social Norms and Values in Interactive Narratives](https://aclanthology.org/2022.naacl-main.439/) (Ammanabrolu et al., NAACL 2022)

ACL:
Prithviraj Ammanabrolu, Liwei Jiang, Maarten Sap, Hannaneh Hajishirzi, and Yejin Choi. 2022. Aligning to Social Norms and Values in Interactive Narratives. In Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 5994–6017, Seattle, United States. Association for Computational Linguistics.
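As a reader's aid, the following is a minimal, hypothetical sketch of the action-space restriction idea described in the abstract; it is not the authors' GALAD implementation. A stand-in scorer plays the role of the specially trained language model with social-commonsense knowledge, and candidate actions scoring below an assumed acceptability threshold are dropped.

```python
from dataclasses import dataclass
from typing import Callable, List

# Hypothetical scorer type: maps (observation, action) to a probability that
# the action is socially acceptable. In the paper this role is played by a
# specially trained language model; here it is just a callable placeholder.
ValueScorer = Callable[[str, str], float]


@dataclass
class ActionFilter:
    scorer: ValueScorer
    threshold: float = 0.5  # assumed cutoff; the paper's actual criterion may differ

    def restrict(self, observation: str, candidate_actions: List[str]) -> List[str]:
        """Keep only the candidate actions the scorer judges socially acceptable."""
        allowed = [
            a for a in candidate_actions
            if self.scorer(observation, a) >= self.threshold
        ]
        # Fall back to the full candidate set if everything was filtered out,
        # so the agent always has at least one action available.
        return allowed or candidate_actions


# Toy usage with a keyword-based stand-in scorer (purely illustrative).
def toy_scorer(observation: str, action: str) -> float:
    harmful_words = {"steal", "attack", "hit"}
    return 0.0 if any(w in action.lower() for w in harmful_words) else 1.0


if __name__ == "__main__":
    flt = ActionFilter(scorer=toy_scorer)
    actions = ["open the door", "steal the lantern", "greet the shopkeeper"]
    print(flt.restrict("You are in a small shop.", actions))
    # -> ['open the door', 'greet the shopkeeper']
```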