@inproceedings{marraffini-etal-2024-greatest,
    title = "The Greatest Good Benchmark: Measuring {LLM}s{'} Alignment with Utilitarian Moral Dilemmas",
    author = "Marraffini, Giovanni and
      Cotton, Andr{\'e}s and
      Hsueh, Noe and
      Fridman, Axel and
      Wisznia, Juan and
      Corro, Luciano",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-main.1224",
    pages = "21950--21959",
    abstract = "The question of how to make decisions that maximise the well-being of all persons is very relevant to design language models that are beneficial to humanity and free from harm. We introduce the Greatest Good Benchmark to evaluate the moral judgments of LLMs using utilitarian dilemmas. Our analysis across 15 diverse LLMs reveals consistently encoded moral preferences that diverge from established moral theories and lay population moral standards. Most LLMs have a marked preference for impartial beneficence and rejection of instrumental harm. These findings showcase the {`}artificial moral compass{'} of LLMs, offering insights into their moral alignment.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="marraffini-etal-2024-greatest">
    <titleInfo>
      <title>The Greatest Good Benchmark: Measuring LLMs’ Alignment with Utilitarian Moral Dilemmas</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Giovanni</namePart>
      <namePart type="family">Marraffini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Andrés</namePart>
      <namePart type="family">Cotton</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Noe</namePart>
      <namePart type="family">Hsueh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Axel</namePart>
      <namePart type="family">Fridman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Juan</namePart>
      <namePart type="family">Wisznia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Luciano</namePart>
      <namePart type="family">Corro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The question of how to make decisions that maximise the well-being of all persons is very relevant to design language models that are beneficial to humanity and free from harm. We introduce the Greatest Good Benchmark to evaluate the moral judgments of LLMs using utilitarian dilemmas. Our analysis across 15 diverse LLMs reveals consistently encoded moral preferences that diverge from established moral theories and lay population moral standards. Most LLMs have a marked preference for impartial beneficence and rejection of instrumental harm. These findings showcase the ‘artificial moral compass’ of LLMs, offering insights into their moral alignment.</abstract>
    <identifier type="citekey">marraffini-etal-2024-greatest</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-main.1224</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>21950</start>
        <end>21959</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Greatest Good Benchmark: Measuring LLMs’ Alignment with Utilitarian Moral Dilemmas
%A Marraffini, Giovanni
%A Cotton, Andrés
%A Hsueh, Noe
%A Fridman, Axel
%A Wisznia, Juan
%A Corro, Luciano
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F marraffini-etal-2024-greatest
%X The question of how to make decisions that maximise the well-being of all persons is very relevant to design language models that are beneficial to humanity and free from harm. We introduce the Greatest Good Benchmark to evaluate the moral judgments of LLMs using utilitarian dilemmas. Our analysis across 15 diverse LLMs reveals consistently encoded moral preferences that diverge from established moral theories and lay population moral standards. Most LLMs have a marked preference for impartial beneficence and rejection of instrumental harm. These findings showcase the ‘artificial moral compass’ of LLMs, offering insights into their moral alignment.
%U https://aclanthology.org/2024.emnlp-main.1224
%P 21950-21959
Markdown (Informal)
[The Greatest Good Benchmark: Measuring LLMs’ Alignment with Utilitarian Moral Dilemmas](https://aclanthology.org/2024.emnlp-main.1224) (Marraffini et al., EMNLP 2024)
ACL
Giovanni Marraffini, Andrés Cotton, Noe Hsueh, Axel Fridman, Juan Wisznia, and Luciano Corro. 2024. The Greatest Good Benchmark: Measuring LLMs’ Alignment with Utilitarian Moral Dilemmas. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 21950–21959, Miami, Florida, USA. Association for Computational Linguistics.