@inproceedings{skorski-landowska-2025-beyond,
title = "Beyond Human Judgment: A {B}ayesian Evaluation of {LLM}s' Moral Values Understanding",
author = "Skorski, Maciej and
Landowska, Alina",
editor = "Noidea, Noidea",
booktitle = "Proceedings of the 2nd Workshop on Uncertainty-Aware NLP (UncertaiNLP 2025)",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.uncertainlp-main.3/",
pages = "17--26",
ISBN = "979-8-89176-349-4",
abstract = "How do Large Language Models understand moral dimensions compared to humans?This first comprehensive large-scale Bayesian evaluation of leading language models provides the answer. In contrast to prior approaches based on deterministic ground truth (obtained via majority or inclusion consensus), we obtain the labels by modelling annotators' disagreement to capture both aleatoric uncertainty (inherent human disagreement) and epistemic uncertainty (model domain sensitivity).We evaluated Claude Sonnet 4, DeepSeek-V3, and Llama 4 Maverick across 250K+ annotations from nearly 700 annotators in 100K+ texts spanning social networks, news, and discussion forums. Our GPU-optimized Bayesian framework processed 1M+ model queries, revealing that AI models generally rank among the top 25{\%} of annotators in terms of balanced accuracy, substantially better than average humans.Importantly, we find that AI produces far fewer false negatives than humans, highlighting their more sensitive moral detection capabilities."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="skorski-landowska-2025-beyond">
<titleInfo>
<title>Beyond Human Judgment: A Bayesian Evaluation of LLMs’ Moral Values Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maciej</namePart>
<namePart type="family">Skorski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alina</namePart>
<namePart type="family">Landowska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Uncertainty-Aware NLP (UncertaiNLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Noidea</namePart>
<namePart type="family">Noidea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-349-4</identifier>
</relatedItem>
<abstract>How do Large Language Models understand moral dimensions compared to humans?This first comprehensive large-scale Bayesian evaluation of leading language models provides the answer. In contrast to prior approaches based on deterministic ground truth (obtained via majority or inclusion consensus), we obtain the labels by modelling annotators’ disagreement to capture both aleatoric uncertainty (inherent human disagreement) and epistemic uncertainty (model domain sensitivity).We evaluated Claude Sonnet 4, DeepSeek-V3, and Llama 4 Maverick across 250K+ annotations from nearly 700 annotators in 100K+ texts spanning social networks, news, and discussion forums. Our GPU-optimized Bayesian framework processed 1M+ model queries, revealing that AI models generally rank among the top 25% of annotators in terms of balanced accuracy, substantially better than average humans.Importantly, we find that AI produces far fewer false negatives than humans, highlighting their more sensitive moral detection capabilities.</abstract>
<identifier type="citekey">skorski-landowska-2025-beyond</identifier>
<location>
<url>https://aclanthology.org/2025.uncertainlp-main.3/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>17</start>
<end>26</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Beyond Human Judgment: A Bayesian Evaluation of LLMs’ Moral Values Understanding
%A Skorski, Maciej
%A Landowska, Alina
%Y Noidea, Noidea
%S Proceedings of the 2nd Workshop on Uncertainty-Aware NLP (UncertaiNLP 2025)
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-349-4
%F skorski-landowska-2025-beyond
%X How do Large Language Models understand moral dimensions compared to humans? This first comprehensive large-scale Bayesian evaluation of leading language models provides the answer. In contrast to prior approaches based on deterministic ground truth (obtained via majority or inclusion consensus), we obtain the labels by modelling annotators’ disagreement to capture both aleatoric uncertainty (inherent human disagreement) and epistemic uncertainty (model domain sensitivity). We evaluated Claude Sonnet 4, DeepSeek-V3, and Llama 4 Maverick across 250K+ annotations from nearly 700 annotators in 100K+ texts spanning social networks, news, and discussion forums. Our GPU-optimized Bayesian framework processed 1M+ model queries, revealing that AI models generally rank among the top 25% of annotators in terms of balanced accuracy, substantially better than average humans. Importantly, we find that AI produces far fewer false negatives than humans, highlighting their more sensitive moral detection capabilities.
%U https://aclanthology.org/2025.uncertainlp-main.3/
%P 17-26

Markdown (Informal)

[Beyond Human Judgment: A Bayesian Evaluation of LLMs’ Moral Values Understanding](https://aclanthology.org/2025.uncertainlp-main.3/) (Skorski & Landowska, UncertaiNLP 2025)

ACL

Maciej Skorski and Alina Landowska. 2025. Beyond Human Judgment: A Bayesian Evaluation of LLMs’ Moral Values Understanding. In Proceedings of the 2nd Workshop on Uncertainty-Aware NLP (UncertaiNLP 2025), pages 17–26, Suzhou, China. Association for Computational Linguistics.