@inproceedings{kierans-etal-2026-position,
title = "Position: Evaluations of {AI} Moral Reasoning Still Miss Half of the Picture",
author = "Kierans, Aidan and
Dutt, Ritam and
Rittichier, Kaley and
Dori-Hacohen, Shiri and
Ghosh, Avijit",
editor = "Akhtar, Mubashara and
Batzner, Jan and
Choshen, Leshem and
Ghosh, Avijit and
Gohar, Usman and
Mickel, Jennifer and
Pant, Ichhya and
Talat, Zeerak and
Lin, Michelle",
booktitle = "Proceedings of the Workshop on Evaluating Evaluations ({E}val{E}val)",
month = jul,
year = "2026",
address = "San Diego, CA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.evaleval-1.38/",
pages = "237--244",
ISBN = "979-8-89176-429-3",
abstract = "Recent work on evaluating the moral competence of large language models (LLMs) has focused primarily on what we call the moral value problem, i.e., whether model outputs align with human moral values. In contrast, the moral norm problem, i.e., whether models can identify and correctly apply context-sensitive moral norms, remains underexplored. We posit that this imbalance stems from the field{'}s reliance on descriptive ethics frameworks, such as Moral Foundations Theory and Kohlberg{'}s stages of moral development, which emphasize value representation over normative application. We review existing benchmarks and evaluation methods, and show that they cluster heavily around the value problem, while discussion regarding normative ethics remains underrepresented. We identify three crucial gaps: (i) the absence of high-quality ground-truth data for moral norms and their applications, (ii) insufficient evaluation of intermediate reasoning processes, and (iii) limited attention to the identification of morally relevant features in context. Subsequently, we propose a research agenda that includes the development of standardized formal representations for normative theories, the construction of expert-annotated datasets capturing norm application, and evaluation protocols that explicitly distinguish between values-level and norms-level competence. Our goal is to encourage a more systematic study of normative reasoning in LLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kierans-etal-2026-position">
<titleInfo>
<title>Position: Evaluations of AI Moral Reasoning Still Miss Half of the Picture</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aidan</namePart>
<namePart type="family">Kierans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ritam</namePart>
<namePart type="family">Dutt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaley</namePart>
<namePart type="family">Rittichier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shiri</namePart>
<namePart type="family">Dori-Hacohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avijit</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Evaluating Evaluations (EvalEval)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mubashara</namePart>
<namePart type="family">Akhtar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Batzner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leshem</namePart>
<namePart type="family">Choshen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avijit</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Usman</namePart>
<namePart type="family">Gohar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jennifer</namePart>
<namePart type="family">Mickel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ichhya</namePart>
<namePart type="family">Pant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeerak</namePart>
<namePart type="family">Talat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michelle</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, CA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-429-3</identifier>
</relatedItem>
<abstract>Recent work on evaluating the moral competence of large language models (LLMs) has focused primarily on what we call the moral value problem, i.e., whether model outputs align with human moral values. In contrast, the moral norm problem, i.e., whether models can identify and correctly apply context-sensitive moral norms, remains underexplored. We posit that this imbalance stems from the field’s reliance on descriptive ethics frameworks, such as Moral Foundations Theory and Kohlberg’s stages of moral development, which emphasize value representation over normative application. We review existing benchmarks and evaluation methods, and show that they cluster heavily around the value problem, while discussion regarding normative ethics remains underrepresented. We identify three crucial gaps: (i) the absence of high-quality ground-truth data for moral norms and their applications, (ii) insufficient evaluation of intermediate reasoning processes, and (iii) limited attention to the identification of morally relevant features in context. Subsequently, we propose a research agenda that includes the development of standardized formal representations for normative theories, the construction of expert-annotated datasets capturing norm application, and evaluation protocols that explicitly distinguish between values-level and norms-level competence. Our goal is to encourage a more systematic study of normative reasoning in LLMs.</abstract>
<identifier type="citekey">kierans-etal-2026-position</identifier>
<location>
<url>https://aclanthology.org/2026.evaleval-1.38/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>237</start>
<end>244</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Position: Evaluations of AI Moral Reasoning Still Miss Half of the Picture
%A Kierans, Aidan
%A Dutt, Ritam
%A Rittichier, Kaley
%A Dori-Hacohen, Shiri
%A Ghosh, Avijit
%Y Akhtar, Mubashara
%Y Batzner, Jan
%Y Choshen, Leshem
%Y Ghosh, Avijit
%Y Gohar, Usman
%Y Mickel, Jennifer
%Y Pant, Ichhya
%Y Talat, Zeerak
%Y Lin, Michelle
%S Proceedings of the Workshop on Evaluating Evaluations (EvalEval)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, CA
%@ 979-8-89176-429-3
%F kierans-etal-2026-position
%X Recent work on evaluating the moral competence of large language models (LLMs) has focused primarily on what we call the moral value problem, i.e., whether model outputs align with human moral values. In contrast, the moral norm problem, i.e., whether models can identify and correctly apply context-sensitive moral norms, remains underexplored. We posit that this imbalance stems from the field’s reliance on descriptive ethics frameworks, such as Moral Foundations Theory and Kohlberg’s stages of moral development, which emphasize value representation over normative application. We review existing benchmarks and evaluation methods, and show that they cluster heavily around the value problem, while discussion regarding normative ethics remains underrepresented. We identify three crucial gaps: (i) the absence of high-quality ground-truth data for moral norms and their applications, (ii) insufficient evaluation of intermediate reasoning processes, and (iii) limited attention to the identification of morally relevant features in context. Subsequently, we propose a research agenda that includes the development of standardized formal representations for normative theories, the construction of expert-annotated datasets capturing norm application, and evaluation protocols that explicitly distinguish between values-level and norms-level competence. Our goal is to encourage a more systematic study of normative reasoning in LLMs.
%U https://aclanthology.org/2026.evaleval-1.38/
%P 237-244
Markdown (Informal)
[Position: Evaluations of AI Moral Reasoning Still Miss Half of the Picture](https://aclanthology.org/2026.evaleval-1.38/) (Kierans et al., EvalEval 2026)
ACL