@inproceedings{henkel-etal-2025-humans,
title = "When Humans Can{'}t Agree, Neither Can Machines: The Promise and Pitfalls of {LLM}s for Formative Literacy Assessment",
author = "Henkel, Owen and
Vanacore, Kirk and
Roberts, Bill",
editor = "Wilson, Joshua and
Ormerod, Christopher and
Beiting Parrish, Magdalen",
booktitle = "Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers",
month = oct,
year = "2025",
address = "Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States",
publisher = "National Council on Measurement in Education (NCME)",
url = "https://aclanthology.org/2025.aimecon-sessions.8/",
pages = "69--78",
ISBN = "979-8-218-84230-7",
abstract = "Story retell assessments provide valuable insights into reading comprehension but face implementation barriers due to time-intensive administration and scoring. This study examines whether Large Language Models (LLMs) can reliably replicate human judgment in grading story retells. Using a novel dataset, we conduct three complementary studies examining LLM performance across different rubric systems, agreement patterns, and reasoning alignment. We find that LLMs (a) achieve near-human reliability with appropriate rubric design, (b) perform well on easy-to-grade cases but poorly on ambiguous ones, (c) produce explanations for their grades that are plausible for straightforward cases but unreliable for complex ones, and (d) different LLMs display consistent ``grading personalities'' (systematically scoring harder or easier across all student responses). These findings support hybrid assessment architectures where AI handles routine scoring, enabling more frequent formative assessment while directing teacher expertise toward students requiring nuanced support."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="henkel-etal-2025-humans">
  <titleInfo>
    <title>When Humans Can’t Agree, Neither Can Machines: The Promise and Pitfalls of LLMs for Formative Literacy Assessment</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Owen</namePart>
    <namePart type="family">Henkel</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Kirk</namePart>
    <namePart type="family">Vanacore</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Bill</namePart>
    <namePart type="family">Roberts</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2025-10</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <relatedItem type="host">
    <titleInfo>
      <title>Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Joshua</namePart>
      <namePart type="family">Wilson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christopher</namePart>
      <namePart type="family">Ormerod</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Magdalen</namePart>
      <namePart type="family">Beiting Parrish</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <originInfo>
      <publisher>National Council on Measurement in Education (NCME)</publisher>
      <place>
        <placeTerm type="text">Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">conference publication</genre>
    <identifier type="isbn">979-8-218-84230-7</identifier>
  </relatedItem>
  <abstract>Story retell assessments provide valuable insights into reading comprehension but face implementation barriers due to time-intensive administration and scoring. This study examines whether Large Language Models (LLMs) can reliably replicate human judgment in grading story retells. Using a novel dataset, we conduct three complementary studies examining LLM performance across different rubric systems, agreement patterns, and reasoning alignment. We find that LLMs (a) achieve near-human reliability with appropriate rubric design, (b) perform well on easy-to-grade cases but poorly on ambiguous ones, (c) produce explanations for their grades that are plausible for straightforward cases but unreliable for complex ones, and (d) different LLMs display consistent “grading personalities” (systematically scoring harder or easier across all student responses). These findings support hybrid assessment architectures where AI handles routine scoring, enabling more frequent formative assessment while directing teacher expertise toward students requiring nuanced support.</abstract>
  <identifier type="citekey">henkel-etal-2025-humans</identifier>
  <location>
    <url>https://aclanthology.org/2025.aimecon-sessions.8/</url>
  </location>
  <part>
    <date>2025-10</date>
    <extent unit="page">
      <start>69</start>
      <end>78</end>
    </extent>
  </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Humans Can’t Agree, Neither Can Machines: The Promise and Pitfalls of LLMs for Formative Literacy Assessment
%A Henkel, Owen
%A Vanacore, Kirk
%A Roberts, Bill
%Y Wilson, Joshua
%Y Ormerod, Christopher
%Y Beiting Parrish, Magdalen
%S Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers
%D 2025
%8 October
%I National Council on Measurement in Education (NCME)
%C Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States
%@ 979-8-218-84230-7
%F henkel-etal-2025-humans
%X Story retell assessments provide valuable insights into reading comprehension but face implementation barriers due to time-intensive administration and scoring. This study examines whether Large Language Models (LLMs) can reliably replicate human judgment in grading story retells. Using a novel dataset, we conduct three complementary studies examining LLM performance across different rubric systems, agreement patterns, and reasoning alignment. We find that LLMs (a) achieve near-human reliability with appropriate rubric design, (b) perform well on easy-to-grade cases but poorly on ambiguous ones, (c) produce explanations for their grades that are plausible for straightforward cases but unreliable for complex ones, and (d) different LLMs display consistent “grading personalities” (systematically scoring harder or easier across all student responses). These findings support hybrid assessment architectures where AI handles routine scoring, enabling more frequent formative assessment while directing teacher expertise toward students requiring nuanced support.
%U https://aclanthology.org/2025.aimecon-sessions.8/
%P 69-78

Markdown (Informal)
[When Humans Can’t Agree, Neither Can Machines: The Promise and Pitfalls of LLMs for Formative Literacy Assessment](https://aclanthology.org/2025.aimecon-sessions.8/) (Henkel et al., AIME-Con 2025)
ACL
Owen Henkel, Kirk Vanacore, and Bill Roberts. 2025. When Humans Can’t Agree, Neither Can Machines: The Promise and Pitfalls of LLMs for Formative Literacy Assessment. In Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers, pages 69–78, Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States. National Council on Measurement in Education (NCME).