@inproceedings{li-etal-2025-comparing,
    title = "Comparing {AI} tools and Human Raters in Predicting Reading Item Difficulty",
    author = "Li, Hongli and
      Aldib, Roula and
      Marchong, Chad and
      Fan, Kevin",
    editor = "Wilson, Joshua and
      Ormerod, Christopher and
      Beiting Parrish, Magdalen",
    booktitle = "Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Works in Progress",
    month = oct,
    year = "2025",
    address = "Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States",
    publisher = "National Council on Measurement in Education (NCME)",
    url = "https://aclanthology.org/2025.aimecon-wip.10/",
    pages = "84--89",
    ISBN = "979-8-218-84229-1",
    abstract = "This study compares AI tools and human raters in predicting the difficulty of reading comprehension items without response data. Predictions from AI models (ChatGPT, Gemini, Claude, and DeepSeek) and human raters are evaluated against empirical difficulty values derived from student responses. Findings will inform AI{'}s potential to support test development."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="li-etal-2025-comparing">
    <titleInfo>
      <title>Comparing AI tools and Human Raters in Predicting Reading Item Difficulty</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Hongli</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Roula</namePart>
      <namePart type="family">Aldib</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chad</namePart>
      <namePart type="family">Marchong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kevin</namePart>
      <namePart type="family">Fan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-10</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Works in Progress</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Joshua</namePart>
        <namePart type="family">Wilson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christopher</namePart>
        <namePart type="family">Ormerod</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Magdalen</namePart>
        <namePart type="family">Beiting Parrish</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>National Council on Measurement in Education (NCME)</publisher>
        <place>
          <placeTerm type="text">Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-218-84229-1</identifier>
    </relatedItem>
    <abstract>This study compares AI tools and human raters in predicting the difficulty of reading comprehension items without response data. Predictions from AI models (ChatGPT, Gemini, Claude, and DeepSeek) and human raters are evaluated against empirical difficulty values derived from student responses. Findings will inform AI’s potential to support test development.</abstract>
    <identifier type="citekey">li-etal-2025-comparing</identifier>
    <location>
      <url>https://aclanthology.org/2025.aimecon-wip.10/</url>
    </location>
    <part>
      <date>2025-10</date>
      <extent unit="page">
        <start>84</start>
        <end>89</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Comparing AI tools and Human Raters in Predicting Reading Item Difficulty
%A Li, Hongli
%A Aldib, Roula
%A Marchong, Chad
%A Fan, Kevin
%Y Wilson, Joshua
%Y Ormerod, Christopher
%Y Beiting Parrish, Magdalen
%S Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Works in Progress
%D 2025
%8 October
%I National Council on Measurement in Education (NCME)
%C Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States
%@ 979-8-218-84229-1
%F li-etal-2025-comparing
%X This study compares AI tools and human raters in predicting the difficulty of reading comprehension items without response data. Predictions from AI models (ChatGPT, Gemini, Claude, and DeepSeek) and human raters are evaluated against empirical difficulty values derived from student responses. Findings will inform AI’s potential to support test development.
%U https://aclanthology.org/2025.aimecon-wip.10/
%P 84-89

Markdown (Informal)

[Comparing AI tools and Human Raters in Predicting Reading Item Difficulty](https://aclanthology.org/2025.aimecon-wip.10/) (Li et al., AIME-Con 2025)

ACL

Hongli Li, Roula Aldib, Chad Marchong, and Kevin Fan. 2025. Comparing AI tools and Human Raters in Predicting Reading Item Difficulty. In Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Works in Progress, pages 84–89, Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States. National Council on Measurement in Education (NCME).