@inproceedings{chakravarty-etal-2025-enhancing,
title = "Enhancing Marker Scoring Accuracy through Ordinal Confidence Modelling in Educational Assessments",
author = "Chakravarty, Abhirup and
Brenchley, Mark and
Breakspear, Trevor and
Lewin, Ian and
Huang, Yan",
editor = "Rehm, Georg and
Li, Yunyao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-industry.106/",
doi = "10.18653/v1/2025.acl-industry.106",
pages = "1498--1507",
ISBN = "979-8-89176-288-6",
abstract = "A key ethical challenge in Automated Essay Scoring (AES) is ensuring that scores are only released when they meet high reliability standards. Confidence modelling addresses this by assigning a reliability estimate measure, in the form of a confidence score, to each automated score. In this study, we frame confidence estimation as a classification task: predicting whether an AES-generated score correctly places a candidate in the appropriate CEFR level. While this is a binary decision, we leverage the inherent granularity of the scoring domain in two ways. First, we reformulate the task as an $n$-ary classification problem using score binning. Second, we introduce a set of novel Kernel Weighted Ordinal Categorical Cross Entropy (KWOCCE) loss functions that incorporate the ordinal structure of CEFR labels. Our best-performing model achieves an F1 score of 0.97, and enables the system to release 47{\%} of scores with 100{\%} CEFR agreement and 99{\%} with at least 95{\%} CEFR agreement {---} compared to {\ensuremath{\approx}} 92 {\%} CEFR agreement from the standalone AES model where we release all AM predicted scores."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chakravarty-etal-2025-enhancing">
<titleInfo>
<title>Enhancing Marker Scoring Accuracy through Ordinal Confidence Modelling in Educational Assessments</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abhirup</namePart>
<namePart type="family">Chakravarty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Brenchley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Breakspear</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ian</namePart>
<namePart type="family">Lewin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Rehm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-288-6</identifier>
</relatedItem>
<abstract>A key ethical challenge in Automated Essay Scoring (AES) is ensuring that scores are only released when they meet high reliability standards. Confidence modelling addresses this by assigning a reliability estimate measure, in the form of a confidence score, to each automated score. In this study, we frame confidence estimation as a classification task: predicting whether an AES-generated score correctly places a candidate in the appropriate CEFR level. While this is a binary decision, we leverage the inherent granularity of the scoring domain in two ways. First, we reformulate the task as an n-ary classification problem using score binning. Second, we introduce a set of novel Kernel Weighted Ordinal Categorical Cross Entropy (KWOCCE) loss functions that incorporate the ordinal structure of CEFR labels. Our best-performing model achieves an F1 score of 0.97, and enables the system to release 47% of scores with 100% CEFR agreement and 99% with at least 95% CEFR agreement — compared to ≈ 92% CEFR agreement from the standalone AES model where we release all AM predicted scores.</abstract>
<identifier type="citekey">chakravarty-etal-2025-enhancing</identifier>
<identifier type="doi">10.18653/v1/2025.acl-industry.106</identifier>
<location>
<url>https://aclanthology.org/2025.acl-industry.106/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1498</start>
<end>1507</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Marker Scoring Accuracy through Ordinal Confidence Modelling in Educational Assessments
%A Chakravarty, Abhirup
%A Brenchley, Mark
%A Breakspear, Trevor
%A Lewin, Ian
%A Huang, Yan
%Y Rehm, Georg
%Y Li, Yunyao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-288-6
%F chakravarty-etal-2025-enhancing
%X A key ethical challenge in Automated Essay Scoring (AES) is ensuring that scores are only released when they meet high reliability standards. Confidence modelling addresses this by assigning a reliability estimate measure, in the form of a confidence score, to each automated score. In this study, we frame confidence estimation as a classification task: predicting whether an AES-generated score correctly places a candidate in the appropriate CEFR level. While this is a binary decision, we leverage the inherent granularity of the scoring domain in two ways. First, we reformulate the task as an n-ary classification problem using score binning. Second, we introduce a set of novel Kernel Weighted Ordinal Categorical Cross Entropy (KWOCCE) loss functions that incorporate the ordinal structure of CEFR labels. Our best-performing model achieves an F1 score of 0.97, and enables the system to release 47% of scores with 100% CEFR agreement and 99% with at least 95% CEFR agreement — compared to ≈ 92% CEFR agreement from the standalone AES model where we release all AM predicted scores.
%R 10.18653/v1/2025.acl-industry.106
%U https://aclanthology.org/2025.acl-industry.106/
%U https://doi.org/10.18653/v1/2025.acl-industry.106
%P 1498-1507
Markdown (Informal)
[Enhancing Marker Scoring Accuracy through Ordinal Confidence Modelling in Educational Assessments](https://aclanthology.org/2025.acl-industry.106/) (Chakravarty et al., ACL 2025)
ACL