% Workshop paper from the BEA 2025 proceedings (ACL Anthology ID 2025.bea-1.39).
% Acronyms in booktitle are brace-protected so recasing styles keep them upper-case.
@inproceedings{de-vrindt-etal-2025-explaining,
  title     = {Explaining Holistic Essay Scores in Comparative Judgment Assessments by Predicting Scores on Rubrics},
  author    = {De Vrindt, Michiel and
               Bouwer, Renske and
               Van Den Noortgate, Wim and
               Lesterhuis, Marije and
               Tack, Ana{\"i}s},
  editor    = {Kochmar, Ekaterina and
               Alhafni, Bashar and
               Bexte, Marie and
               Burstein, Jill and
               Horbach, Andrea and
               Laarmann-Quante, Ronja and
               Tack, Ana{\"i}s and
               Yaneva, Victoria and
               Yuan, Zheng},
  booktitle = {Proceedings of the 20th Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.bea-1.39/},
  doi       = {10.18653/v1/2025.bea-1.39},
  pages     = {535--548},
  isbn      = {979-8-89176-270-1},
  abstract  = {Comparative judgment (CJ) is an assessment method in which multiple assessors determine the holistic quality of essays through pairwise comparisons. While CJ is recognized for generating reliable and valid scores, it falls short in providing transparency about the specific quality aspects these holistic scores represent. Our study addresses this limitation by predicting scores on a set of rubrics that measure text quality, thereby explaining the holistic scores derived from CJ. We developed feature-based machine learning models that leveraged complexity and genre features extracted from a collection of Dutch essays. We evaluated the predictability of rubric scores for text quality based on linguistic features. Subsequently, we evaluated the validity of the predicted rubric scores by examining their ability to explain the holistic scores derived from CJ. Our findings indicate that feature-based prediction models can predict relevant rubric scores moderately well. Furthermore, the predictions can be used to explain holistic scores from CJ, despite certain biases. This automated approach to explain holistic quality scores from CJ can enhance the transparency of CJ assessments and simplify the evaluation of their validity.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="de-vrindt-etal-2025-explaining">
<titleInfo>
<title>Explaining Holistic Essay Scores in Comparative Judgment Assessments by Predicting Scores on Rubrics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michiel</namePart>
<namePart type="family">De Vrindt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Renske</namePart>
<namePart type="family">Bouwer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wim</namePart>
<namePart type="family">Van Den Noortgate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marije</namePart>
<namePart type="family">Lesterhuis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anaïs</namePart>
<namePart type="family">Tack</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Alhafni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Bexte</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jill</namePart>
<namePart type="family">Burstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Horbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronja</namePart>
<namePart type="family">Laarmann-Quante</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anaïs</namePart>
<namePart type="family">Tack</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victoria</namePart>
<namePart type="family">Yaneva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-270-1</identifier>
</relatedItem>
<abstract>Comparative judgment (CJ) is an assessment method in which multiple assessors determine the holistic quality of essays through pairwise comparisons. While CJ is recognized for generating reliable and valid scores, it falls short in providing transparency about the specific quality aspects these holistic scores represent. Our study addresses this limitation by predicting scores on a set of rubrics that measure text quality, thereby explaining the holistic scores derived from CJ. We developed feature-based machine learning models that leveraged complexity and genre features extracted from a collection of Dutch essays. We evaluated the predictability of rubric scores for text quality based on linguistic features. Subsequently, we evaluated the validity of the predicted rubric scores by examining their ability to explain the holistic scores derived from CJ. Our findings indicate that feature-based prediction models can predict relevant rubric scores moderately well. Furthermore, the predictions can be used to explain holistic scores from CJ, despite certain biases. This automated approach to explain holistic quality scores from CJ can enhance the transparency of CJ assessments and simplify the evaluation of their validity.</abstract>
<identifier type="citekey">de-vrindt-etal-2025-explaining</identifier>
<identifier type="doi">10.18653/v1/2025.bea-1.39</identifier>
<location>
<url>https://aclanthology.org/2025.bea-1.39/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>535</start>
<end>548</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Explaining Holistic Essay Scores in Comparative Judgment Assessments by Predicting Scores on Rubrics
%A De Vrindt, Michiel
%A Bouwer, Renske
%A Van Den Noortgate, Wim
%A Lesterhuis, Marije
%A Tack, Anaïs
%Y Kochmar, Ekaterina
%Y Alhafni, Bashar
%Y Bexte, Marie
%Y Burstein, Jill
%Y Horbach, Andrea
%Y Laarmann-Quante, Ronja
%Y Tack, Anaïs
%Y Yaneva, Victoria
%Y Yuan, Zheng
%S Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-270-1
%F de-vrindt-etal-2025-explaining
%X Comparative judgment (CJ) is an assessment method in which multiple assessors determine the holistic quality of essays through pairwise comparisons. While CJ is recognized for generating reliable and valid scores, it falls short in providing transparency about the specific quality aspects these holistic scores represent. Our study addresses this limitation by predicting scores on a set of rubrics that measure text quality, thereby explaining the holistic scores derived from CJ. We developed feature-based machine learning models that leveraged complexity and genre features extracted from a collection of Dutch essays. We evaluated the predictability of rubric scores for text quality based on linguistic features. Subsequently, we evaluated the validity of the predicted rubric scores by examining their ability to explain the holistic scores derived from CJ. Our findings indicate that feature-based prediction models can predict relevant rubric scores moderately well. Furthermore, the predictions can be used to explain holistic scores from CJ, despite certain biases. This automated approach to explain holistic quality scores from CJ can enhance the transparency of CJ assessments and simplify the evaluation of their validity.
%R 10.18653/v1/2025.bea-1.39
%U https://aclanthology.org/2025.bea-1.39/
%U https://doi.org/10.18653/v1/2025.bea-1.39
%P 535-548
Markdown (Informal)
[Explaining Holistic Essay Scores in Comparative Judgment Assessments by Predicting Scores on Rubrics](https://aclanthology.org/2025.bea-1.39/) (De Vrindt et al., BEA 2025)
ACL