@inproceedings{palomino-paassen-2025-benchmarking,
title = "Benchmarking Item Difficulty Classification in {G}erman Vocational Education and Training",
author = "Palomino, Alonso and
Paassen, Benjamin",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.99/",
pages = "870--875",
abstract = "Predicting the difficulty of exam questions or items is essential to effectively assembling and calibrating exams. While item response theory (IRT) models can estimate item difficulty, they require student responses that are costly and rarely available at scale. Natural language processing methods offer a text-only alternative; however, due to the scarcity of real-world labeled data, prior work often relies on synthetic or domain-specific corpora, limiting generalizability and overlooking the nuanced challenges of real-world text-based item difficulty estimation. Addressing this gap, we benchmark 122 classifiers on 935 German Vocational Education and Training (VET) items labeled via previous IRT analysis to assess feasibility under real-world conditions. In our setup, a stacked ensemble that combines linguistic features, pre-trained embeddings, and external semantic resources outperforms both transformer-based models and few-shot large language models, achieving moderate performance. We report findings and discuss limitations in the context of German VET."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="palomino-paassen-2025-benchmarking">
<titleInfo>
<title>Benchmarking Item Difficulty Classification in German Vocational Education and Training</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alonso</namePart>
<namePart type="family">Palomino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Paassen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Predicting the difficulty of exam questions or items is essential to effectively assembling and calibrating exams. While item response theory (IRT) models can estimate item difficulty, they require student responses that are costly and rarely available at scale. Natural language processing methods offer a text-only alternative; however, due to the scarcity of real-world labeled data, prior work often relies on synthetic or domain-specific corpora, limiting generalizability and overlooking the nuanced challenges of real-world text-based item difficulty estimation. Addressing this gap, we benchmark 122 classifiers on 935 German Vocational Education and Training (VET) items labeled via previous IRT analysis to assess feasibility under real-world conditions. In our setup, a stacked ensemble that combines linguistic features, pre-trained embeddings, and external semantic resources outperforms both transformer-based models and few-shot large language models, achieving moderate performance. We report findings and discuss limitations in the context of German VET.</abstract>
<identifier type="citekey">palomino-paassen-2025-benchmarking</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.99/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>870</start>
<end>875</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking Item Difficulty Classification in German Vocational Education and Training
%A Palomino, Alonso
%A Paassen, Benjamin
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F palomino-paassen-2025-benchmarking
%X Predicting the difficulty of exam questions or items is essential to effectively assembling and calibrating exams. While item response theory (IRT) models can estimate item difficulty, they require student responses that are costly and rarely available at scale. Natural language processing methods offer a text-only alternative; however, due to the scarcity of real-world labeled data, prior work often relies on synthetic or domain-specific corpora, limiting generalizability and overlooking the nuanced challenges of real-world text-based item difficulty estimation. Addressing this gap, we benchmark 122 classifiers on 935 German Vocational Education and Training (VET) items labeled via previous IRT analysis to assess feasibility under real-world conditions. In our setup, a stacked ensemble that combines linguistic features, pre-trained embeddings, and external semantic resources outperforms both transformer-based models and few-shot large language models, achieving moderate performance. We report findings and discuss limitations in the context of German VET.
%U https://aclanthology.org/2025.ranlp-1.99/
%P 870-875
Markdown (Informal)
[Benchmarking Item Difficulty Classification in German Vocational Education and Training](https://aclanthology.org/2025.ranlp-1.99/) (Palomino & Paassen, RANLP 2025)
ACL
Alonso Palomino and Benjamin Paassen. 2025. Benchmarking Item Difficulty Classification in German Vocational Education and Training. In Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era, pages 870–875, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.