@inproceedings{kalbaliyev-sirts-2025-towards,
title = "Towards Evaluation of Language Models with Skill Dimensions: A Case Study on Narrative Question Answering",
author = "Kalbaliyev, Emil and
Sirts, Kairit",
editor = "Frermann, Lea and
Stevenson, Mark",
booktitle = "Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.starsem-1.34/",
pages = "430--440",
ISBN = "979-8-89176-340-1",
abstract = "Large language models have demonstrated varying levels of competence across a range of reasoning tasks, but coarse-grained evaluations often do not reflect their specific strengths and weaknesses, particularly in complex tasks such as Narrative Question Answering. In this paper, we advocate for a multi-dimensional skill-based evaluation that assesses models across distinct core skill dimensions. Our proposed skill-focused evaluation framework offers a granular and more realistic measure of model performance, revealing targeted areas for improvement and guiding future development. Experiments on Narrative Question Answering demonstrate that dimension-level analysis captures the multifaceted nature of the task and informs more effective model evaluation."
}