@inproceedings{huber-niklaus-2025-llms,
title = "{LLM}s meet Bloom{'}s Taxonomy: A Cognitive View on Large Language Model Evaluations",
author = "Huber, Thomas and
Niklaus, Christina",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.350/",
pages = "5211--5246",
abstract = "Current evaluation approaches for Large Language Models (LLMs) lack a structured framework that reflects the underlying cognitive abilities required for solving the tasks. This hinders a thorough understanding of the current level of LLM capabilities. For instance, it is widely accepted that LLMs perform well in terms of grammar, but it is unclear in which specific cognitive areas they excel or struggle. This paper introduces a novel perspective on the evaluation of LLMs that leverages a hierarchical classification of tasks. Specifically, we explore the most widely used benchmarks for LLMs to systematically identify how well these existing evaluation methods cover the levels of Bloom{'}s Taxonomy, a hierarchical framework for categorizing cognitive skills. This comprehensive analysis allows us to identify strengths and weaknesses in current LLM assessment strategies in terms of cognitive abilities, suggest directions for future benchmark development, and highlight potential avenues for LLM research. Our findings reveal that LLMs generally perform better on the lower end of Bloom{'}s Taxonomy. Additionally, we find that there are significant gaps in the coverage of cognitive skills in the most commonly used benchmarks."
}