@inproceedings{khademi-khaledi-faili-2025-iruex,
    title = "{IRUEX}: A Study on Large Language Models Problem-Solving Skills in {I}ran's University Entrance Exam",
    author = "Khademi Khaledi, Hamed and
      Faili, Heshaam",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.434/",
    pages = "6505--6519",
    abstract = "In this paper, we present the IRUEX dataset, a novel multiple-choice educational resource specifically designed to evaluate the performance of Large Language Models (LLMs) across seven distinct categories. The dataset contains 868 Iran university entrance exam questions (Konkour) and 36,485 additional questions. Each additional question is accompanied by detailed solutions, and the dataset also includes relevant high school textbooks, providing comprehensive study material. A key feature of IRUEX is its focus on underrepresented languages, particularly assessing problem-solving skills, language proficiency, and reasoning. Our evaluation shows that GPT-4o outperforms the other LLMs tested on the IRUEX dataset. Techniques such as few-shot learning and retrieval-augmented generation (RAG) display varied effects across different categories, highlighting their unique strengths in specific areas. Additionally, a comprehensive user study classifies the errors made by LLMs into ten problem-solving ability categories. The analysis highlights that calculations and linguistic knowledge, particularly in low-resource languages, remain significant weaknesses in current LLMs. IRUEX has the potential to serve as a benchmark for evaluating the reasoning capabilities of LLMs in non-English settings, providing a foundation for improving their performance in diverse languages and contexts."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khademi-khaledi-faili-2025-iruex">
<titleInfo>
<title>IRUEX: A Study on Large Language Models Problem-Solving Skills in Iran’s University Entrance Exam</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hamed</namePart>
<namePart type="family">Khademi Khaledi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heshaam</namePart>
<namePart type="family">Faili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we present the IRUEX dataset, a novel multiple-choice educational resource specifically designed to evaluate the performance of Large Language Models (LLMs) across seven distinct categories. The dataset contains 868 Iran university entrance exam questions (Konkour) and 36,485 additional questions. Each additional question is accompanied by detailed solutions, and the dataset also includes relevant high school textbooks, providing comprehensive study material. A key feature of IRUEX is its focus on underrepresented languages, particularly assessing problem-solving skills, language proficiency, and reasoning. Our evaluation shows that GPT-4o outperforms the other LLMs tested on the IRUEX dataset. Techniques such as few-shot learning and retrieval-augmented generation (RAG) display varied effects across different categories, highlighting their unique strengths in specific areas. Additionally, a comprehensive user study classifies the errors made by LLMs into ten problem-solving ability categories. The analysis highlights that calculations and linguistic knowledge, particularly in low-resource languages, remain significant weaknesses in current LLMs. IRUEX has the potential to serve as a benchmark for evaluating the reasoning capabilities of LLMs in non-English settings, providing a foundation for improving their performance in diverse languages and contexts</abstract>
<identifier type="citekey">khademi-khaledi-faili-2025-iruex</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.434/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>6505</start>
<end>6519</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T IRUEX: A Study on Large Language Models Problem-Solving Skills in Iran’s University Entrance Exam
%A Khademi Khaledi, Hamed
%A Faili, Heshaam
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F khademi-khaledi-faili-2025-iruex
%X In this paper, we present the IRUEX dataset, a novel multiple-choice educational resource specifically designed to evaluate the performance of Large Language Models (LLMs) across seven distinct categories. The dataset contains 868 Iran university entrance exam questions (Konkour) and 36,485 additional questions. Each additional question is accompanied by detailed solutions, and the dataset also includes relevant high school textbooks, providing comprehensive study material. A key feature of IRUEX is its focus on underrepresented languages, particularly assessing problem-solving skills, language proficiency, and reasoning. Our evaluation shows that GPT-4o outperforms the other LLMs tested on the IRUEX dataset. Techniques such as few-shot learning and retrieval-augmented generation (RAG) display varied effects across different categories, highlighting their unique strengths in specific areas. Additionally, a comprehensive user study classifies the errors made by LLMs into ten problem-solving ability categories. The analysis highlights that calculations and linguistic knowledge, particularly in low-resource languages, remain significant weaknesses in current LLMs. IRUEX has the potential to serve as a benchmark for evaluating the reasoning capabilities of LLMs in non-English settings, providing a foundation for improving their performance in diverse languages and contexts
%U https://aclanthology.org/2025.coling-main.434/
%P 6505-6519
Markdown (Informal)
[IRUEX: A Study on Large Language Models Problem-Solving Skills in Iran’s University Entrance Exam](https://aclanthology.org/2025.coling-main.434/) (Khademi Khaledi & Faili, COLING 2025)
ACL