@inproceedings{ahmed-etal-2025-bennumeval,
title = "{B}en{N}um{E}val: A Benchmark to Assess {LLM}s' Numerical Reasoning Capabilities in {B}engali",
author = "Ahmed, Kawsar and
Osama, Md and
Sharif, Omar and
Hossain, Eftekhar and
Hoque, Mohammed Moshiul",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.915/",
doi = "10.18653/v1/2025.findings-acl.915",
pages = "17782--17799",
ISBN = "979-8-89176-256-5",
abstract = "Large Language Models (LLMs) demonstrate exceptional proficiency in general-purpose tasks but struggle with numerical reasoning, particularly in low-resource languages like Bengali. Despite advancements, limited research has explored their numerical reasoning capabilities in these languages. To address this gap, we present BenNumEval (Bengali Numerical Evaluation), a benchmark designed to assess LLMs on numerical reasoning tasks in Bengali. It comprises six diverse tasks and a total of 3.2k samples curated from real-world problem-solving scenarios. Our extensive evaluations reveal that even with advanced prompting techniques such as Cross-Lingual Prompting (XLP) and Cross-Lingual Chain-of-Thought Prompting (XCoT), LLMs fall notably short of human-level performance, particularly when using Bengali Native Prompting (BNaP). These findings underscore the substantial gap between current LLM capabilities and human expertise in numerical reasoning, highlighting the need for more robust and linguistically inclusive AI models to advance Bengali Language Processing and equitable AI development. The source code for the system and evaluation pipeline is publicly available on GitHub."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahmed-etal-2025-bennumeval">
<titleInfo>
<title>BenNumEval: A Benchmark to Assess LLMs’ Numerical Reasoning Capabilities in Bengali</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kawsar</namePart>
<namePart type="family">Ahmed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="family">Osama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omar</namePart>
<namePart type="family">Sharif</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eftekhar</namePart>
<namePart type="family">Hossain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammed</namePart>
<namePart type="given">Moshiul</namePart>
<namePart type="family">Hoque</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) demonstrate exceptional proficiency in general-purpose tasks but struggle with numerical reasoning, particularly in low-resource languages like Bengali. Despite advancements, limited research has explored their numerical reasoning capabilities in these languages. To address this gap, we present BenNumEval (Bengali Numerical Evaluation), a benchmark designed to assess LLMs on numerical reasoning tasks in Bengali. It comprises six diverse tasks and a total of 3.2k samples curated from real-world problem-solving scenarios. Our extensive evaluations reveal that even with advanced prompting techniques such as Cross-Lingual Prompting (XLP) and Cross-Lingual Chain-of-Thought Prompting (XCoT), LLMs fall notably short of human-level performance, particularly when using Bengali Native Prompting (BNaP). These findings underscore the substantial gap between current LLM capabilities and human expertise in numerical reasoning, highlighting the need for more robust and linguistically inclusive AI models to advance Bengali Language Processing and equitable AI development. The source code for the system and evaluation pipeline is publicly available on GitHub.</abstract>
<identifier type="citekey">ahmed-etal-2025-bennumeval</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.915</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.915/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>17782</start>
<end>17799</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BenNumEval: A Benchmark to Assess LLMs’ Numerical Reasoning Capabilities in Bengali
%A Ahmed, Kawsar
%A Osama, Md
%A Sharif, Omar
%A Hossain, Eftekhar
%A Hoque, Mohammed Moshiul
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F ahmed-etal-2025-bennumeval
%X Large Language Models (LLMs) demonstrate exceptional proficiency in general-purpose tasks but struggle with numerical reasoning, particularly in low-resource languages like Bengali. Despite advancements, limited research has explored their numerical reasoning capabilities in these languages. To address this gap, we present BenNumEval (Bengali Numerical Evaluation), a benchmark designed to assess LLMs on numerical reasoning tasks in Bengali. It comprises six diverse tasks and a total of 3.2k samples curated from real-world problem-solving scenarios. Our extensive evaluations reveal that even with advanced prompting techniques such as Cross-Lingual Prompting (XLP) and Cross-Lingual Chain-of-Thought Prompting (XCoT), LLMs fall notably short of human-level performance, particularly when using Bengali Native Prompting (BNaP). These findings underscore the substantial gap between current LLM capabilities and human expertise in numerical reasoning, highlighting the need for more robust and linguistically inclusive AI models to advance Bengali Language Processing and equitable AI development. The source code for the system and evaluation pipeline is publicly available on GitHub.
%R 10.18653/v1/2025.findings-acl.915
%U https://aclanthology.org/2025.findings-acl.915/
%U https://doi.org/10.18653/v1/2025.findings-acl.915
%P 17782-17799
Markdown (Informal)
[BenNumEval: A Benchmark to Assess LLMs’ Numerical Reasoning Capabilities in Bengali](https://aclanthology.org/2025.findings-acl.915/) (Ahmed et al., Findings 2025)
ACL