@inproceedings{shahriar-etal-2025-p6jiggasha,
title = "{P}6{J}iggasha: Benchmarking Large Language Models on {B}angla Physics Question Answering with Cross-lingual Evaluation",
author = "Shahriar, S.m. and
Fuad, Md Tahmid Hasan and
Fahim, Md and
Hossain, Md. Azad",
editor = "Alam, Firoj and
Kar, Sudipta and
Chowdhury, Shammur Absar and
Hassan, Naeemul and
Prince, Enamul Hoque and
Tasnim, Mohiuddin and
Rony, Md Rashad Al Hasan and
Rahman, Md Tahmid Rahman",
booktitle = "Proceedings of the Second Workshop on Bangla Language Processing (BLP-2025)",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.banglalp-1.16/",
pages = "196--211",
ISBN = "979-8-89176-314-2",
abstract = "Understanding scientific concepts in native languages is crucial for educational accessibility and knowledge transfer. In this work, we present a comprehensive evaluation of Large Language Models (LLMs) on Bangla physics questions, introducing P6Jiggasha, a novel dataset of 1,500 multiple-choice questions compiled from HSC physics textbooks, supplementary guides, admission preparation books, and past examination papers from various educational boards. We evaluate three state-of-the-art models{---}GPT-4.1, Gemini-2.5 Pro, and DeepSeek-R1-Distill-Llama-70B{---}on both native Bangla questions and their English translations. Our results reveal significant performance variations, with GPT-4.1 achieving 86.67{\%} accuracy on Bangla questions in a single inference, while other models show substantial improvement through multiple inference attempts, with Gemini-2.5 Pro reaching 89.52{\%} after four iterations. We introduce a \textit{Cumulative Accuracy@k} metric to evaluate iterative reasoning capabilities and provide comprehensive analysis across six physics topics and six question types. Our error analysis reveals systematic cross-lingual inconsistencies where models produce contradictory answers for identical questions across languages. This study provides valuable insights into the capabilities and limitations of current LLMs for low-resource scientific question answering and establishes benchmarks for future research in Bangla natural language processing."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shahriar-etal-2025-p6jiggasha">
<titleInfo>
<title>P6Jiggasha: Benchmarking Large Language Models on Bangla Physics Question Answering with Cross-lingual Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">S.m.</namePart>
<namePart type="family">Shahriar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Tahmid</namePart>
<namePart type="given">Hasan</namePart>
<namePart type="family">Fuad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="family">Fahim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Azad</namePart>
<namePart type="family">Hossain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Bangla Language Processing (BLP-2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudipta</namePart>
<namePart type="family">Kar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="given">Absar</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naeemul</namePart>
<namePart type="family">Hassan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enamul</namePart>
<namePart type="given">Hoque</namePart>
<namePart type="family">Prince</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohiuddin</namePart>
<namePart type="family">Tasnim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Rashad</namePart>
<namePart type="given">Al</namePart>
<namePart type="given">Hasan</namePart>
<namePart type="family">Rony</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Tahmid</namePart>
<namePart type="given">Rahman</namePart>
<namePart type="family">Rahman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-314-2</identifier>
</relatedItem>
<abstract>Understanding scientific concepts in native languages is crucial for educational accessibility and knowledge transfer. In this work, we present a comprehensive evaluation of Large Language Models (LLMs) on Bangla physics questions, introducing P6Jiggasha, a novel dataset of 1,500 multiple-choice questions compiled from HSC physics textbooks, supplementary guides, admission preparation books, and past examination papers from various educational boards. We evaluate three state-of-the-art models—GPT-4.1, Gemini-2.5 Pro, and DeepSeek-R1-Distill-Llama-70B—on both native Bangla questions and their English translations. Our results reveal significant performance variations, with GPT-4.1 achieving 86.67% accuracy on Bangla questions in a single inference, while other models show substantial improvement through multiple inference attempts, with Gemini-2.5 Pro reaching 89.52% after four iterations. We introduce a Cumulative Accuracy@k metric to evaluate iterative reasoning capabilities and provide comprehensive analysis across six physics topics and six question types. Our error analysis reveals systematic cross-lingual inconsistencies where models produce contradictory answers for identical questions across languages. This study provides valuable insights into the capabilities and limitations of current LLMs for low-resource scientific question answering and establishes benchmarks for future research in Bangla natural language processing.</abstract>
<identifier type="citekey">shahriar-etal-2025-p6jiggasha</identifier>
<location>
<url>https://aclanthology.org/2025.banglalp-1.16/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>196</start>
<end>211</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T P6Jiggasha: Benchmarking Large Language Models on Bangla Physics Question Answering with Cross-lingual Evaluation
%A Shahriar, S.m.
%A Fuad, Md Tahmid Hasan
%A Fahim, Md
%A Hossain, Md. Azad
%Y Alam, Firoj
%Y Kar, Sudipta
%Y Chowdhury, Shammur Absar
%Y Hassan, Naeemul
%Y Prince, Enamul Hoque
%Y Tasnim, Mohiuddin
%Y Rony, Md Rashad Al Hasan
%Y Rahman, Md Tahmid Rahman
%S Proceedings of the Second Workshop on Bangla Language Processing (BLP-2025)
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-314-2
%F shahriar-etal-2025-p6jiggasha
%X Understanding scientific concepts in native languages is crucial for educational accessibility and knowledge transfer. In this work, we present a comprehensive evaluation of Large Language Models (LLMs) on Bangla physics questions, introducing P6Jiggasha, a novel dataset of 1,500 multiple-choice questions compiled from HSC physics textbooks, supplementary guides, admission preparation books, and past examination papers from various educational boards. We evaluate three state-of-the-art models—GPT-4.1, Gemini-2.5 Pro, and DeepSeek-R1-Distill-Llama-70B—on both native Bangla questions and their English translations. Our results reveal significant performance variations, with GPT-4.1 achieving 86.67% accuracy on Bangla questions in a single inference, while other models show substantial improvement through multiple inference attempts, with Gemini-2.5 Pro reaching 89.52% after four iterations. We introduce a Cumulative Accuracy@k metric to evaluate iterative reasoning capabilities and provide comprehensive analysis across six physics topics and six question types. Our error analysis reveals systematic cross-lingual inconsistencies where models produce contradictory answers for identical questions across languages. This study provides valuable insights into the capabilities and limitations of current LLMs for low-resource scientific question answering and establishes benchmarks for future research in Bangla natural language processing.
%U https://aclanthology.org/2025.banglalp-1.16/
%P 196-211
Markdown (Informal)
[P6Jiggasha: Benchmarking Large Language Models on Bangla Physics Question Answering with Cross-lingual Evaluation](https://aclanthology.org/2025.banglalp-1.16/) (Shahriar et al., BanglaLP 2025)
ACL
S.m. Shahriar, Md Tahmid Hasan Fuad, Md Fahim, and Md. Azad Hossain. 2025. P6Jiggasha: Benchmarking Large Language Models on Bangla Physics Question Answering with Cross-lingual Evaluation. In Proceedings of the Second Workshop on Bangla Language Processing (BLP-2025), pages 196–211, Mumbai, India. Association for Computational Linguistics.
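
The abstract above introduces a Cumulative Accuracy@k metric for scoring repeated inference attempts. As a minimal, hedged sketch of one plausible reading (an assumption, not the paper's exact definition): a question counts as solved at k if any of its first k sampled answers matches the gold option.

```python
# Hedged sketch: one plausible reading of Cumulative Accuracy@k,
# assuming a question counts as correct if ANY of its first k
# independent inference attempts returns the gold option.
# Illustrative assumption only, not the paper's published definition.
from typing import List


def cumulative_accuracy_at_k(attempts: List[List[str]], gold: List[str], k: int) -> float:
    """attempts[i] holds the model's answers for question i across
    repeated inferences (e.g. ["B", "C", "B", "B"]); gold[i] is the
    correct option. Returns the fraction of questions answered
    correctly within the first k attempts."""
    solved = sum(1 for answers, g in zip(attempts, gold) if g in answers[:k])
    return solved / len(gold)


# Toy example with 3 questions and 4 inference attempts each.
attempts = [["A", "A", "A", "A"], ["C", "B", "B", "B"], ["D", "D", "C", "B"]]
gold = ["A", "B", "B"]
for k in range(1, 5):
    print(f"Cumulative Accuracy@{k}: {cumulative_accuracy_at_k(attempts, gold, k):.2%}")
```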