% Conference-paper record for the BanglaSTEM paper; cite as hasan-etal-2026-banglastem.
% Case-sensitive words are protected by bracing whole words so that styles which
% sentence-case titles keep them intact.
@inproceedings{hasan-etal-2026-banglastem,
  title     = {{BanglaSTEM}: A Parallel Corpus and Term-Weighted Evaluation for Technical {Bangla-English} Translation},
  author    = {Hasan, Kazi Reyazul and
               Islam, A. B. M. Alim Al and
               Adnan, Muhammad Abdullah},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-srw.34/},
  pages     = {403--412},
  isbn      = {979-8-89176-393-7},
  abstract  = {Large language models excel at technical problem solving in English but struggle when questions are posed in Bangla. While translation offers a practical solution, existing Bangla-English systems frequently mistranslate specialized terminology, altering problem semantics and degrading downstream performance. We present BanglaSTEM, a dataset of 5,000 Bangla-English sentence pairs covering computer science, mathematics, physics, chemistry, and biology. Our pipeline extracts matching passages from official bilingual curriculum textbooks using OCR, then uses LLMs to align sentences and mark technical terms. These aligned examples serve as few-shot prompts for generating over 12,000 new translation pairs from LLMs, avoiding copyright issues. Human evaluators then select the best 5,000 pairs that correctly preserve technical terminology. We also test a term-weighted BLEU metric that gives higher weight to technical words, since standard metrics treat terminology errors and common word errors equally. We show that our weighted metric correlates better with downstream accuracy in code generation and math solving, while standard BLEU gives high scores even for wrong translations. The full implementation, dataset, and model will be made publicly available.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for citekey hasan-etal-2026-banglastem: the paper's own metadata, with the host proceedings nested under relatedItem type="host". -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="hasan-etal-2026-banglastem">
    <titleInfo>
      <title>BanglaSTEM: A Parallel Corpus and Term-Weighted Evaluation for Technical Bangla-English Translation</title>
    </titleInfo>

    <!-- Authors, in byline order; each given-name token is its own namePart. -->
    <name type="personal">
      <namePart type="given">Kazi</namePart>
      <namePart type="given">Reyazul</namePart>
      <namePart type="family">Hasan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">A</namePart>
      <namePart type="given">B</namePart>
      <namePart type="given">M</namePart>
      <namePart type="given">Alim</namePart>
      <namePart type="given">Al</namePart>
      <namePart type="family">Islam</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Muhammad</namePart>
      <namePart type="given">Abdullah</namePart>
      <namePart type="family">Adnan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host volume: proceedings title, editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Santosh</namePart>
        <namePart type="family">T.Y.S.S.</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="given">Diego</namePart>
        <namePart type="family">Rodriguez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ona</namePart>
        <namePart type="family">de Gibert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-393-7</identifier>
    </relatedItem>

    <abstract>Large language models excel at technical problem solving in English but struggle when questions are posed in Bangla. While translation offers a practical solution, existing Bangla-English systems frequently mistranslate specialized terminology, altering problem semantics and degrading downstream performance. We present BanglaSTEM, a dataset of 5,000 Bangla-English sentence pairs covering computer science, mathematics, physics, chemistry, and biology. Our pipeline extracts matching passages from official bilingual curriculum textbooks using OCR, then uses LLMs to align sentences and mark technical terms. These aligned examples serve as few-shot prompts for generating over 12,000 new translation pairs from LLMs, avoiding copyright issues. Human evaluators then select the best 5,000 pairs that correctly preserve technical terminology. We also test a term-weighted BLEU metric that gives higher weight to technical words, since standard metrics treat terminology errors and common word errors equally. We show that our weighted metric correlates better with downstream accuracy in code generation and math solving, while standard BLEU gives high scores even for wrong translations. The full implementation, dataset, and model will be made publicly available.</abstract>
    <identifier type="citekey">hasan-etal-2026-banglastem</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-srw.34/</url>
    </location>

    <!-- Issue date and page range of the paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>403</start>
        <end>412</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T BanglaSTEM: A Parallel Corpus and Term-Weighted Evaluation for Technical Bangla-English Translation
%A Hasan, Kazi Reyazul
%A Islam, A. B. M. Alim Al
%A Adnan, Muhammad Abdullah
%Y T.Y.S.S., Santosh
%Y Rodriguez, Juan Diego
%Y de Gibert, Ona
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-393-7
%F hasan-etal-2026-banglastem
%X Large language models excel at technical problem solving in English but struggle when questions are posed in Bangla. While translation offers a practical solution, existing Bangla-English systems frequently mistranslate specialized terminology, altering problem semantics and degrading downstream performance. We present BanglaSTEM, a dataset of 5,000 Bangla-English sentence pairs covering computer science, mathematics, physics, chemistry, and biology. Our pipeline extracts matching passages from official bilingual curriculum textbooks using OCR, then uses LLMs to align sentences and mark technical terms. These aligned examples serve as few-shot prompts for generating over 12,000 new translation pairs from LLMs, avoiding copyright issues. Human evaluators then select the best 5,000 pairs that correctly preserve technical terminology. We also test a term-weighted BLEU metric that gives higher weight to technical words, since standard metrics treat terminology errors and common word errors equally. We show that our weighted metric correlates better with downstream accuracy in code generation and math solving, while standard BLEU gives high scores even for wrong translations. The full implementation, dataset, and model will be made publicly available.
%U https://aclanthology.org/2026.acl-srw.34/
%P 403-412
Markdown (Informal)
[BanglaSTEM: A Parallel Corpus and Term-Weighted Evaluation for Technical Bangla-English Translation](https://aclanthology.org/2026.acl-srw.34/) (Hasan et al., ACL 2026)
ACL