@inproceedings{mahtab-etal-2025-bannerd,
title = "{B}an{NERD}: A Benchmark Dataset and Context-Driven Approach for {B}angla Named Entity Recognition",
author = "Mahtab, Md. Motahar and
Khan, Faisal Ahamed and
Islam, Md. Ekramul and
Chowdhury, Md. Shahad Mahmud and
Chowdhury, Labib Imam and
Afrin, Sadia and
Ali, Hazrat and
Rashid, Mohammad Mamun Or and
Mohammed, Nabeel and
Amin, Mohammad Ruhul",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.380/",
doi = "10.18653/v1/2025.findings-naacl.380",
pages = "6807--6828",
ISBN = "979-8-89176-195-7",
abstract = "In this study, we introduce \textbf{BanNERD}, the most extensive human-annotated and validated \textbf{B}angla \textbf{N}amed \textbf{E}ntity \textbf{R}ecognition \textbf{D}ataset to date, comprising over 85,000 sentences. BanNERD is curated from a diverse array of sources, spanning over 29 domains, thereby offering a comprehensive range of generalized contexts. To ensure the dataset{'}s quality, expert linguists developed a detailed annotation guideline tailored to the Bangla language. All annotations underwent rigorous validation by a team of validators, with final labels being determined via majority voting, thereby ensuring the highest annotation quality and a high IAA score of 0.88. In a cross-dataset evaluation, models trained on BanNERD consistently outperformed those trained on four existing Bangla NER datasets. Additionally, we propose a method named \textit{BanNERCEM} (Bangla NER context-ensemble Method), which outperforms existing approaches on Bangla NER datasets and performs competitively on English datasets using lightweight Bangla pretrained LLMs. Our approach passes each context separately to the model instead of previous concatenation-based approaches, achieving the highest average macro F1 score of 81.85{\%} across 10 NER classes, outperforming previous approaches and ensuring better context utilization. We are making the code and datasets publicly available at \url{https://github.com/eblict-gigatech/BanNERD} in order to contribute to the further advancement of Bangla NLP."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mahtab-etal-2025-bannerd">
<titleInfo>
<title>BanNERD: A Benchmark Dataset and Context-Driven Approach for Bangla Named Entity Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Motahar</namePart>
<namePart type="family">Mahtab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Faisal</namePart>
<namePart type="given">Ahamed</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Ekramul</namePart>
<namePart type="family">Islam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Shahad</namePart>
<namePart type="given">Mahmud</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Labib</namePart>
<namePart type="given">Imam</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadia</namePart>
<namePart type="family">Afrin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hazrat</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Mamun</namePart>
<namePart type="given">Or</namePart>
<namePart type="family">Rashid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nabeel</namePart>
<namePart type="family">Mohammed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Ruhul</namePart>
<namePart type="family">Amin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>In this study, we introduce BanNERD, the most extensive human-annotated and validated Bangla Named Entity Recognition Dataset to date, comprising over 85,000 sentences. BanNERD is curated from a diverse array of sources, spanning over 29 domains, thereby offering a comprehensive range of generalized contexts. To ensure the dataset’s quality, expert linguists developed a detailed annotation guideline tailored to the Bangla language. All annotations underwent rigorous validation by a team of validators, with final labels being determined via majority voting, thereby ensuring the highest annotation quality and a high IAA score of 0.88. In a cross-dataset evaluation, models trained on BanNERD consistently outperformed those trained on four existing Bangla NER datasets. Additionally, we propose a method named BanNERCEM (Bangla NER context-ensemble Method), which outperforms existing approaches on Bangla NER datasets and performs competitively on English datasets using lightweight Bangla pretrained LLMs. Our approach passes each context separately to the model instead of previous concatenation-based approaches, achieving the highest average macro F1 score of 81.85% across 10 NER classes, outperforming previous approaches and ensuring better context utilization. We are making the code and datasets publicly available at https://github.com/eblict-gigatech/BanNERD in order to contribute to the further advancement of Bangla NLP.</abstract>
<identifier type="citekey">mahtab-etal-2025-bannerd</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.380</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.380/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>6807</start>
<end>6828</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BanNERD: A Benchmark Dataset and Context-Driven Approach for Bangla Named Entity Recognition
%A Mahtab, Md. Motahar
%A Khan, Faisal Ahamed
%A Islam, Md. Ekramul
%A Chowdhury, Md. Shahad Mahmud
%A Chowdhury, Labib Imam
%A Afrin, Sadia
%A Ali, Hazrat
%A Rashid, Mohammad Mamun Or
%A Mohammed, Nabeel
%A Amin, Mohammad Ruhul
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F mahtab-etal-2025-bannerd
%X In this study, we introduce BanNERD, the most extensive human-annotated and validated Bangla Named Entity Recognition Dataset to date, comprising over 85,000 sentences. BanNERD is curated from a diverse array of sources, spanning over 29 domains, thereby offering a comprehensive range of generalized contexts. To ensure the dataset’s quality, expert linguists developed a detailed annotation guideline tailored to the Bangla language. All annotations underwent rigorous validation by a team of validators, with final labels being determined via majority voting, thereby ensuring the highest annotation quality and a high IAA score of 0.88. In a cross-dataset evaluation, models trained on BanNERD consistently outperformed those trained on four existing Bangla NER datasets. Additionally, we propose a method named BanNERCEM (Bangla NER context-ensemble Method), which outperforms existing approaches on Bangla NER datasets and performs competitively on English datasets using lightweight Bangla pretrained LLMs. Our approach passes each context separately to the model instead of previous concatenation-based approaches, achieving the highest average macro F1 score of 81.85% across 10 NER classes, outperforming previous approaches and ensuring better context utilization. We are making the code and datasets publicly available at https://github.com/eblict-gigatech/BanNERD in order to contribute to the further advancement of Bangla NLP.
%R 10.18653/v1/2025.findings-naacl.380
%U https://aclanthology.org/2025.findings-naacl.380/
%U https://doi.org/10.18653/v1/2025.findings-naacl.380
%P 6807-6828
Markdown (Informal)
[BanNERD: A Benchmark Dataset and Context-Driven Approach for Bangla Named Entity Recognition](https://aclanthology.org/2025.findings-naacl.380/) (Mahtab et al., Findings 2025)
ACL
- Md. Motahar Mahtab, Faisal Ahamed Khan, Md. Ekramul Islam, Md. Shahad Mahmud Chowdhury, Labib Imam Chowdhury, Sadia Afrin, Hazrat Ali, Mohammad Mamun Or Rashid, Nabeel Mohammed, and Mohammad Ruhul Amin. 2025. BanNERD: A Benchmark Dataset and Context-Driven Approach for Bangla Named Entity Recognition. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 6807–6828, Albuquerque, New Mexico. Association for Computational Linguistics.