@inproceedings{ali-etal-2023-gold,
title = "Gold Standard {B}angla {OCR} Dataset: An In-Depth Look at Data Preprocessing and Annotation Processes",
author = "Ali, Hasmot and
Rabby, AKM Shahariar Azad and
Islam, Md Majedul and
Mahamud, A.k.m and
Hasan, Nazmul and
Rahman, Fuad",
editor = "Wang, Mingxuan and
Zitouni, Imed",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-industry.44",
doi = "10.18653/v1/2023.emnlp-industry.44",
pages = "460--470",
abstract = "This research paper focuses on developing an improved Bangla Optical Character Recognition (OCR) system, addressing the challenges posed by the complexity of Bangla text structure, diverse handwriting styles, and the scarcity of comprehensive datasets. Leveraging recent advancements in Deep Learning and OCR techniques, we anticipate a significant enhancement in the performance of Bangla OCR by utilizing a large and diverse collection of labeled Bangla text image datasets. This study introduces the most extensive gold standard corpus for Bangla characters and words, comprising over 4 million human-annotated images. Our dataset encompasses various document types, such as Computer Compose, Letterpress, Typewriters, Outdoor Banner-Poster, and Handwritten documents, gathered from diverse sources. The entire corpus has undergone meticulous human annotation, employing a controlled annotation procedure consisting of three-step annotation and one-step validation, ensuring adherence to gold standard criteria. This paper provides a comprehensive overview of the complete data collection procedure. The ICT Division, Government of the People{'}s Republic of Bangladesh, will make the dataset publicly available, facilitating further research and development in Bangla OCR and related domains.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ali-etal-2023-gold">
<titleInfo>
<title>Gold Standard Bangla OCR Dataset: An In-Depth Look at Data Preprocessing and Annotation Processes</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hasmot</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">AKM</namePart>
<namePart type="given">Shahariar</namePart>
<namePart type="given">Azad</namePart>
<namePart type="family">Rabby</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Majedul</namePart>
<namePart type="family">Islam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">A.k.m</namePart>
<namePart type="family">Mahamud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nazmul</namePart>
<namePart type="family">Hasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fuad</namePart>
<namePart type="family">Rahman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mingxuan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This research paper focuses on developing an improved Bangla Optical Character Recognition (OCR) system, addressing the challenges posed by the complexity of Bangla text structure, diverse handwriting styles, and the scarcity of comprehensive datasets. Leveraging recent advancements in Deep Learning and OCR techniques, we anticipate a significant enhancement in the performance of Bangla OCR by utilizing a large and diverse collection of labeled Bangla text image datasets. This study introduces the most extensive gold standard corpus for Bangla characters and words, comprising over 4 million human-annotated images. Our dataset encompasses various document types, such as Computer Compose, Letterpress, Typewriters, Outdoor Banner-Poster, and Handwritten documents, gathered from diverse sources. The entire corpus has undergone meticulous human annotation, employing a controlled annotation procedure consisting of three-step annotation and one-step validation, ensuring adherence to gold standard criteria. This paper provides a comprehensive overview of the complete data collection procedure. The ICT Division, Government of the People’s Republic of Bangladesh, will make the dataset publicly available, facilitating further research and development in Bangla OCR and related domains.</abstract>
<identifier type="citekey">ali-etal-2023-gold</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-industry.44</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-industry.44</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>460</start>
<end>470</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Gold Standard Bangla OCR Dataset: An In-Depth Look at Data Preprocessing and Annotation Processes
%A Ali, Hasmot
%A Rabby, AKM Shahariar Azad
%A Islam, Md Majedul
%A Mahamud, A.k.m
%A Hasan, Nazmul
%A Rahman, Fuad
%Y Wang, Mingxuan
%Y Zitouni, Imed
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F ali-etal-2023-gold
%X This research paper focuses on developing an improved Bangla Optical Character Recognition (OCR) system, addressing the challenges posed by the complexity of Bangla text structure, diverse handwriting styles, and the scarcity of comprehensive datasets. Leveraging recent advancements in Deep Learning and OCR techniques, we anticipate a significant enhancement in the performance of Bangla OCR by utilizing a large and diverse collection of labeled Bangla text image datasets. This study introduces the most extensive gold standard corpus for Bangla characters and words, comprising over 4 million human-annotated images. Our dataset encompasses various document types, such as Computer Compose, Letterpress, Typewriters, Outdoor Banner-Poster, and Handwritten documents, gathered from diverse sources. The entire corpus has undergone meticulous human annotation, employing a controlled annotation procedure consisting of three-step annotation and one-step validation, ensuring adherence to gold standard criteria. This paper provides a comprehensive overview of the complete data collection procedure. The ICT Division, Government of the People’s Republic of Bangladesh, will make the dataset publicly available, facilitating further research and development in Bangla OCR and related domains.
%R 10.18653/v1/2023.emnlp-industry.44
%U https://aclanthology.org/2023.emnlp-industry.44
%U https://doi.org/10.18653/v1/2023.emnlp-industry.44
%P 460-470
Markdown (Informal)
[Gold Standard Bangla OCR Dataset: An In-Depth Look at Data Preprocessing and Annotation Processes](https://aclanthology.org/2023.emnlp-industry.44) (Ali et al., EMNLP 2023)
ACL