% Conference paper record for the ICON 2024 proceedings (see the url field).
% Proper nouns and acronyms are brace-protected as whole words so that
% styles which sentence-case titles keep their capitalisation.
@inproceedings{khargharia-borgohain-2024-aid,
    title     = {An Aid to {Assamese} Language Processing by Constructing an Offline {Assamese} Handwritten Dataset},
    author    = {Khargharia, Debabrata and
                 Borgohain, Samir Kumar},
    editor    = {Lalitha Devi, Sobha and
                 Arora, Karunesh},
    booktitle = {Proceedings of the 21st International Conference on Natural Language Processing ({ICON})},
    month     = dec,
    year      = {2024},
    address   = {AU-KBC Research Centre, Chennai, India},
    publisher = {NLP Association of India (NLPAI)},
    url       = {https://aclanthology.org/2024.icon-1.10/},
    pages     = {86--93},
    abstract  = {Recent years have seen a growing interest in analyzing Indian handwritten documents. In pattern recognition, particularly handwritten document recognition, the availability of standard databases is essential for assessing algorithm efficacy and facilitating result comparisons among research groups. However, there is a notable scarcity of standardized databases for handwritten texts in Indian languages. This paper presents a comprehensive methodology for the development of a novel, unconstrained dataset named OAHTD (Offline Assamese Handwritten Text Dataset) for the Assamese language, derived from offline handwritten documents. The dataset, which represents a significant contribution to the field of Optical Character Recognition (OCR) for handwritten Assamese, is the first of its kind in this domain. The corpus comprises 410 document images, each containing a diverse array of linguistic elements including words, numerals, individual characters, and various symbols. These documents were collected from a demographically diverse cohort of 300 contributors, spanning an age range of 10 to 76 years and representing varied educational backgrounds and genders. This meticulously curated collection aims to provide a robust foundation for the development and evaluation of OCR algorithms specifically tailored to the Assamese script, addressing a critical gap in the existing literature and resources for this language.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single bibliographic record. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="khargharia-borgohain-2024-aid">
    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>An Aid to Assamese Language Processing by Constructing an Offline Assamese Handwritten Dataset</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Debabrata</namePart>
      <namePart type="family">Khargharia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Samir</namePart>
      <namePart type="given">Kumar</namePart>
      <namePart type="family">Borgohain</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sobha</namePart>
        <namePart type="family">Lalitha Devi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Karunesh</namePart>
        <namePart type="family">Arora</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>NLP Association of India (NLPAI)</publisher>
        <place>
          <placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recent years have seen a growing interest in analyzing Indian handwritten documents. In pattern recognition, particularly handwritten document recognition, the availability of standard databases is essential for assessing algorithm efficacy and facilitating result comparisons among research groups. However, there is a notable scarcity of standardized databases for handwritten texts in Indian languages. This paper presents a comprehensive methodology for the development of a novel, unconstrained dataset named OAHTD (Offline Assamese Handwritten Text Dataset) for the Assamese language, derived from offline handwritten documents. The dataset, which represents a significant contribution to the field of Optical Character Recognition (OCR) for handwritten Assamese, is the first of its kind in this domain. The corpus comprises 410 document images, each containing a diverse array of linguistic elements including words, numerals, individual characters, and various symbols. These documents were collected from a demographically diverse cohort of 300 contributors, spanning an age range of 10 to 76 years and representing varied educational backgrounds and genders. This meticulously curated collection aims to provide a robust foundation for the development and evaluation of OCR algorithms specifically tailored to the Assamese script, addressing a critical gap in the existing literature and resources for this language.</abstract>
    <identifier type="citekey">khargharia-borgohain-2024-aid</identifier>
    <location>
      <url>https://aclanthology.org/2024.icon-1.10/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2024-12</date>
      <extent unit="page">
        <start>86</start>
        <end>93</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T An Aid to Assamese Language Processing by Constructing an Offline Assamese Handwritten Dataset
%A Khargharia, Debabrata
%A Borgohain, Samir Kumar
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F khargharia-borgohain-2024-aid
%X Recent years have seen a growing interest in analyzing Indian handwritten documents. In pattern recognition, particularly handwritten document recognition, the availability of standard databases is essential for assessing algorithm efficacy and facilitating result comparisons among research groups. However, there is a notable scarcity of standardized databases for handwritten texts in Indian languages. This paper presents a comprehensive methodology for the development of a novel, unconstrained dataset named OAHTD (Offline Assamese Handwritten Text Dataset) for the Assamese language, derived from offline handwritten documents. The dataset, which represents a significant contribution to the field of Optical Character Recognition (OCR) for handwritten Assamese, is the first of its kind in this domain. The corpus comprises 410 document images, each containing a diverse array of linguistic elements including words, numerals, individual characters, and various symbols. These documents were collected from a demographically diverse cohort of 300 contributors, spanning an age range of 10 to 76 years and representing varied educational backgrounds and genders. This meticulously curated collection aims to provide a robust foundation for the development and evaluation of OCR algorithms specifically tailored to the Assamese script, addressing a critical gap in the existing literature and resources for this language.
%U https://aclanthology.org/2024.icon-1.10/
%P 86-93
Markdown (Informal)
[An Aid to Assamese Language Processing by Constructing an Offline Assamese Handwritten Dataset](https://aclanthology.org/2024.icon-1.10/) (Khargharia & Borgohain, ICON 2024)
ACL