% Workshop paper in the BHASHA 2025 proceedings; fields follow the ACL Anthology record at the url below.
@inproceedings{sharma-etal-2025-ancidev,
  title     = {{AnciDev}: A Dataset for High-Accuracy Handwritten Text Recognition of {Ancient} {Devanagari} Manuscripts},
  author    = {Sharma, Vriti and
               Verma, Rajat and
               Saluja, Rohit},
  editor    = {Bhattacharya, Arnab and
               Goyal, Pawan and
               Ghosh, Saptarshi and
               Ghosh, Kripabandhu},
  booktitle = {Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric AI in Indian Languages (BHASHA 2025)},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.bhasha-1.8/},
  pages     = {91--101},
  isbn      = {979-8-89176-313-5},
  abstract  = {The digital preservation and accessibility of historical documents require accurate and scalable Handwritten Text Recognition (HTR). However, progress in this field is significantly hampered for low-resource scripts, such as ancient forms of the scripts used in historical manuscripts, due to the scarcity of high-quality, transcribed training data. We address this critical gap by introducing the AnciDev Dataset, a novel, publicly available resource comprising 3,000 transcribed text lines sourced from 500 pages of different ancient Devanagari manuscripts. To validate the utility of this new resource, we systematically evaluate and fine-tune several HTR models on the AnciDev Dataset. Our experiments demonstrate a significant performance uplift across all fine-tuned models, with the best-performing architecture achieving a substantial reduction in Character Error Rate (CER), confirming the dataset{'}s efficacy in addressing the unique complexities of ancient handwriting. This work not only provides a crucial, well-curated dataset to the research community but also sets a new, reproducible state-of-the-art for the HTR of historical Devanagari, advancing the effort to digitally preserve India{'}s documentary heritage.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="sharma-etal-2025-ancidev">
    <titleInfo>
      <title>AnciDev: A Dataset for High-Accuracy Handwritten Text Recognition of Ancient Devanagari Manuscripts</title>
    </titleInfo>
    <!-- Authors, in citation order. -->
    <name type="personal">
      <namePart type="given">Vriti</namePart>
      <namePart type="family">Sharma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rajat</namePart>
      <namePart type="family">Verma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rohit</namePart>
      <namePart type="family">Saluja</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric AI in Indian Languages (BHASHA 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Arnab</namePart>
        <namePart type="family">Bhattacharya</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pawan</namePart>
        <namePart type="family">Goyal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Saptarshi</namePart>
        <namePart type="family">Ghosh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kripabandhu</namePart>
        <namePart type="family">Ghosh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Mumbai, India</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-313-5</identifier>
    </relatedItem>
    <abstract>The digital preservation and accessibility of historical documents require accurate and scalable Handwritten Text Recognition (HTR). However, progress in this field is significantly hampered for low-resource scripts, such as ancient forms of the scripts used in historical manuscripts, due to the scarcity of high-quality, transcribed training data. We address this critical gap by introducing the AnciDev Dataset, a novel, publicly available resource comprising 3,000 transcribed text lines sourced from 500 pages of different ancient Devanagari manuscripts. To validate the utility of this new resource, we systematically evaluate and fine-tune several HTR models on the AnciDev Dataset. Our experiments demonstrate a significant performance uplift across all fine-tuned models, with the best-performing architecture achieving a substantial reduction in Character Error Rate (CER), confirming the dataset’s efficacy in addressing the unique complexities of ancient handwriting. This work not only provides a crucial, well-curated dataset to the research community but also sets a new, reproducible state-of-the-art for the HTR of historical Devanagari, advancing the effort to digitally preserve India’s documentary heritage.</abstract>
    <identifier type="citekey">sharma-etal-2025-ancidev</identifier>
    <location>
      <url>https://aclanthology.org/2025.bhasha-1.8/</url>
    </location>
    <!-- Issue date and page extent of this paper. -->
    <part>
      <date>2025-12</date>
      <extent unit="page">
        <start>91</start>
        <end>101</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T AnciDev: A Dataset for High-Accuracy Handwritten Text Recognition of Ancient Devanagari Manuscripts
%A Sharma, Vriti
%A Verma, Rajat
%A Saluja, Rohit
%Y Bhattacharya, Arnab
%Y Goyal, Pawan
%Y Ghosh, Saptarshi
%Y Ghosh, Kripabandhu
%S Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric AI in Indian Languages (BHASHA 2025)
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-313-5
%F sharma-etal-2025-ancidev
%X The digital preservation and accessibility of historical documents require accurate and scalable Handwritten Text Recognition (HTR). However, progress in this field is significantly hampered for low-resource scripts, such as ancient forms of the scripts used in historical manuscripts, due to the scarcity of high-quality, transcribed training data. We address this critical gap by introducing the AnciDev Dataset, a novel, publicly available resource comprising 3,000 transcribed text lines sourced from 500 pages of different ancient Devanagari manuscripts. To validate the utility of this new resource, we systematically evaluate and fine-tune several HTR models on the AnciDev Dataset. Our experiments demonstrate a significant performance uplift across all fine-tuned models, with the best-performing architecture achieving a substantial reduction in Character Error Rate (CER), confirming the dataset’s efficacy in addressing the unique complexities of ancient handwriting. This work not only provides a crucial, well-curated dataset to the research community but also sets a new, reproducible state-of-the-art for the HTR of historical Devanagari, advancing the effort to digitally preserve India’s documentary heritage.
%U https://aclanthology.org/2025.bhasha-1.8/
%P 91-101
Markdown (Informal)
[AnciDev: A Dataset for High-Accuracy Handwritten Text Recognition of Ancient Devanagari Manuscripts](https://aclanthology.org/2025.bhasha-1.8/) (Sharma et al., BHASHA 2025)
ACL