@inproceedings{miller-etal-2025-systematic,
title = "Systematic Textual Availability of Manuscripts",
author = "Miller, Hadar and
Londner, Samuel and
Kuflik, Tsvi and
Shapira, Daria Vasyutinsky and
Dershowitz, Nachum and
Lavee, Moshe",
editor = "Alam, Mehwish and
Tchechmedjiev, Andon and
Gracia, Jorge and
Gromann, Dagmar and
di Buono, Maria Pia and
Monti, Johanna and
Ionov, Maxim",
booktitle = "Proceedings of the 5th Conference on Language, Data and Knowledge",
month = sep,
year = "2025",
address = "Naples, Italy",
publisher = "Unior Press",
url = "https://aclanthology.org/2025.ldk-1.18/",
pages = "162--173",
ISBN = "978-88-6719-333-2",
abstract = "The digital era has made millions of manuscript images in Hebrew available to all. However, despite major advancements in handwritten text recognition over the past decade, an efficient pipeline for large scale and accurate conversion of these manuscripts into useful machine-readable form is still sorely lacking. We propose a pipeline that significantly improves recognition models for automatic transcription of Hebrew manuscripts. Transfer learning is used to fine-tune pretrained models. For post-recognition correction, it leverages text reuse, a common phenomenon in medieval manuscripts, and state-of-the-art large language models for medieval Hebrew. The framework successfully handles noisy transcriptions and consistently suggests alternate, better readings. Initial results show that word level accuracy increased by 10{\%} for new readings proposed by text-reuse detection. Moreover, the character level accuracy improved by 18{\%} by fine-tuning models on the first few pages of each manuscript."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="miller-etal-2025-systematic">
<titleInfo>
<title>Systematic Textual Availability of Manuscripts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hadar</namePart>
<namePart type="family">Miller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Londner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsvi</namePart>
<namePart type="family">Kuflik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daria</namePart>
<namePart type="given">Vasyutinsky</namePart>
<namePart type="family">Shapira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nachum</namePart>
<namePart type="family">Dershowitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moshe</namePart>
<namePart type="family">Lavee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Conference on Language, Data and Knowledge</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mehwish</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andon</namePart>
<namePart type="family">Tchechmedjiev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="family">Gracia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dagmar</namePart>
<namePart type="family">Gromann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Pia</namePart>
<namePart type="family">di Buono</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johanna</namePart>
<namePart type="family">Monti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxim</namePart>
<namePart type="family">Ionov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Unior Press</publisher>
<place>
<placeTerm type="text">Naples, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-88-6719-333-2</identifier>
</relatedItem>
<abstract>The digital era has made millions of manuscript images in Hebrew available to all. However, despite major advancements in handwritten text recognition over the past decade, an efficient pipeline for large scale and accurate conversion of these manuscripts into useful machine-readable form is still sorely lacking. We propose a pipeline that significantly improves recognition models for automatic transcription of Hebrew manuscripts. Transfer learning is used to fine-tune pretrained models. For post-recognition correction, it leverages text reuse, a common phenomenon in medieval manuscripts, and state-of-the-art large language models for medieval Hebrew. The framework successfully handles noisy transcriptions and consistently suggests alternate, better readings. Initial results show that word level accuracy increased by 10% for new readings proposed by text-reuse detection. Moreover, the character level accuracy improved by 18% by fine-tuning models on the first few pages of each manuscript.</abstract>
<identifier type="citekey">miller-etal-2025-systematic</identifier>
<location>
<url>https://aclanthology.org/2025.ldk-1.18/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>162</start>
<end>173</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Systematic Textual Availability of Manuscripts
%A Miller, Hadar
%A Londner, Samuel
%A Kuflik, Tsvi
%A Shapira, Daria Vasyutinsky
%A Dershowitz, Nachum
%A Lavee, Moshe
%Y Alam, Mehwish
%Y Tchechmedjiev, Andon
%Y Gracia, Jorge
%Y Gromann, Dagmar
%Y di Buono, Maria Pia
%Y Monti, Johanna
%Y Ionov, Maxim
%S Proceedings of the 5th Conference on Language, Data and Knowledge
%D 2025
%8 September
%I Unior Press
%C Naples, Italy
%@ 978-88-6719-333-2
%F miller-etal-2025-systematic
%X The digital era has made millions of manuscript images in Hebrew available to all. However, despite major advancements in handwritten text recognition over the past decade, an efficient pipeline for large scale and accurate conversion of these manuscripts into useful machine-readable form is still sorely lacking. We propose a pipeline that significantly improves recognition models for automatic transcription of Hebrew manuscripts. Transfer learning is used to fine-tune pretrained models. For post-recognition correction, it leverages text reuse, a common phenomenon in medieval manuscripts, and state-of-the-art large language models for medieval Hebrew. The framework successfully handles noisy transcriptions and consistently suggests alternate, better readings. Initial results show that word level accuracy increased by 10% for new readings proposed by text-reuse detection. Moreover, the character level accuracy improved by 18% by fine-tuning models on the first few pages of each manuscript.
%U https://aclanthology.org/2025.ldk-1.18/
%P 162-173
Markdown (Informal)
[Systematic Textual Availability of Manuscripts](https://aclanthology.org/2025.ldk-1.18/) (Miller et al., LDK 2025)
ACL
- Hadar Miller, Samuel Londner, Tsvi Kuflik, Daria Vasyutinsky Shapira, Nachum Dershowitz, and Moshe Lavee. 2025. Systematic Textual Availability of Manuscripts. In Proceedings of the 5th Conference on Language, Data and Knowledge, pages 162–173, Naples, Italy. Unior Press.