% Conference paper from the UNLP 2026 proceedings (ACL Anthology 2026.unlp-1.7).
% Braces inside title/booktitle protect whole proper nouns and acronyms from
% being recased by bibliography styles.
@inproceedings{chaplynskyi-dydyk-meush-2026-digitizing,
  title     = {Digitizing {Old} {Ukrainian} Texts: A Prompt-Based {OCR} Pipeline and Evaluation Dataset},
  author    = {Chaplynskyi, Dmytro and Dydyk-Meush, Hanna},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Fifth {Ukrainian} Natural Language Processing Conference ({UNLP} 2026)},
  month     = may,
  year      = {2026},
  address   = {Lviv, Ukraine},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.unlp-1.7/},
  pages     = {58--66},
  isbn      = {979-8-89176-359-3},
  abstract  = {We present a methodology and an open dataset for OCR of handwritten index cards containing a scholarly transcription of an early 17th-century Ukrainian polemical text, Perestoroha by Iov Boretskyi (Lviv, 1605{--}1606). The 430 cards, produced by 20th-century researchers, preserve the text in Old Ukrainian orthography with archaic diacritics, titlos, superscript letters, and ligatures that make automated recognition non-trivial. We develop a prompt-based OCR pipeline driven by a custom instruction set designed iteratively from the source material{'}s orthographic conventions. The pipeline is evaluated against human-proofread ground truth in proprietary and open-source configurations using identical instructions and evaluation data. The proprietary configuration with extended thinking at maximum budget (Claude Opus 4.7, xhigh) achieves a Character Error Rate of 2.5{\%}; an Opus 4.6 baseline at the default 2,048-token thinking budget {---} used for the first batch of the released dataset {---} reaches 4.2{\%}; and two open-source Qwen3.6 variants running locally on consumer hardware reach 14.6{\%} (dense 27B) and 14.8{\%} (35B-A3B MoE). We release the fully digitized text aligned at line level to 300 DPI scanned images, as both a scholarly digital resource and training data for future OCR systems targeting Old Slavic manuscripts.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for a single paper; the ID matches the citekey identifier below. -->
  <mods ID="chaplynskyi-dydyk-meush-2026-digitizing">
    <titleInfo>
      <title>Digitizing Old Ukrainian Texts: A Prompt-Based OCR Pipeline and Evaluation Dataset</title>
    </titleInfo>
    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Dmytro</namePart>
      <namePart type="family">Chaplynskyi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hanna</namePart>
      <namePart type="family">Dydyk-Meush</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume that contains the paper. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mariana</namePart>
        <namePart type="family">Romanyshyn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Lviv, Ukraine</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-359-3</identifier>
    </relatedItem>
    <abstract>We present a methodology and an open dataset for OCR of handwritten index cards containing a scholarly transcription of an early 17th-century Ukrainian polemical text, Perestoroha by Iov Boretskyi (Lviv, 1605–1606). The 430 cards, produced by 20th-century researchers, preserve the text in Old Ukrainian orthography with archaic diacritics, titlos, superscript letters, and ligatures that make automated recognition non-trivial. We develop a prompt-based OCR pipeline driven by a custom instruction set designed iteratively from the source material’s orthographic conventions. The pipeline is evaluated against human-proofread ground truth in proprietary and open-source configurations using identical instructions and evaluation data. The proprietary configuration with extended thinking at maximum budget (Claude Opus 4.7, xhigh) achieves a Character Error Rate of 2.5%; an Opus 4.6 baseline at the default 2,048-token thinking budget — used for the first batch of the released dataset — reaches 4.2%; and two open-source Qwen3.6 variants running locally on consumer hardware reach 14.6% (dense 27B) and 14.8% (35B-A3B MoE). We release the fully digitized text aligned at line level to 300 DPI scanned images, as both a scholarly digital resource and training data for future OCR systems targeting Old Slavic manuscripts.</abstract>
    <identifier type="citekey">chaplynskyi-dydyk-meush-2026-digitizing</identifier>
    <location>
      <url>https://aclanthology.org/2026.unlp-1.7/</url>
    </location>
    <!-- Position of the paper within the host volume: issue date and page extent. -->
    <part>
      <date>2026-05</date>
      <extent unit="page">
        <start>58</start>
        <end>66</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Digitizing Old Ukrainian Texts: A Prompt-Based OCR Pipeline and Evaluation Dataset
%A Chaplynskyi, Dmytro
%A Dydyk-Meush, Hanna
%Y Romanyshyn, Mariana
%S Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)
%D 2026
%8 May
%I Association for Computational Linguistics
%C Lviv, Ukraine
%@ 979-8-89176-359-3
%F chaplynskyi-dydyk-meush-2026-digitizing
%X We present a methodology and an open dataset for OCR of handwritten index cards containing a scholarly transcription of an early 17th-century Ukrainian polemical text, Perestoroha by Iov Boretskyi (Lviv, 1605–1606). The 430 cards, produced by 20th-century researchers, preserve the text in Old Ukrainian orthography with archaic diacritics, titlos, superscript letters, and ligatures that make automated recognition non-trivial. We develop a prompt-based OCR pipeline driven by a custom instruction set designed iteratively from the source material’s orthographic conventions. The pipeline is evaluated against human-proofread ground truth in proprietary and open-source configurations using identical instructions and evaluation data. The proprietary configuration with extended thinking at maximum budget (Claude Opus 4.7, xhigh) achieves a Character Error Rate of 2.5%; an Opus 4.6 baseline at the default 2,048-token thinking budget — used for the first batch of the released dataset — reaches 4.2%; and two open-source Qwen3.6 variants running locally on consumer hardware reach 14.6% (dense 27B) and 14.8% (35B-A3B MoE). We release the fully digitized text aligned at line level to 300 DPI scanned images, as both a scholarly digital resource and training data for future OCR systems targeting Old Slavic manuscripts.
%U https://aclanthology.org/2026.unlp-1.7/
%P 58-66
Markdown (Informal)
[Digitizing Old Ukrainian Texts: A Prompt-Based OCR Pipeline and Evaluation Dataset](https://aclanthology.org/2026.unlp-1.7/) (Chaplynskyi & Dydyk-Meush, UNLP 2026)
ACL