@inproceedings{kanerva-etal-2025-ocr,
title = "{OCR} Error Post-Correction with {LLM}s in Historical Documents: No Free Lunches",
author = {Kanerva, Jenna and
Ledins, Cassandra and
K{\"a}pyaho, Siiri and
Ginter, Filip},
editor = "Holdt, {\v{S}}pela Arhar and
Ilinykh, Nikolai and
Scalvini, Barbara and
Bruton, Micaella and
Debess, Iben Nyholm and
Tudor, Crina Madalina",
booktitle = "Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)",
month = mar,
year = "2025",
address = "Tallinn, Estonia",
publisher = "University of Tartu Library, Estonia",
url = "https://aclanthology.org/2025.resourceful-1.8/",
pages = "38--47",
ISBN = "978-9908-53-121-2",
abstract = "Optical Character Recognition (OCR) systems often introduce errors when transcribing historical documents, leaving room for post-correction to improve text quality. This study evaluates the use of open-weight LLMs for OCR error correction in historical English and Finnish datasets. We explore various strategies, including parameter optimization, quantization, segment length effects, and text continuation methods. Our results demonstrate that while modern LLMs show promise in reducing character error rates (CER) in English, a practically useful performance for Finnish was not reached. Our findings highlight the potential and limitations of LLMs in scaling OCR post-correction for large historical corpora."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kanerva-etal-2025-ocr">
<titleInfo>
<title>OCR Error Post-Correction with LLMs in Historical Documents: No Free Lunches</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jenna</namePart>
<namePart type="family">Kanerva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cassandra</namePart>
<namePart type="family">Ledins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siiri</namePart>
<namePart type="family">Käpyaho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Filip</namePart>
<namePart type="family">Ginter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Špela</namePart>
<namePart type="given">Arhar</namePart>
<namePart type="family">Holdt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolai</namePart>
<namePart type="family">Ilinykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Scalvini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Micaella</namePart>
<namePart type="family">Bruton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iben</namePart>
<namePart type="given">Nyholm</namePart>
<namePart type="family">Debess</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Crina</namePart>
<namePart type="given">Madalina</namePart>
<namePart type="family">Tudor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library, Estonia</publisher>
<place>
<placeTerm type="text">Tallinn, Estonia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-9908-53-121-2</identifier>
</relatedItem>
<abstract>Optical Character Recognition (OCR) systems often introduce errors when transcribing historical documents, leaving room for post-correction to improve text quality. This study evaluates the use of open-weight LLMs for OCR error correction in historical English and Finnish datasets. We explore various strategies, including parameter optimization, quantization, segment length effects, and text continuation methods. Our results demonstrate that while modern LLMs show promise in reducing character error rates (CER) in English, a practically useful performance for Finnish was not reached. Our findings highlight the potential and limitations of LLMs in scaling OCR post-correction for large historical corpora.</abstract>
<identifier type="citekey">kanerva-etal-2025-ocr</identifier>
<location>
<url>https://aclanthology.org/2025.resourceful-1.8/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>38</start>
<end>47</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T OCR Error Post-Correction with LLMs in Historical Documents: No Free Lunches
%A Kanerva, Jenna
%A Ledins, Cassandra
%A Käpyaho, Siiri
%A Ginter, Filip
%Y Holdt, Špela Arhar
%Y Ilinykh, Nikolai
%Y Scalvini, Barbara
%Y Bruton, Micaella
%Y Debess, Iben Nyholm
%Y Tudor, Crina Madalina
%S Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)
%D 2025
%8 March
%I University of Tartu Library, Estonia
%C Tallinn, Estonia
%@ 978-9908-53-121-2
%F kanerva-etal-2025-ocr
%X Optical Character Recognition (OCR) systems often introduce errors when transcribing historical documents, leaving room for post-correction to improve text quality. This study evaluates the use of open-weight LLMs for OCR error correction in historical English and Finnish datasets. We explore various strategies, including parameter optimization, quantization, segment length effects, and text continuation methods. Our results demonstrate that while modern LLMs show promise in reducing character error rates (CER) in English, a practically useful performance for Finnish was not reached. Our findings highlight the potential and limitations of LLMs in scaling OCR post-correction for large historical corpora.
%U https://aclanthology.org/2025.resourceful-1.8/
%P 38-47
Markdown (Informal)
[OCR Error Post-Correction with LLMs in Historical Documents: No Free Lunches](https://aclanthology.org/2025.resourceful-1.8/) (Kanerva et al., RESOURCEFUL 2025)
ACL