% Conference paper in the NoDaLiDa/Baltic-HLT 2025 proceedings (pp. 98--108).
% Proper nouns in the title are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{enstad-etal-2025-comparative,
  title     = {Comparative analysis of optical character recognition methods for {S{\'a}mi} texts from the {National} {Library} of {Norway}},
  author    = {Enstad, Tita and
               Trosterud, Trond and
               R{\o}sok, Marie Iversdatter and
               Beyer, Yngvil and
               Roald, Marie},
  editor    = {Johansson, Richard and
               Stymne, Sara},
  booktitle = {Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)},
  month     = mar,
  year      = {2025},
  address   = {Tallinn, Estonia},
  publisher = {University of Tartu Library},
  url       = {https://aclanthology.org/2025.nodalida-1.11/},
  pages     = {98--108},
  isbn      = {978-9908-53-109-0},
  abstract  = {Optical Character Recognition (OCR) is crucial to the National Library of Norway{'}s (NLN) digitisation process as it converts scanned documents into machine-readable text. However, for the S{\'a}mi documents in NLN{'}s collection, the OCR accuracy is insufficient. Given that OCR quality affects downstream processes, evaluating and improving OCR for text written in S{\'a}mi languages is necessary to make these resources accessible. To address this need, this work fine-tunes and evaluates three established OCR approaches, Transkribus, Tesseract and TrOCR, for transcribing S{\'a}mi texts from NLN{'}s collection. Our results show that Transkribus and TrOCR outperform Tesseract on this task, while Tesseract achieves superior performance on an out-of-domain dataset. Furthermore, we show that fine-tuning pre-trained models and supplementing manual annotations with machine annotations and synthetic text images can yield accurate OCR for S{\'a}mi languages, even with a moderate amount of manually annotated data.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- One MODS record for the paper; the host proceedings volume is nested in relatedItem. -->
  <mods ID="enstad-etal-2025-comparative">
    <titleInfo>
      <title>Comparative analysis of optical character recognition methods for Sámi texts from the National Library of Norway</title>
    </titleInfo>
    <!-- Authors -->
    <name type="personal">
      <namePart type="given">Tita</namePart>
      <namePart type="family">Enstad</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Trond</namePart>
      <namePart type="family">Trosterud</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marie</namePart>
      <namePart type="given">Iversdatter</namePart>
      <namePart type="family">Røsok</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yngvil</namePart>
      <namePart type="family">Beyer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marie</namePart>
      <namePart type="family">Roald</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Richard</namePart>
        <namePart type="family">Johansson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sara</namePart>
        <namePart type="family">Stymne</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>University of Tartu Library</publisher>
        <place>
          <placeTerm type="text">Tallinn, Estonia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">978-9908-53-109-0</identifier>
    </relatedItem>
    <abstract>Optical Character Recognition (OCR) is crucial to the National Library of Norway’s (NLN) digitisation process as it converts scanned documents into machine-readable text. However, for the Sámi documents in NLN’s collection, the OCR accuracy is insufficient. Given that OCR quality affects downstream processes, evaluating and improving OCR for text written in Sámi languages is necessary to make these resources accessible. To address this need, this work fine-tunes and evaluates three established OCR approaches, Transkribus, Tesseract and TrOCR, for transcribing Sámi texts from NLN’s collection. Our results show that Transkribus and TrOCR outperform Tesseract on this task, while Tesseract achieves superior performance on an out-of-domain dataset. Furthermore, we show that fine-tuning pre-trained models and supplementing manual annotations with machine annotations and synthetic text images can yield accurate OCR for Sámi languages, even with a moderate amount of manually annotated data.</abstract>
    <identifier type="citekey">enstad-etal-2025-comparative</identifier>
    <location>
      <url>https://aclanthology.org/2025.nodalida-1.11/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2025-03</date>
      <extent unit="page">
        <start>98</start>
        <end>108</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Comparative analysis of optical character recognition methods for Sámi texts from the National Library of Norway
%A Enstad, Tita
%A Trosterud, Trond
%A Røsok, Marie Iversdatter
%A Beyer, Yngvil
%A Roald, Marie
%Y Johansson, Richard
%Y Stymne, Sara
%S Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-109-0
%F enstad-etal-2025-comparative
%X Optical Character Recognition (OCR) is crucial to the National Library of Norway’s (NLN) digitisation process as it converts scanned documents into machine-readable text. However, for the Sámi documents in NLN’s collection, the OCR accuracy is insufficient. Given that OCR quality affects downstream processes, evaluating and improving OCR for text written in Sámi languages is necessary to make these resources accessible. To address this need, this work fine-tunes and evaluates three established OCR approaches, Transkribus, Tesseract and TrOCR, for transcribing Sámi texts from NLN’s collection. Our results show that Transkribus and TrOCR outperform Tesseract on this task, while Tesseract achieves superior performance on an out-of-domain dataset. Furthermore, we show that fine-tuning pre-trained models and supplementing manual annotations with machine annotations and synthetic text images can yield accurate OCR for Sámi languages, even with a moderate amount of manually annotated data.
%U https://aclanthology.org/2025.nodalida-1.11/
%P 98-108
Markdown (Informal)
[Comparative analysis of optical character recognition methods for Sámi texts from the National Library of Norway](https://aclanthology.org/2025.nodalida-1.11/) (Enstad et al., NoDaLiDa 2025)
ACL