@inproceedings{yilmaz-etal-2026-ocrturk,
  title     = {{OCRT}urk: A Comprehensive {OCR} Benchmark for {T}urkish},
  author    = {Y{\i}lmaz, Deniz and
               Munis, Evren Ayberk and
               Toraman, Cagri and
               K{\"o}se, S{\"u}ha Ka{\u{g}}an and
               Akta{\c{s}}, Burak and
               Baytekin, Mehmet Can and
               G{\"o}r{\"u}r, Bilge Kaan},
  editor    = {Oflazer, Kemal and
               K{\"o}ksal, Abdullatif and
               Varol, Onur},
  booktitle = {Proceedings of the Second Workshop on Natural Language Processing for {T}urkic Languages ({SIGTURK} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.sigturk-1.16/},
  pages     = {197--208},
  isbn      = {979-8-89176-370-8},
  abstract  = {Document parsing is now widely used in applications, such as large-scale document digitization, retrieval-augmented generation, and domain-specific pipelines in healthcare and education. Benchmarking these models is crucial for assessing their reliability and practical robustness. Existing benchmarks mostly target high-resource languages and provide limited coverage for low-resource settings, such as Turkish. Moreover, existing studies on Turkish document parsing lack a standardized benchmark that reflects real-world scenarios and document diversity. To address this gap, we introduce OCRTurk, a Turkish document parsing benchmark covering multiple layout elements and document categories at three difficulty levels. OCRTurk consists of 180 Turkish documents drawn from academic articles, theses, slide decks, and non-academic articles. We evaluate seven OCR models on OCRTurk using element-wise metrics. Across difficulty levels, PaddleOCR achieves the strongest overall results, leading most element-wise metrics except figures and attaining the best Normalized Edit Distance scores in easy, medium, and hard subsets. We also observe performance variation by document type: models perform well on non-academic documents, while slideshows become the most challenging.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yilmaz-etal-2026-ocrturk">
<titleInfo>
<title>OCRTurk: A Comprehensive OCR Benchmark for Turkish</title>
</titleInfo>
<name type="personal">
<namePart type="given">Deniz</namePart>
<namePart type="family">Yılmaz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Evren</namePart>
<namePart type="given">Ayberk</namePart>
<namePart type="family">Munis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cagri</namePart>
<namePart type="family">Toraman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Süha</namePart>
<namePart type="given">Kağan</namePart>
<namePart type="family">Köse</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Burak</namePart>
<namePart type="family">Aktaş</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehmet</namePart>
<namePart type="given">Can</namePart>
<namePart type="family">Baytekin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bilge</namePart>
<namePart type="given">Kaan</namePart>
<namePart type="family">Görür</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kemal</namePart>
<namePart type="family">Oflazer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdullatif</namePart>
<namePart type="family">Köksal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Onur</namePart>
<namePart type="family">Varol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-370-8</identifier>
</relatedItem>
<abstract>Document parsing is now widely used in applications, such as large-scale document digitization, retrieval-augmented generation, and domain-specific pipelines in healthcare and education. Benchmarking these models is crucial for assessing their reliability and practical robustness. Existing benchmarks mostly target high-resource languages and provide limited coverage for low-resource settings, such as Turkish. Moreover, existing studies on Turkish document parsing lack a standardized benchmark that reflects real-world scenarios and document diversity. To address this gap, we introduce OCRTurk, a Turkish document parsing benchmark covering multiple layout elements and document categories at three difficulty levels. OCRTurk consists of 180 Turkish documents drawn from academic articles, theses, slide decks, and non-academic articles. We evaluate seven OCR models on OCRTurk using element-wise metrics. Across difficulty levels, PaddleOCR achieves the strongest overall results, leading most element-wise metrics except figures and attaining the best Normalized Edit Distance scores in easy, medium, and hard subsets. We also observe performance variation by document type: models perform well on non-academic documents, while slideshows become the most challenging.</abstract>
<identifier type="citekey">yilmaz-etal-2026-ocrturk</identifier>
<location>
<url>https://aclanthology.org/2026.sigturk-1.16/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>197</start>
<end>208</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T OCRTurk: A Comprehensive OCR Benchmark for Turkish
%A Yılmaz, Deniz
%A Munis, Evren Ayberk
%A Toraman, Cagri
%A Köse, Süha Kağan
%A Aktaş, Burak
%A Baytekin, Mehmet Can
%A Görür, Bilge Kaan
%Y Oflazer, Kemal
%Y Köksal, Abdullatif
%Y Varol, Onur
%S Proceedings of the Second Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-370-8
%F yilmaz-etal-2026-ocrturk
%X Document parsing is now widely used in applications, such as large-scale document digitization, retrieval-augmented generation, and domain-specific pipelines in healthcare and education. Benchmarking these models is crucial for assessing their reliability and practical robustness. Existing benchmarks mostly target high-resource languages and provide limited coverage for low-resource settings, such as Turkish. Moreover, existing studies on Turkish document parsing lack a standardized benchmark that reflects real-world scenarios and document diversity. To address this gap, we introduce OCRTurk, a Turkish document parsing benchmark covering multiple layout elements and document categories at three difficulty levels. OCRTurk consists of 180 Turkish documents drawn from academic articles, theses, slide decks, and non-academic articles. We evaluate seven OCR models on OCRTurk using element-wise metrics. Across difficulty levels, PaddleOCR achieves the strongest overall results, leading most element-wise metrics except figures and attaining the best Normalized Edit Distance scores in easy, medium, and hard subsets. We also observe performance variation by document type: models perform well on non-academic documents, while slideshows become the most challenging.
%U https://aclanthology.org/2026.sigturk-1.16/
%P 197-208
Markdown (Informal)
[OCRTurk: A Comprehensive OCR Benchmark for Turkish](https://aclanthology.org/2026.sigturk-1.16/) (Yılmaz et al., SIGTURK 2026)
ACL
- Deniz Yılmaz, Evren Ayberk Munis, Cagri Toraman, Süha Kağan Köse, Burak Aktaş, Mehmet Can Baytekin, and Bilge Kaan Görür. 2026. OCRTurk: A Comprehensive OCR Benchmark for Turkish. In Proceedings of the Second Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2026), pages 197–208, Rabat, Morocco. Association for Computational Linguistics.