@inproceedings{nazeem-etal-2024-open,
    title = "Open-Source {OCR} Libraries: A Comprehensive Study for Low Resource Language",
    author = "Nazeem, Meharuniza and
      R, Anitha and
      S, Navaneeth and
      R. R, Rajeev",
    editor = "Lalitha Devi, Sobha and
      Arora, Karunesh",
    booktitle = "Proceedings of the 21st International Conference on Natural Language Processing (ICON)",
    month = dec,
    year = "2024",
    address = "AU-KBC Research Centre, Chennai, India",
    publisher = "NLP Association of India (NLPAI)",
    url = "https://aclanthology.org/2024.icon-1.48/",
    pages = "416--421",
    abstract = "This paper reviews numerous OCR programs and libraries employed for optical character recognition tasks. Tesseract OCR, an open-source program that supports multiple languages and image formats, is highlighted for its accuracy and adaptability. Python-based libraries like EasyOCR, MMOCR, and PaddleOCR are also mentioned, which provide user-friendly interfaces and trained models for text extraction, detection, and recognition. EasyOCR emphasizes ease of use and simplicity, while MMOCR and PaddleOCR offer comprehensive OCR capabilities and support for a wide range of languages. According to our study, which evaluates various OCR libraries, Tesseract OCR performs remarkably well in terms of accuracy for Indian languages like Malayalam. We focused on five OCR libraries{---}Tesseract OCR, MMOCR, PaddleOCR, EasyOCR, and Keras OCR{---}and tested them across several languages, including English, Hindi, Arabic, Tamil, and Malayalam. During our comparison, we found that Tesseract OCR was the only library that supported the Malayalam language. While the other libraries did not support Malayalam, Tesseract OCR performed well across all tested languages, achieving accuracy rates of 92{\%} in English, 93{\%} in Hindi, 78{\%} in Tamil, 74{\%} in Arabic, and 93{\%} in Malayalam."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nazeem-etal-2024-open">
<titleInfo>
<title>Open-Source OCR Libraries: A Comprehensive Study for Low Resource Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Meharuniza</namePart>
<namePart type="family">Nazeem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anitha</namePart>
<namePart type="family">R</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Navaneeth</namePart>
<namePart type="family">S</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajeev</namePart>
<namePart type="family">R. R</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karunesh</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper reviews numerous OCR programs and libraries employed for optical character recognition tasks. Tesseract OCR, an open-source program that supports multiple languages and image formats, is highlighted for its accuracy and adaptability. Python-based libraries like EasyOCR, MMOCR, and PaddleOCR are also mentioned, which provide user-friendly interfaces and trained models for text extraction, detection, and recognition. EasyOCR emphasizes ease of use and simplicity, while MMOCR and PaddleOCR offer comprehensive OCR capabilities and support for a wide range of languages. According to our study, which evaluates various OCR libraries, Tesseract OCR performs remarkably well in terms of accuracy for Indian languages like Malayalam. We focused on five OCR libraries—Tesseract OCR, MMOCR, PaddleOCR, EasyOCR, and Keras OCR—and tested them across several languages, including English, Hindi, Arabic, Tamil, and Malayalam. During our comparison, we found that Tesseract OCR was the only library that supported the Malayalam language. While the other libraries did not support Malayalam, Tesseract OCR performed well across all tested languages, achieving accuracy rates of 92% in English, 93% in Hindi, 78% in Tamil, 74% in Arabic, and 93% in Malayalam.</abstract>
<identifier type="citekey">nazeem-etal-2024-open</identifier>
<location>
<url>https://aclanthology.org/2024.icon-1.48/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>416</start>
<end>421</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Open-Source OCR Libraries: A Comprehensive Study for Low Resource Language
%A Nazeem, Meharuniza
%A R, Anitha
%A S, Navaneeth
%A R. R, Rajeev
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F nazeem-etal-2024-open
%X This paper reviews numerous OCR programs and libraries employed for optical character recognition tasks. Tesseract OCR, an open-source program that supports multiple languages and image formats, is highlighted for its accuracy and adaptability. Python-based libraries like EasyOCR, MMOCR, and PaddleOCR are also mentioned, which provide user-friendly interfaces and trained models for text extraction, detection, and recognition. EasyOCR emphasizes ease of use and simplicity, while MMOCR and PaddleOCR offer comprehensive OCR capabilities and support for a wide range of languages. According to our study, which evaluates various OCR libraries, Tesseract OCR performs remarkably well in terms of accuracy for Indian languages like Malayalam. We focused on five OCR libraries—Tesseract OCR, MMOCR, PaddleOCR, EasyOCR, and Keras OCR—and tested them across several languages, including English, Hindi, Arabic, Tamil, and Malayalam. During our comparison, we found that Tesseract OCR was the only library that supported the Malayalam language. While the other libraries did not support Malayalam, Tesseract OCR performed well across all tested languages, achieving accuracy rates of 92% in English, 93% in Hindi, 78% in Tamil, 74% in Arabic, and 93% in Malayalam.
%U https://aclanthology.org/2024.icon-1.48/
%P 416-421
Markdown (Informal)
[Open-Source OCR Libraries: A Comprehensive Study for Low Resource Language](https://aclanthology.org/2024.icon-1.48/) (Nazeem et al., ICON 2024)
ACL