@inproceedings{nihar-etal-2023-multilingual,
title = "Multilingual Multimodal Text Detection in {I}ndo-{A}ryan Languages",
author = "Basisth, Nihar Jyoti and
Halder, Eisha and
Sachan, Tushar and
Vetagiri, Advaitha and
Pakray, Partha",
editor = "D. Pawar, Jyoti and
Lalitha Devi, Sobha",
booktitle = "Proceedings of the 20th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2023",
address = "Goa University, Goa, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2023.icon-1.16/",
pages = "162--171",
abstract = "Multi-language text detection and recognition in complex visual scenes is an essential yet challenging task. Traditional pipelines relying on optical character recognition (OCR) often fail to generalize across different languages, fonts, orientations and imaging conditions. This work proposes a novel approach using the YOLOv5 object detection model architecture for multilanguage text detection in images and videos. We curate and annotate a new dataset of over 4,000 scene text images across 4 Indian languages and use specialized data augmentation techniques to improve model robustness. Transfer learning from a base YOLOv5 model pretrained on COCO is combined with tailored optimization strategies for multi-language text detection. Our approach achieves state-of-theart performance, with over 90{\%} accuracy on multi-language text detection across all four languages in our test set. We demonstrate the effectiveness of fine-tuning YOLOv5 for generalized multi-language text extraction across diverse fonts, scales, orientations, and visual contexts. Our approach`s high accuracy and generalizability could enable numerous applications involving multilingual text processing from imagery and video."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nihar-etal-2023-multilingual">
<titleInfo>
<title>Multilingual Multimodal Text Detection in Indo-Aryan Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nihar</namePart>
<namePart type="given">Jyoti</namePart>
<namePart type="family">Basisth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eisha</namePart>
<namePart type="family">Halder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tushar</namePart>
<namePart type="family">Sachan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Advaitha</namePart>
<namePart type="family">Vetagiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Partha</namePart>
<namePart type="family">Pakray</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jyoti</namePart>
<namePart type="family">D. Pawar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Goa University, Goa, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multi-language text detection and recognition in complex visual scenes is an essential yet challenging task. Traditional pipelines relying on optical character recognition (OCR) often fail to generalize across different languages, fonts, orientations and imaging conditions. This work proposes a novel approach using the YOLOv5 object detection model architecture for multilanguage text detection in images and videos. We curate and annotate a new dataset of over 4,000 scene text images across 4 Indian languages and use specialized data augmentation techniques to improve model robustness. Transfer learning from a base YOLOv5 model pretrained on COCO is combined with tailored optimization strategies for multi-language text detection. Our approach achieves state-of-theart performance, with over 90% accuracy on multi-language text detection across all four languages in our test set. We demonstrate the effectiveness of fine-tuning YOLOv5 for generalized multi-language text extraction across diverse fonts, scales, orientations, and visual contexts. Our approach‘s high accuracy and generalizability could enable numerous applications involving multilingual text processing from imagery and video.</abstract>
<identifier type="citekey">nihar-etal-2023-multilingual</identifier>
<location>
<url>https://aclanthology.org/2023.icon-1.16/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>162</start>
<end>171</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multilingual Multimodal Text Detection in Indo-Aryan Languages
%A Basisth, Nihar Jyoti
%A Halder, Eisha
%A Sachan, Tushar
%A Vetagiri, Advaitha
%A Pakray, Partha
%Y D. Pawar, Jyoti
%Y Lalitha Devi, Sobha
%S Proceedings of the 20th International Conference on Natural Language Processing (ICON)
%D 2023
%8 December
%I NLP Association of India (NLPAI)
%C Goa University, Goa, India
%F nihar-etal-2023-multilingual
%X Multi-language text detection and recognition in complex visual scenes is an essential yet challenging task. Traditional pipelines relying on optical character recognition (OCR) often fail to generalize across different languages, fonts, orientations and imaging conditions. This work proposes a novel approach using the YOLOv5 object detection model architecture for multilanguage text detection in images and videos. We curate and annotate a new dataset of over 4,000 scene text images across 4 Indian languages and use specialized data augmentation techniques to improve model robustness. Transfer learning from a base YOLOv5 model pretrained on COCO is combined with tailored optimization strategies for multi-language text detection. Our approach achieves state-of-theart performance, with over 90% accuracy on multi-language text detection across all four languages in our test set. We demonstrate the effectiveness of fine-tuning YOLOv5 for generalized multi-language text extraction across diverse fonts, scales, orientations, and visual contexts. Our approach‘s high accuracy and generalizability could enable numerous applications involving multilingual text processing from imagery and video.
%U https://aclanthology.org/2023.icon-1.16/
%P 162-171
Markdown (Informal)
[Multilingual Multimodal Text Detection in Indo-Aryan Languages](https://aclanthology.org/2023.icon-1.16/) (Basisth et al., ICON 2023)
ACL
- Nihar Jyoti Basisth, Eisha Halder, Tushar Sachan, Advaitha Vetagiri, and Partha Pakray. 2023. Multilingual Multimodal Text Detection in Indo-Aryan Languages. In Proceedings of the 20th International Conference on Natural Language Processing (ICON), pages 162–171, Goa University, Goa, India. NLP Association of India (NLPAI).