@inproceedings{nagaraj-rao-shen-2020-misspelling,
title = "Misspelling Detection from Noisy Product Images",
author = "Nagaraj Rao, Varun and
Shen, Mingwei",
editor = "Clifton, Ann and
Napoles, Courtney",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics: Industry Track",
month = dec,
year = "2020",
address = "Online",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-industry.12",
doi = "10.18653/v1/2020.coling-industry.12",
pages = "124--135",
abstract = "Misspellings are introduced on products either due to negligence or as an attempt to deliberately deceive stakeholders. This leads to a revenue loss for online sellers and fosters customer mistrust. Existing spelling research has primarily focused on advancement in misspelling correction and the approach for misspelling detection has remained the use of a large dictionary. The dictionary lookup results in the incorrect detection of several non-dictionary words as misspellings. In this paper, we propose a method to automatically detect misspellings from product images in an attempt to reduce false positive detections. We curate a large scale corpus, define a rich set of features and propose a novel model that leverages importance weighting to account for within class distributional variance. Finally, we experimentally validate this approach on both the curated corpus and an out-of-domain public dataset and show that it leads to a relative improvement of up to 20{\%} in F1 score. The approach thus creates a more robust, generalized deployable solution and reduces reliance on large scale custom dictionaries used today.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nagaraj-rao-shen-2020-misspelling">
<titleInfo>
<title>Misspelling Detection from Noisy Product Images</title>
</titleInfo>
<name type="personal">
<namePart type="given">Varun</namePart>
<namePart type="family">Nagaraj Rao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingwei</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ann</namePart>
<namePart type="family">Clifton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Courtney</namePart>
<namePart type="family">Napoles</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Misspellings are introduced on products either due to negligence or as an attempt to deliberately deceive stakeholders. This leads to a revenue loss for online sellers and fosters customer mistrust. Existing spelling research has primarily focused on advancement in misspelling correction and the approach for misspelling detection has remained the use of a large dictionary. The dictionary lookup results in the incorrect detection of several non-dictionary words as misspellings. In this paper, we propose a method to automatically detect misspellings from product images in an attempt to reduce false positive detections. We curate a large scale corpus, define a rich set of features and propose a novel model that leverages importance weighting to account for within class distributional variance. Finally, we experimentally validate this approach on both the curated corpus and an out-of-domain public dataset and show that it leads to a relative improvement of up to 20% in F1 score. The approach thus creates a more robust, generalized deployable solution and reduces reliance on large scale custom dictionaries used today.</abstract>
<identifier type="citekey">nagaraj-rao-shen-2020-misspelling</identifier>
<identifier type="doi">10.18653/v1/2020.coling-industry.12</identifier>
<location>
<url>https://aclanthology.org/2020.coling-industry.12</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>124</start>
<end>135</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Misspelling Detection from Noisy Product Images
%A Nagaraj Rao, Varun
%A Shen, Mingwei
%Y Clifton, Ann
%Y Napoles, Courtney
%S Proceedings of the 28th International Conference on Computational Linguistics: Industry Track
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Online
%F nagaraj-rao-shen-2020-misspelling
%X Misspellings are introduced on products either due to negligence or as an attempt to deliberately deceive stakeholders. This leads to a revenue loss for online sellers and fosters customer mistrust. Existing spelling research has primarily focused on advancement in misspelling correction and the approach for misspelling detection has remained the use of a large dictionary. The dictionary lookup results in the incorrect detection of several non-dictionary words as misspellings. In this paper, we propose a method to automatically detect misspellings from product images in an attempt to reduce false positive detections. We curate a large scale corpus, define a rich set of features and propose a novel model that leverages importance weighting to account for within class distributional variance. Finally, we experimentally validate this approach on both the curated corpus and an out-of-domain public dataset and show that it leads to a relative improvement of up to 20% in F1 score. The approach thus creates a more robust, generalized deployable solution and reduces reliance on large scale custom dictionaries used today.
%R 10.18653/v1/2020.coling-industry.12
%U https://aclanthology.org/2020.coling-industry.12
%U https://doi.org/10.18653/v1/2020.coling-industry.12
%P 124-135
Markdown (Informal)
[Misspelling Detection from Noisy Product Images](https://aclanthology.org/2020.coling-industry.12) (Nagaraj Rao & Shen, COLING 2020)
ACL
- Varun Nagaraj Rao and Mingwei Shen. 2020. Misspelling Detection from Noisy Product Images. In Proceedings of the 28th International Conference on Computational Linguistics: Industry Track, pages 124–135, Online. International Committee on Computational Linguistics.