@inproceedings{kwon-etal-2021-handling,
title = "Handling Out-Of-Vocabulary Problem in Hangeul Word Embeddings",
author = "Kwon, Ohjoon and
Kim, Dohyun and
Lee, Soo-Ryeon and
Choi, Junyoung and
Lee, SangKeun",
editor = "Merlo, Paola and
Tiedemann, Jorg and
Tsarfaty, Reut",
booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
month = apr,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eacl-main.280",
doi = "10.18653/v1/2021.eacl-main.280",
pages = "3213--3221",
abstract = "Word embedding is considered an essential factor in improving the performance of various Natural Language Processing (NLP) models. However, it is hardly applicable in real-world datasets as word embedding is generally studied with a well-refined corpus. Notably, in Hangeul (Korean writing system), which has a unique writing system, various kinds of Out-Of-Vocabulary (OOV) appear from typos. In this paper, we propose a robust Hangeul word embedding model against typos, while maintaining high performance. The proposed model utilizes a Convolutional Neural Network (CNN) architecture with a channel attention mechanism that learns to infer the original word embeddings. The model train with a dataset that consists of a mix of typos and correct words. To demonstrate the effectiveness of the proposed model, we conduct three kinds of intrinsic and extrinsic tasks. While the existing embedding models fail to maintain stable performance as the noise level increases, the proposed model shows stable performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kwon-etal-2021-handling">
<titleInfo>
<title>Handling Out-Of-Vocabulary Problem in Hangeul Word Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ohjoon</namePart>
<namePart type="family">Kwon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dohyun</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soo-Ryeon</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junyoung</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">SangKeun</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Merlo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word embedding is considered an essential factor in improving the performance of various Natural Language Processing (NLP) models. However, it is hardly applicable in real-world datasets as word embedding is generally studied with a well-refined corpus. Notably, in Hangeul (Korean writing system), which has a unique writing system, various kinds of Out-Of-Vocabulary (OOV) appear from typos. In this paper, we propose a robust Hangeul word embedding model against typos, while maintaining high performance. The proposed model utilizes a Convolutional Neural Network (CNN) architecture with a channel attention mechanism that learns to infer the original word embeddings. The model train with a dataset that consists of a mix of typos and correct words. To demonstrate the effectiveness of the proposed model, we conduct three kinds of intrinsic and extrinsic tasks. While the existing embedding models fail to maintain stable performance as the noise level increases, the proposed model shows stable performance.</abstract>
<identifier type="citekey">kwon-etal-2021-handling</identifier>
<identifier type="doi">10.18653/v1/2021.eacl-main.280</identifier>
<location>
<url>https://aclanthology.org/2021.eacl-main.280</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>3213</start>
<end>3221</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Handling Out-Of-Vocabulary Problem in Hangeul Word Embeddings
%A Kwon, Ohjoon
%A Kim, Dohyun
%A Lee, Soo-Ryeon
%A Choi, Junyoung
%A Lee, SangKeun
%Y Merlo, Paola
%Y Tiedemann, Jorg
%Y Tsarfaty, Reut
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F kwon-etal-2021-handling
%X Word embedding is considered an essential factor in improving the performance of various Natural Language Processing (NLP) models. However, it is hardly applicable in real-world datasets as word embedding is generally studied with a well-refined corpus. Notably, in Hangeul (Korean writing system), which has a unique writing system, various kinds of Out-Of-Vocabulary (OOV) appear from typos. In this paper, we propose a robust Hangeul word embedding model against typos, while maintaining high performance. The proposed model utilizes a Convolutional Neural Network (CNN) architecture with a channel attention mechanism that learns to infer the original word embeddings. The model train with a dataset that consists of a mix of typos and correct words. To demonstrate the effectiveness of the proposed model, we conduct three kinds of intrinsic and extrinsic tasks. While the existing embedding models fail to maintain stable performance as the noise level increases, the proposed model shows stable performance.
%R 10.18653/v1/2021.eacl-main.280
%U https://aclanthology.org/2021.eacl-main.280
%U https://doi.org/10.18653/v1/2021.eacl-main.280
%P 3213-3221
Markdown (Informal)
[Handling Out-Of-Vocabulary Problem in Hangeul Word Embeddings](https://aclanthology.org/2021.eacl-main.280) (Kwon et al., EACL 2021)
ACL
- Ohjoon Kwon, Dohyun Kim, Soo-Ryeon Lee, Junyoung Choi, and SangKeun Lee. 2021. Handling Out-Of-Vocabulary Problem in Hangeul Word Embeddings. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pages 3213–3221, Online. Association for Computational Linguistics.