@inproceedings{fukuda-etal-2020-robust,
  title     = {Robust {Backed-off} {Estimation} of {Out-of-Vocabulary} {Embeddings}},
  author    = {Fukuda, Nobukazu and
               Yoshinaga, Naoki and
               Kitsuregawa, Masaru},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2020},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.findings-emnlp.434},
  doi       = {10.18653/v1/2020.findings-emnlp.434},
  pages     = {4827--4838},
  abstract  = {Out-of-vocabulary (oov) words cause serious troubles in solving natural language tasks with a neural network. Existing approaches to this problem resort to using subwords, which are shorter and more ambiguous units than words, in order to represent oov words with a bag of subwords. In this study, inspired by the processes for creating words from known words, we propose a robust method of estimating oov word embeddings by referring to pre-trained word embeddings for known words with similar surfaces to target oov words. We collect known words by segmenting oov words and by approximate string matching, and we then aggregate their pre-trained embeddings. Experimental results show that the obtained oov word embeddings improve not only word similarity tasks but also downstream tasks in Twitter and biomedical domains where oov words often appear, even when the computed oov embeddings are integrated into a bert-based strong baseline.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fukuda-etal-2020-robust">
<titleInfo>
<title>Robust Backed-off Estimation of Out-of-Vocabulary Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nobukazu</namePart>
<namePart type="family">Fukuda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoki</namePart>
<namePart type="family">Yoshinaga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masaru</namePart>
<namePart type="family">Kitsuregawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Out-of-vocabulary (oov) words cause serious troubles in solving natural language tasks with a neural network. Existing approaches to this problem resort to using subwords, which are shorter and more ambiguous units than words, in order to represent oov words with a bag of subwords. In this study, inspired by the processes for creating words from known words, we propose a robust method of estimating oov word embeddings by referring to pre-trained word embeddings for known words with similar surfaces to target oov words. We collect known words by segmenting oov words and by approximate string matching, and we then aggregate their pre-trained embeddings. Experimental results show that the obtained oov word embeddings improve not only word similarity tasks but also downstream tasks in Twitter and biomedical domains where oov words often appear, even when the computed oov embeddings are integrated into a bert-based strong baseline.</abstract>
<identifier type="citekey">fukuda-etal-2020-robust</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.434</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.434</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>4827</start>
<end>4838</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Robust Backed-off Estimation of Out-of-Vocabulary Embeddings
%A Fukuda, Nobukazu
%A Yoshinaga, Naoki
%A Kitsuregawa, Masaru
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F fukuda-etal-2020-robust
%X Out-of-vocabulary (oov) words cause serious troubles in solving natural language tasks with a neural network. Existing approaches to this problem resort to using subwords, which are shorter and more ambiguous units than words, in order to represent oov words with a bag of subwords. In this study, inspired by the processes for creating words from known words, we propose a robust method of estimating oov word embeddings by referring to pre-trained word embeddings for known words with similar surfaces to target oov words. We collect known words by segmenting oov words and by approximate string matching, and we then aggregate their pre-trained embeddings. Experimental results show that the obtained oov word embeddings improve not only word similarity tasks but also downstream tasks in Twitter and biomedical domains where oov words often appear, even when the computed oov embeddings are integrated into a bert-based strong baseline.
%R 10.18653/v1/2020.findings-emnlp.434
%U https://aclanthology.org/2020.findings-emnlp.434
%U https://doi.org/10.18653/v1/2020.findings-emnlp.434
%P 4827-4838
Markdown (Informal)
[Robust Backed-off Estimation of Out-of-Vocabulary Embeddings](https://aclanthology.org/2020.findings-emnlp.434) (Fukuda et al., Findings 2020)
ACL