@inproceedings{aguilar-etal-2021-char2subword-extending,
title = "{C}har2{S}ubword: Extending the Subword Embedding Space Using Robust Character Compositionality",
author = "Aguilar, Gustavo and
McCann, Bryan and
Niu, Tong and
Rajani, Nazneen and
Keskar, Nitish Shirish and
Solorio, Thamar",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.findings-emnlp.141",
doi = "10.18653/v1/2021.findings-emnlp.141",
pages = "1640--1651",
abstract = "Byte-pair encoding (BPE) is a ubiquitous algorithm in the subword tokenization process of language models as it provides multiple benefits. However, this process is solely based on pre-training data statistics, making it hard for the tokenizer to handle infrequent spellings. On the other hand, though robust to misspellings, pure character-level models often lead to unreasonably long sequences and make it harder for the model to learn meaningful words. To alleviate these challenges, we propose a character-based subword module (char2subword) that learns the subword embedding table in pre-trained models like BERT. Our char2subword module builds representations from characters out of the subword vocabulary, and it can be used as a drop-in replacement of the subword embedding table. The module is robust to character-level alterations such as misspellings, word inflection, casing, and punctuation. We integrate it further with BERT through pre-training while keeping BERT transformer parameters fixed{--}and thus, providing a practical method. Finally, we show that incorporating our module to mBERT significantly improves the performance on the social media linguistic code-switching evaluation (LinCE) benchmark.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="aguilar-etal-2021-char2subword-extending">
<titleInfo>
<title>Char2Subword: Extending the Subword Embedding Space Using Robust Character Compositionality</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gustavo</namePart>
<namePart type="family">Aguilar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryan</namePart>
<namePart type="family">McCann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Niu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nazneen</namePart>
<namePart type="family">Rajani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nitish</namePart>
<namePart type="given">Shirish</namePart>
<namePart type="family">Keskar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thamar</namePart>
<namePart type="family">Solorio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2021</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Byte-pair encoding (BPE) is a ubiquitous algorithm in the subword tokenization process of language models as it provides multiple benefits. However, this process is solely based on pre-training data statistics, making it hard for the tokenizer to handle infrequent spellings. On the other hand, though robust to misspellings, pure character-level models often lead to unreasonably long sequences and make it harder for the model to learn meaningful words. To alleviate these challenges, we propose a character-based subword module (char2subword) that learns the subword embedding table in pre-trained models like BERT. Our char2subword module builds representations from characters out of the subword vocabulary, and it can be used as a drop-in replacement of the subword embedding table. The module is robust to character-level alterations such as misspellings, word inflection, casing, and punctuation. We integrate it further with BERT through pre-training while keeping BERT transformer parameters fixed–and thus, providing a practical method. Finally, we show that incorporating our module to mBERT significantly improves the performance on the social media linguistic code-switching evaluation (LinCE) benchmark.</abstract>
<identifier type="citekey">aguilar-etal-2021-char2subword-extending</identifier>
<identifier type="doi">10.18653/v1/2021.findings-emnlp.141</identifier>
<location>
<url>https://aclanthology.org/2021.findings-emnlp.141</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>1640</start>
<end>1651</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Char2Subword: Extending the Subword Embedding Space Using Robust Character Compositionality
%A Aguilar, Gustavo
%A McCann, Bryan
%A Niu, Tong
%A Rajani, Nazneen
%A Keskar, Nitish Shirish
%A Solorio, Thamar
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Findings of the Association for Computational Linguistics: EMNLP 2021
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F aguilar-etal-2021-char2subword-extending
%X Byte-pair encoding (BPE) is a ubiquitous algorithm in the subword tokenization process of language models as it provides multiple benefits. However, this process is solely based on pre-training data statistics, making it hard for the tokenizer to handle infrequent spellings. On the other hand, though robust to misspellings, pure character-level models often lead to unreasonably long sequences and make it harder for the model to learn meaningful words. To alleviate these challenges, we propose a character-based subword module (char2subword) that learns the subword embedding table in pre-trained models like BERT. Our char2subword module builds representations from characters out of the subword vocabulary, and it can be used as a drop-in replacement of the subword embedding table. The module is robust to character-level alterations such as misspellings, word inflection, casing, and punctuation. We integrate it further with BERT through pre-training while keeping BERT transformer parameters fixed–and thus, providing a practical method. Finally, we show that incorporating our module to mBERT significantly improves the performance on the social media linguistic code-switching evaluation (LinCE) benchmark.
%R 10.18653/v1/2021.findings-emnlp.141
%U https://aclanthology.org/2021.findings-emnlp.141
%U https://doi.org/10.18653/v1/2021.findings-emnlp.141
%P 1640-1651
Markdown (Informal)
[Char2Subword: Extending the Subword Embedding Space Using Robust Character Compositionality](https://aclanthology.org/2021.findings-emnlp.141) (Aguilar et al., Findings 2021)
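
To make the abstract's description concrete, below is a minimal, hypothetical PyTorch sketch of the char2subword idea: a small character-level encoder maps the characters spelling a subword to a single vector the size of a BERT-style embedding-table row, so it can act as a drop-in replacement for the table while the transformer's parameters stay frozen. All names and hyper-parameters here (`Char2Subword`, `distill_step`, `char_dim`, the MSE distillation objective) are illustrative assumptions, not the authors' released implementation.

```python
import torch
import torch.nn as nn

class Char2Subword(nn.Module):
    """Character-level encoder mapping a subword's character ids to one
    vector the same size as a pre-trained subword embedding (sketch)."""

    def __init__(self, n_chars=1000, char_dim=64, hidden_dim=768,
                 n_layers=2, n_heads=8):
        super().__init__()
        self.char_emb = nn.Embedding(n_chars, char_dim, padding_idx=0)
        self.proj_in = nn.Linear(char_dim, hidden_dim)
        layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=n_heads,
                                           batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)

    def forward(self, char_ids, pad_mask):
        # char_ids: (batch, max_chars) ids of the characters in each subword
        # pad_mask: (batch, max_chars) bool, True where char_ids is padding
        h = self.proj_in(self.char_emb(char_ids))
        h = self.encoder(h, src_key_padding_mask=pad_mask)
        # Mean-pool over the real (non-padding) characters.
        lengths = (~pad_mask).sum(dim=1, keepdim=True).clamp(min=1)
        h = h.masked_fill(pad_mask.unsqueeze(-1), 0.0).sum(dim=1) / lengths
        return h  # (batch, hidden_dim): stands in for embedding-table rows

def distill_step(module, optimizer, char_ids, pad_mask, target_rows):
    """One step of fitting the module to a frozen subword embedding table.
    MSE is one plausible objective; per the abstract, the paper further
    pre-trains the module with BERT while its transformer stays frozen."""
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(module(char_ids, pad_mask), target_rows)
    loss.backward()
    optimizer.step()
    return loss.item()

# Usage sketch with toy data standing in for mBERT's 768-dim table rows.
module = Char2Subword()
opt = torch.optim.Adam(module.parameters(), lr=1e-4)
char_ids = torch.randint(1, 1000, (32, 12))        # 32 subwords, <=12 chars
pad_mask = torch.zeros(32, 12, dtype=torch.bool)   # no padding in this batch
targets = torch.randn(32, 768)                     # stand-in embedding rows
print(distill_step(module, opt, char_ids, pad_mask, targets))
```

Because the encoder composes subword vectors from characters rather than looking them up, a misspelled or out-of-vocabulary string still yields a usable embedding, which is the robustness property the abstract claims.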