@inproceedings{mofijul-islam-etal-2022-vocabulary,
title = "A Vocabulary-Free Multilingual Neural Tokenizer for End-to-End Task Learning",
author = "Mofijul Islam, Md and
Aguilar, Gustavo and
Ponnusamy, Pragaash and
Solomon Mathialagan, Clint and
Ma, Chengyuan and
Guo, Chenlei",
booktitle = "Proceedings of the 7th Workshop on Representation Learning for NLP",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.repl4nlp-1.10",
doi = "10.18653/v1/2022.repl4nlp-1.10",
pages = "91--99",
abstract = "Subword tokenization is a commonly used input pre-processing step in most recent NLP models. However, it limits the models{'} ability to leverage end-to-end task learning. Its frequency-based vocabulary creation compromises tokenization in low-resource languages, leading models to produce suboptimal representations. Additionally, the dependency on a fixed vocabulary limits the subword models{'} adaptability across languages and domains. In this work, we propose a vocabulary-free neural tokenizer by distilling segmentation information from heuristic-based subword tokenization. We pre-train our character-based tokenizer by processing unique words from multilingual corpus, thereby extensively increasing word diversity across languages. Unlike the predefined and fixed vocabularies in subword methods, our tokenizer allows end-to-end task learning, resulting in optimal task-specific tokenization. The experimental results show that replacing the subword tokenizer with our neural tokenizer consistently improves performance on multilingual (NLI) and code-switching (sentiment analysis) tasks, with larger gains in low-resource languages. Additionally, our neural tokenizer exhibits a robust performance on downstream tasks when adversarial noise is present (typos and misspelling), further increasing the initial improvements over statistical subword tokenizers.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mofijul-islam-etal-2022-vocabulary">
<titleInfo>
<title>A Vocabulary-Free Multilingual Neural Tokenizer for End-to-End Task Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="family">Mofijul Islam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gustavo</namePart>
<namePart type="family">Aguilar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pragaash</namePart>
<namePart type="family">Ponnusamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clint</namePart>
<namePart type="family">Solomon Mathialagan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengyuan</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenlei</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on Representation Learning for NLP</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Subword tokenization is a commonly used input pre-processing step in most recent NLP models. However, it limits the models’ ability to leverage end-to-end task learning. Its frequency-based vocabulary creation compromises tokenization in low-resource languages, leading models to produce suboptimal representations. Additionally, the dependency on a fixed vocabulary limits the subword models’ adaptability across languages and domains. In this work, we propose a vocabulary-free neural tokenizer by distilling segmentation information from heuristic-based subword tokenization. We pre-train our character-based tokenizer by processing unique words from a multilingual corpus, thereby greatly increasing word diversity across languages. Unlike the predefined and fixed vocabularies in subword methods, our tokenizer allows end-to-end task learning, resulting in optimal task-specific tokenization. The experimental results show that replacing the subword tokenizer with our neural tokenizer consistently improves performance on multilingual (NLI) and code-switching (sentiment analysis) tasks, with larger gains in low-resource languages. Additionally, our neural tokenizer exhibits robust performance on downstream tasks when adversarial noise is present (typos and misspellings), further increasing the initial improvements over statistical subword tokenizers.</abstract>
<identifier type="citekey">mofijul-islam-etal-2022-vocabulary</identifier>
<identifier type="doi">10.18653/v1/2022.repl4nlp-1.10</identifier>
<location>
<url>https://aclanthology.org/2022.repl4nlp-1.10</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>91</start>
<end>99</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Vocabulary-Free Multilingual Neural Tokenizer for End-to-End Task Learning
%A Mofijul Islam, Md
%A Aguilar, Gustavo
%A Ponnusamy, Pragaash
%A Solomon Mathialagan, Clint
%A Ma, Chengyuan
%A Guo, Chenlei
%S Proceedings of the 7th Workshop on Representation Learning for NLP
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F mofijul-islam-etal-2022-vocabulary
%X Subword tokenization is a commonly used input pre-processing step in most recent NLP models. However, it limits the models’ ability to leverage end-to-end task learning. Its frequency-based vocabulary creation compromises tokenization in low-resource languages, leading models to produce suboptimal representations. Additionally, the dependency on a fixed vocabulary limits the subword models’ adaptability across languages and domains. In this work, we propose a vocabulary-free neural tokenizer by distilling segmentation information from heuristic-based subword tokenization. We pre-train our character-based tokenizer by processing unique words from a multilingual corpus, thereby greatly increasing word diversity across languages. Unlike the predefined and fixed vocabularies in subword methods, our tokenizer allows end-to-end task learning, resulting in optimal task-specific tokenization. The experimental results show that replacing the subword tokenizer with our neural tokenizer consistently improves performance on multilingual (NLI) and code-switching (sentiment analysis) tasks, with larger gains in low-resource languages. Additionally, our neural tokenizer exhibits robust performance on downstream tasks when adversarial noise is present (typos and misspellings), further increasing the initial improvements over statistical subword tokenizers.
%R 10.18653/v1/2022.repl4nlp-1.10
%U https://aclanthology.org/2022.repl4nlp-1.10
%U https://doi.org/10.18653/v1/2022.repl4nlp-1.10
%P 91-99
Markdown (Informal)
[A Vocabulary-Free Multilingual Neural Tokenizer for End-to-End Task Learning](https://aclanthology.org/2022.repl4nlp-1.10) (Mofijul Islam et al., RepL4NLP 2022)
ACL
Md Mofijul Islam, Gustavo Aguilar, Pragaash Ponnusamy, Clint Solomon Mathialagan, Chengyuan Ma, and Chenlei Guo. 2022. A Vocabulary-Free Multilingual Neural Tokenizer for End-to-End Task Learning. In *Proceedings of the 7th Workshop on Representation Learning for NLP*, pages 91–99, Dublin, Ireland. Association for Computational Linguistics.
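
As a quick illustration of the core idea in the abstract (distilling segmentation decisions from a heuristic subword tokenizer into a character-level neural model that can then be trained end to end), here is a minimal, self-contained PyTorch sketch. It is not the authors' implementation: the toy subword vocabulary, the greedy longest-match "teacher", the model sizes, and the training loop are all assumptions made for the example.

```python
# Illustrative sketch only (not the paper's code): distill boundary
# labels from a heuristic subword segmenter into a character BiLSTM.
import torch
import torch.nn as nn

# Toy "teacher": greedy longest-match segmentation over a tiny
# hand-picked vocabulary (stands in for BPE/WordPiece).
SUBWORDS = {"token", "izer", "multi", "lingual", "un", "break", "able", "s"}

def teacher_boundaries(word):
    """Return a 0/1 label per character: 1 if a subword ends here."""
    labels = [0] * len(word)
    i = 0
    while i < len(word):
        match = 1  # fall back to a single character if nothing matches
        for j in range(len(word), i, -1):
            if word[i:j] in SUBWORDS:
                match = j - i
                break
        labels[i + match - 1] = 1
        i += match
    return labels

class CharTokenizer(nn.Module):
    """Student: character BiLSTM predicting split points per character."""
    def __init__(self, n_chars=128, dim=64):
        super().__init__()
        self.emb = nn.Embedding(n_chars, dim)
        self.rnn = nn.LSTM(dim, dim, batch_first=True, bidirectional=True)
        self.out = nn.Linear(2 * dim, 1)  # one boundary logit per char

    def forward(self, char_ids):
        h, _ = self.rnn(self.emb(char_ids))
        return self.out(h).squeeze(-1)

def encode(word):
    # Map characters to ids; clamp to the toy embedding range.
    return torch.tensor([[min(ord(c), 127) for c in word]])

# Distillation loop over unique words (the paper pre-trains on unique
# words from a multilingual corpus; here, a handful of examples).
words = ["tokenizers", "multilingual", "unbreakable"]
model = CharTokenizer()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(200):
    for w in words:
        target = torch.tensor([teacher_boundaries(w)], dtype=torch.float)
        loss = loss_fn(model(encode(w)), target)
        opt.zero_grad()
        loss.backward()
        opt.step()

# Segment a word with the trained student.
w = "tokenizers"
is_end = (torch.sigmoid(model(encode(w)))[0] > 0.5).tolist()
pieces, start = [], 0
for i, end in enumerate(is_end):
    if end:
        pieces.append(w[start:i + 1])
        start = i + 1
if start < len(w):
    pieces.append(w[start:])
print(pieces)  # e.g. ['token', 'izer', 's'] once the student fits the teacher
```

Because the student is a differentiable network rather than a fixed vocabulary lookup, its boundary predictor can keep receiving gradients from a downstream task loss after pre-training, which is the end-to-end, task-specific tokenization property the abstract emphasizes.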