@inproceedings{gutierrez-vasques-etal-2021-characters,
title = "From characters to words: the turning point of {BPE} merges",
author = "Gutierrez-Vasques, Ximena and
Bentz, Christian and
Sozinova, Olga and
Samardzic, Tanja",
editor = "Merlo, Paola and
Tiedemann, Jorg and
Tsarfaty, Reut",
booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
month = apr,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eacl-main.302",
doi = "10.18653/v1/2021.eacl-main.302",
pages = "3454--3468",
abstract = "The distributions of orthographic word types are very different across languages due to typological characteristics, different writing traditions and potentially other factors. The wide range of cross-linguistic diversity is still a major challenge for NLP and the study of language. We use BPE and information-theoretic measures to investigate if distributions become similar under specific levels of subword tokenization. We perform a cross-linguistic comparison, following incremental merges of BPE (we go from characters to words) for 47 diverse languages. We show that text entropy values (a feature of probability distributions) tend to converge at specific subword levels: relatively few BPE merges (around 350) lead to the most similar distributions across languages. Additionally, we analyze the interaction between subword and word-level distributions and show that our findings can be interpreted in light of the ongoing discussion regarding different types of morphological complexity.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gutierrez-vasques-etal-2021-characters">
<titleInfo>
<title>From characters to words: the turning point of BPE merges</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ximena</namePart>
<namePart type="family">Gutierrez-Vasques</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Bentz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olga</namePart>
<namePart type="family">Sozinova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanja</namePart>
<namePart type="family">Samardzic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Merlo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The distributions of orthographic word types are very different across languages due to typological characteristics, different writing traditions and potentially other factors. The wide range of cross-linguistic diversity is still a major challenge for NLP and the study of language. We use BPE and information-theoretic measures to investigate if distributions become similar under specific levels of subword tokenization. We perform a cross-linguistic comparison, following incremental merges of BPE (we go from characters to words) for 47 diverse languages. We show that text entropy values (a feature of probability distributions) tend to converge at specific subword levels: relatively few BPE merges (around 350) lead to the most similar distributions across languages. Additionally, we analyze the interaction between subword and word-level distributions and show that our findings can be interpreted in light of the ongoing discussion regarding different types of morphological complexity.</abstract>
<identifier type="citekey">gutierrez-vasques-etal-2021-characters</identifier>
<identifier type="doi">10.18653/v1/2021.eacl-main.302</identifier>
<location>
<url>https://aclanthology.org/2021.eacl-main.302</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>3454</start>
<end>3468</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From characters to words: the turning point of BPE merges
%A Gutierrez-Vasques, Ximena
%A Bentz, Christian
%A Sozinova, Olga
%A Samardzic, Tanja
%Y Merlo, Paola
%Y Tiedemann, Jorg
%Y Tsarfaty, Reut
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F gutierrez-vasques-etal-2021-characters
%X The distributions of orthographic word types are very different across languages due to typological characteristics, different writing traditions and potentially other factors. The wide range of cross-linguistic diversity is still a major challenge for NLP and the study of language. We use BPE and information-theoretic measures to investigate if distributions become similar under specific levels of subword tokenization. We perform a cross-linguistic comparison, following incremental merges of BPE (we go from characters to words) for 47 diverse languages. We show that text entropy values (a feature of probability distributions) tend to converge at specific subword levels: relatively few BPE merges (around 350) lead to the most similar distributions across languages. Additionally, we analyze the interaction between subword and word-level distributions and show that our findings can be interpreted in light of the ongoing discussion regarding different types of morphological complexity.
%R 10.18653/v1/2021.eacl-main.302
%U https://aclanthology.org/2021.eacl-main.302
%U https://doi.org/10.18653/v1/2021.eacl-main.302
%P 3454-3468
Markdown (Informal)
[From characters to words: the turning point of BPE merges](https://aclanthology.org/2021.eacl-main.302) (Gutierrez-Vasques et al., EACL 2021)
ACL
- Ximena Gutierrez-Vasques, Christian Bentz, Olga Sozinova, and Tanja Samardzic. 2021. From characters to words: the turning point of BPE merges. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pages 3454–3468, Online. Association for Computational Linguistics.