@inproceedings{kolonin-ramesh-2022-unsupervised,
    title = "Unsupervised Tokenization Learning",
    author = "Kolonin, Anton and
      Ramesh, Vignav",
    editor = "Goldberg, Yoav and
      Kozareva, Zornitsa and
      Zhang, Yue",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-main.239",
    doi = "10.18653/v1/2022.emnlp-main.239",
    pages = "3649--3664",
    abstract = "In the presented study, we discover that the so-called {``}transition freedom{''} metric appears superior for unsupervised tokenization purposes in comparison to statistical metrics such as mutual information and conditional probability, providing F-measure scores in the range from 0.71 to 1.0 across the explored multilingual corpora. We find that different languages require different offshoots of that metric (such as derivative, variance, and {``}peak values{''}) for successful tokenization. Larger training corpora do not necessarily result in better tokenization quality, while compressing the models by eliminating statistically weak evidence tends to improve performance. The proposed unsupervised tokenization technique provides quality better than or comparable to lexicon-based ones, depending on the language.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kolonin-ramesh-2022-unsupervised">
    <titleInfo>
      <title>Unsupervised Tokenization Learning</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Anton</namePart>
      <namePart type="family">Kolonin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vignav</namePart>
      <namePart type="family">Ramesh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yoav</namePart>
        <namePart type="family">Goldberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zornitsa</namePart>
        <namePart type="family">Kozareva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In the presented study, we discover that the so-called “transition freedom” metric appears superior for unsupervised tokenization purposes in comparison to statistical metrics such as mutual information and conditional probability, providing F-measure scores in the range from 0.71 to 1.0 across the explored multilingual corpora. We find that different languages require different offshoots of that metric (such as derivative, variance, and “peak values”) for successful tokenization. Larger training corpora do not necessarily result in better tokenization quality, while compressing the models by eliminating statistically weak evidence tends to improve performance. The proposed unsupervised tokenization technique provides quality better than or comparable to lexicon-based ones, depending on the language.</abstract>
<identifier type="citekey">kolonin-ramesh-2022-unsupervised</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.239</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.239</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>3649</start>
<end>3664</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unsupervised Tokenization Learning
%A Kolonin, Anton
%A Ramesh, Vignav
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F kolonin-ramesh-2022-unsupervised
%X In the presented study, we discover that the so-called “transition freedom” metric appears superior for unsupervised tokenization purposes in comparison to statistical metrics such as mutual information and conditional probability, providing F-measure scores in the range from 0.71 to 1.0 across the explored multilingual corpora. We find that different languages require different offshoots of that metric (such as derivative, variance, and “peak values”) for successful tokenization. Larger training corpora do not necessarily result in better tokenization quality, while compressing the models by eliminating statistically weak evidence tends to improve performance. The proposed unsupervised tokenization technique provides quality better than or comparable to lexicon-based ones, depending on the language.
%R 10.18653/v1/2022.emnlp-main.239
%U https://aclanthology.org/2022.emnlp-main.239
%U https://doi.org/10.18653/v1/2022.emnlp-main.239
%P 3649-3664
Markdown (Informal)
[Unsupervised Tokenization Learning](https://aclanthology.org/2022.emnlp-main.239) (Kolonin & Ramesh, EMNLP 2022)

ACL
Anton Kolonin and Vignav Ramesh. 2022. Unsupervised Tokenization Learning. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 3649–3664, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
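
Beyond the metadata, the abstract describes the method concretely enough for a toy illustration: compute, for each character n-gram, how many distinct characters can follow it in a training corpus (its "transition freedom"), then cut tokens where that value peaks. The sketch below is a minimal reading of that idea in Python; the n-gram order, the local-peak boundary rule, and the names `train_transition_freedom` and `tokenize` are our assumptions for illustration, not the authors' exact algorithm.

```python
from collections import defaultdict

def train_transition_freedom(corpus, n=1):
    """For each character n-gram seen in training, count how many
    distinct characters ever follow it ("transition freedom")."""
    successors = defaultdict(set)
    for text in corpus:
        for i in range(len(text) - n):
            successors[text[i:i + n]].add(text[i + n])
    return {gram: len(chars) for gram, chars in successors.items()}

def tokenize(text, freedom, n=1):
    """Place a token boundary after each local peak in transition
    freedom -- one possible reading of the abstract's "peak values"."""
    # Score position i by the freedom of the n-gram ending at i.
    scores = [freedom.get(text[max(0, i - n + 1):i + 1], 0)
              for i in range(len(text))]
    tokens, start = [], 0
    for i in range(1, len(text) - 1):
        if scores[i] > scores[i - 1] and scores[i] >= scores[i + 1]:
            tokens.append(text[start:i + 1])
            start = i + 1
    if start < len(text):
        tokens.append(text[start:])
    return tokens

if __name__ == "__main__":
    # Toy corpus; real use would train on large raw text per language.
    corpus = ["the cat sat on the mat", "the dog sat on the rug"]
    freedom = train_transition_freedom(corpus, n=1)
    print(tokenize("thecatsat", freedom, n=1))  # -> ['thecat', 'sat'] here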