@article{shao-etal-2018-universal,
title = "Universal Word Segmentation: Implementation and Interpretation",
author = "Shao, Yan and
Hardmeier, Christian and
Nivre, Joakim",
editor = "Lee, Lillian and
Johnson, Mark and
Toutanova, Kristina and
Roark, Brian",
journal = "Transactions of the Association for Computational Linguistics",
volume = "6",
year = "2018",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q18-1030",
doi = "10.1162/tacl_a_00033",
pages = "421--435",
abstract = "Word segmentation is a low-level NLP task that is non-trivial for a considerable number of languages. In this paper, we present a sequence tagging framework and apply it to word segmentation for a wide range of languages with different writing systems and typological characteristics. Additionally, we investigate the correlations between various typological factors and word segmentation accuracy. The experimental results indicate that segmentation accuracy is positively related to word boundary markers and negatively to the number of unique non-segmental terms. Based on the analysis, we design a small set of language-specific settings and extensively evaluate the segmentation system on the Universal Dependencies datasets. Our model obtains state-of-the-art accuracies on all the UD languages. It performs substantially better on languages that are non-trivial to segment, such as Chinese, Japanese, Arabic and Hebrew, when compared to previous work.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shao-etal-2018-universal">
<titleInfo>
<title>Universal Word Segmentation: Implementation and Interpretation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Shao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Hardmeier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Nivre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Word segmentation is a low-level NLP task that is non-trivial for a considerable number of languages. In this paper, we present a sequence tagging framework and apply it to word segmentation for a wide range of languages with different writing systems and typological characteristics. Additionally, we investigate the correlations between various typological factors and word segmentation accuracy. The experimental results indicate that segmentation accuracy is positively related to word boundary markers and negatively to the number of unique non-segmental terms. Based on the analysis, we design a small set of language-specific settings and extensively evaluate the segmentation system on the Universal Dependencies datasets. Our model obtains state-of-the-art accuracies on all the UD languages. It performs substantially better on languages that are non-trivial to segment, such as Chinese, Japanese, Arabic and Hebrew, when compared to previous work.</abstract>
<identifier type="citekey">shao-etal-2018-universal</identifier>
<identifier type="doi">10.1162/tacl_a_00033</identifier>
<location>
<url>https://aclanthology.org/Q18-1030</url>
</location>
<part>
<date>2018</date>
<detail type="volume"><number>6</number></detail>
<extent unit="page">
<start>421</start>
<end>435</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Universal Word Segmentation: Implementation and Interpretation
%A Shao, Yan
%A Hardmeier, Christian
%A Nivre, Joakim
%J Transactions of the Association for Computational Linguistics
%D 2018
%V 6
%I MIT Press
%C Cambridge, MA
%F shao-etal-2018-universal
%X Word segmentation is a low-level NLP task that is non-trivial for a considerable number of languages. In this paper, we present a sequence tagging framework and apply it to word segmentation for a wide range of languages with different writing systems and typological characteristics. Additionally, we investigate the correlations between various typological factors and word segmentation accuracy. The experimental results indicate that segmentation accuracy is positively related to word boundary markers and negatively to the number of unique non-segmental terms. Based on the analysis, we design a small set of language-specific settings and extensively evaluate the segmentation system on the Universal Dependencies datasets. Our model obtains state-of-the-art accuracies on all the UD languages. It performs substantially better on languages that are non-trivial to segment, such as Chinese, Japanese, Arabic and Hebrew, when compared to previous work.
%R 10.1162/tacl_a_00033
%U https://aclanthology.org/Q18-1030
%U https://doi.org/10.1162/tacl_a_00033
%P 421-435
Markdown (Informal)
[Universal Word Segmentation: Implementation and Interpretation](https://aclanthology.org/Q18-1030) (Shao et al., TACL 2018)
ACL