@inproceedings{yu-etal-2016-even,
title = "If You {E}ven Don{'}t Have a Bit of {B}ible: Learning Delexicalized {POS} Taggers",
author = "Yu, Zhiwei and
Mare{\v{c}}ek, David and
{\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and
Zeman, Daniel",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1015",
pages = "96--103",
abstract = "Part-of-speech (POS) induction is one of the most popular tasks in research on unsupervised NLP. Various unsupervised and semi-supervised methods have been proposed to tag an unseen language. However, many of them require some partial understanding of the target language because they rely on dictionaries or parallel corpora such as the Bible. In this paper, we propose a different method named delexicalized tagging, for which we only need a raw corpus of the target language. We transfer tagging models trained on annotated corpora of one or more resource-rich languages. We employ language-independent features such as word length, frequency, neighborhood entropy, character classes (alphabetic vs. numeric vs. punctuation) etc. We demonstrate that such features can, to certain extent, serve as predictors of the part of speech, represented by the universal POS tag.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2016-even">
<titleInfo>
<title>If You Even Don’t Have a Bit of Bible: Learning Delexicalized POS Taggers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhiwei</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Mareček</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zdeněk</namePart>
<namePart type="family">Žabokrtský</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Zeman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Part-of-speech (POS) induction is one of the most popular tasks in research on unsupervised NLP. Various unsupervised and semi-supervised methods have been proposed to tag an unseen language. However, many of them require some partial understanding of the target language because they rely on dictionaries or parallel corpora such as the Bible. In this paper, we propose a different method named delexicalized tagging, for which we only need a raw corpus of the target language. We transfer tagging models trained on annotated corpora of one or more resource-rich languages. We employ language-independent features such as word length, frequency, neighborhood entropy, character classes (alphabetic vs. numeric vs. punctuation) etc. We demonstrate that such features can, to certain extent, serve as predictors of the part of speech, represented by the universal POS tag.</abstract>
<identifier type="citekey">yu-etal-2016-even</identifier>
<location>
<url>https://aclanthology.org/L16-1015</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>96</start>
<end>103</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T If You Even Don’t Have a Bit of Bible: Learning Delexicalized POS Taggers
%A Yu, Zhiwei
%A Mareček, David
%A Žabokrtský, Zdeněk
%A Zeman, Daniel
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F yu-etal-2016-even
%X Part-of-speech (POS) induction is one of the most popular tasks in research on unsupervised NLP. Various unsupervised and semi-supervised methods have been proposed to tag an unseen language. However, many of them require some partial understanding of the target language because they rely on dictionaries or parallel corpora such as the Bible. In this paper, we propose a different method named delexicalized tagging, for which we only need a raw corpus of the target language. We transfer tagging models trained on annotated corpora of one or more resource-rich languages. We employ language-independent features such as word length, frequency, neighborhood entropy, character classes (alphabetic vs. numeric vs. punctuation) etc. We demonstrate that such features can, to certain extent, serve as predictors of the part of speech, represented by the universal POS tag.
%U https://aclanthology.org/L16-1015
%P 96-103
Markdown (Informal)
[If You Even Don’t Have a Bit of Bible: Learning Delexicalized POS Taggers](https://aclanthology.org/L16-1015) (Yu et al., LREC 2016)
ACL