@inproceedings{tsarfaty-goldberg-2008-word,
title = "Word-Based or Morpheme-Based? Annotation Strategies for {M}odern {H}ebrew Clitics",
author = "Tsarfaty, Reut and
Goldberg, Yoav",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Tapias, Daniel",
booktitle = "Proceedings of the Sixth International Conference on Language Resources and Evaluation ({LREC}'08)",
month = may,
year = "2008",
address = "Marrakech, Morocco",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2008/pdf/361_paper.pdf",
abstract = "Morphologically rich languages pose a challenge to the annotators of treebanks with respect to the status of orthographic (space-delimited) words in the syntactic parse trees. In such languages an orthographic word may carry various, distinct, sorts of information and the question arises whether we should represent such words as a sequence of their constituent morphemes (i.e., a Morpheme-Based annotation strategy) or whether we should preserve their special orthographic status within the trees (i.e., a Word-Based annotation strategy). In this paper we empirically address this challenge in the context of the development of Language Resources for Modern Hebrew. We compare and contrast the Morpheme-Based and Word-Based annotation strategies of pronominal clitics in Modern Hebrew and we show that the Word-Based strategy is more adequate for the purpose of training statistical parsers as it provides a better PP-attachment disambiguation capacity and a better alignment with initial surface forms. Our findings in turn raise new questions concerning the interaction of morphological and syntactic processing of which investigation is facilitated by the parallel treebank we made available.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tsarfaty-goldberg-2008-word">
<titleInfo>
<title>Word-Based or Morpheme-Based? Annotation Strategies for Modern Hebrew Clitics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2008-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC’08)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Marrakech, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Morphologically rich languages pose a challenge to the annotators of treebanks with respect to the status of orthographic (space-delimited) words in the syntactic parse trees. In such languages an orthographic word may carry various, distinct, sorts of information and the question arises whether we should represent such words as a sequence of their constituent morphemes (i.e., a Morpheme-Based annotation strategy) or whether we should preserve their special orthographic status within the trees (i.e., a Word-Based annotation strategy). In this paper we empirically address this challenge in the context of the development of Language Resources for Modern Hebrew. We compare and contrast the Morpheme-Based and Word-Based annotation strategies of pronominal clitics in Modern Hebrew and we show that the Word-Based strategy is more adequate for the purpose of training statistical parsers as it provides a better PP-attachment disambiguation capacity and a better alignment with initial surface forms. Our findings in turn raise new questions concerning the interaction of morphological and syntactic processing of which investigation is facilitated by the parallel treebank we made available.</abstract>
<identifier type="citekey">tsarfaty-goldberg-2008-word</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2008/pdf/361_paper.pdf</url>
</location>
<part>
<date>2008-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Word-Based or Morpheme-Based? Annotation Strategies for Modern Hebrew Clitics
%A Tsarfaty, Reut
%A Goldberg, Yoav
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Tapias, Daniel
%S Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC’08)
%D 2008
%8 May
%I European Language Resources Association (ELRA)
%C Marrakech, Morocco
%F tsarfaty-goldberg-2008-word
%X Morphologically rich languages pose a challenge to the annotators of treebanks with respect to the status of orthographic (space-delimited) words in the syntactic parse trees. In such languages an orthographic word may carry various, distinct, sorts of information and the question arises whether we should represent such words as a sequence of their constituent morphemes (i.e., a Morpheme-Based annotation strategy) or whether we should preserve their special orthographic status within the trees (i.e., a Word-Based annotation strategy). In this paper we empirically address this challenge in the context of the development of Language Resources for Modern Hebrew. We compare and contrast the Morpheme-Based and Word-Based annotation strategies of pronominal clitics in Modern Hebrew and we show that the Word-Based strategy is more adequate for the purpose of training statistical parsers as it provides a better PP-attachment disambiguation capacity and a better alignment with initial surface forms. Our findings in turn raise new questions concerning the interaction of morphological and syntactic processing of which investigation is facilitated by the parallel treebank we made available.
%U http://www.lrec-conf.org/proceedings/lrec2008/pdf/361_paper.pdf
Markdown (Informal)
[Word-Based or Morpheme-Based? Annotation Strategies for Modern Hebrew Clitics](http://www.lrec-conf.org/proceedings/lrec2008/pdf/361_paper.pdf) (Tsarfaty & Goldberg, LREC 2008)
ACL