BibTeX
@article{wang-eisner-2018-surface,
    title = "Surface Statistics of an Unknown Language Indicate How to Parse It",
    author = "Wang, Dingquan and
      Eisner, Jason",
    editor = "Lee, Lillian and
      Johnson, Mark and
      Toutanova, Kristina and
      Roark, Brian",
    journal = "Transactions of the Association for Computational Linguistics",
    volume = "6",
    year = "2018",
    address = "Cambridge, MA",
    publisher = "MIT Press",
    url = "https://aclanthology.org/Q18-1046",
    doi = "10.1162/tacl_a_00248",
    pages = "667--685",
    abstract = "We introduce a novel framework for delexicalized dependency parsing in a new language. We show that useful features of the target language can be extracted automatically from an unparsed corpus, which consists only of gold part-of-speech (POS) sequences. Providing these features to our neural parser enables it to parse sequences like those in the corpus. Strikingly, our system has no supervision in the target language. Rather, it is a multilingual system that is trained end-to-end on a variety of other languages, so it learns a feature extractor that works well. We show experimentally across multiple languages: (1) Features computed from the unparsed corpus improve parsing accuracy. (2) Including thousands of synthetic languages in the training yields further improvement. (3) Despite being computed from unparsed corpora, our learned task-specific features beat previous work{'}s interpretable typological features that require parsed corpora or expert categorization of the language. Our best method improved attachment scores on held-out test languages by an average of 5.6 percentage points over past work that does not inspect the unparsed data (McDonald et al., 2011), and by 20.7 points over past {``}grammar induction{''} work that does not use training languages (Naseem et al., 2010).",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-eisner-2018-surface">
    <titleInfo>
        <title>Surface Statistics of an Unknown Language Indicate How to Parse It</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Dingquan</namePart>
        <namePart type="family">Wang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Jason</namePart>
        <namePart type="family">Eisner</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2018</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
        <titleInfo>
            <title>Transactions of the Association for Computational Linguistics</title>
        </titleInfo>
        <originInfo>
            <issuance>continuing</issuance>
            <publisher>MIT Press</publisher>
            <place>
                <placeTerm type="text">Cambridge, MA</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">periodical</genre>
        <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>We introduce a novel framework for delexicalized dependency parsing in a new language. We show that useful features of the target language can be extracted automatically from an unparsed corpus, which consists only of gold part-of-speech (POS) sequences. Providing these features to our neural parser enables it to parse sequences like those in the corpus. Strikingly, our system has no supervision in the target language. Rather, it is a multilingual system that is trained end-to-end on a variety of other languages, so it learns a feature extractor that works well. We show experimentally across multiple languages: (1) Features computed from the unparsed corpus improve parsing accuracy. (2) Including thousands of synthetic languages in the training yields further improvement. (3) Despite being computed from unparsed corpora, our learned task-specific features beat previous work’s interpretable typological features that require parsed corpora or expert categorization of the language. Our best method improved attachment scores on held-out test languages by an average of 5.6 percentage points over past work that does not inspect the unparsed data (McDonald et al., 2011), and by 20.7 points over past “grammar induction” work that does not use training languages (Naseem et al., 2010).</abstract>
    <identifier type="citekey">wang-eisner-2018-surface</identifier>
    <identifier type="doi">10.1162/tacl_a_00248</identifier>
    <location>
        <url>https://aclanthology.org/Q18-1046</url>
    </location>
    <part>
        <date>2018</date>
        <detail type="volume"><number>6</number></detail>
        <extent unit="page">
            <start>667</start>
            <end>685</end>
        </extent>
    </part>
</mods>
</modsCollection>
Endnote
%0 Journal Article
%T Surface Statistics of an Unknown Language Indicate How to Parse It
%A Wang, Dingquan
%A Eisner, Jason
%J Transactions of the Association for Computational Linguistics
%D 2018
%V 6
%I MIT Press
%C Cambridge, MA
%F wang-eisner-2018-surface
%X We introduce a novel framework for delexicalized dependency parsing in a new language. We show that useful features of the target language can be extracted automatically from an unparsed corpus, which consists only of gold part-of-speech (POS) sequences. Providing these features to our neural parser enables it to parse sequences like those in the corpus. Strikingly, our system has no supervision in the target language. Rather, it is a multilingual system that is trained end-to-end on a variety of other languages, so it learns a feature extractor that works well. We show experimentally across multiple languages: (1) Features computed from the unparsed corpus improve parsing accuracy. (2) Including thousands of synthetic languages in the training yields further improvement. (3) Despite being computed from unparsed corpora, our learned task-specific features beat previous work’s interpretable typological features that require parsed corpora or expert categorization of the language. Our best method improved attachment scores on held-out test languages by an average of 5.6 percentage points over past work that does not inspect the unparsed data (McDonald et al., 2011), and by 20.7 points over past “grammar induction” work that does not use training languages (Naseem et al., 2010).
%R 10.1162/tacl_a_00248
%U https://aclanthology.org/Q18-1046
%U https://doi.org/10.1162/tacl_a_00248
%P 667-685
Markdown (Informal)
[Surface Statistics of an Unknown Language Indicate How to Parse It](https://aclanthology.org/Q18-1046) (Wang & Eisner, TACL 2018)
ACL
Dingquan Wang and Jason Eisner. 2018. Surface Statistics of an Unknown Language Indicate How to Parse It. Transactions of the Association for Computational Linguistics, 6:667–685.