@inproceedings{kirov-etal-2016-large,
title = "Very-large Scale Parsing and Normalization of {W}iktionary Morphological Paradigms",
author = "Kirov, Christo and
Sylak-Glassman, John and
Que, Roger and
Yarowsky, David",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1498",
pages = "3121--3126",
abstract = "Wiktionary is a large-scale resource for cross-lingual lexical information with great potential utility for machine translation (MT) and many other NLP tasks, especially automatic morphological analysis and generation. However, it is designed primarily for human viewing rather than machine readability, and presents numerous challenges for generalized parsing and extraction due to a lack of standardized formatting and grammatical descriptor definitions. This paper describes a large-scale effort to automatically extract and standardize the data in Wiktionary and make it available for use by the NLP research community. The methodological innovations include a multidimensional table parsing algorithm, a cross-lexeme, token-frequency-based method of separating inflectional form data from grammatical descriptors, the normalization of grammatical descriptors to a unified annotation scheme that accounts for cross-linguistic diversity, and a verification and correction process that exploits within-language, cross-lexeme table format consistency to minimize human effort. The effort described here resulted in the extraction of a uniquely large normalized resource of nearly 1,000,000 inflectional paradigms across 350 languages. Evaluation shows that even though the data is extracted using a language-independent approach, it is comparable in quantity and quality to data extracted using hand-tuned, language-specific approaches.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kirov-etal-2016-large">
<titleInfo>
<title>Very-large Scale Parsing and Normalization of Wiktionary Morphological Paradigms</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christo</namePart>
<namePart type="family">Kirov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Sylak-Glassman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roger</namePart>
<namePart type="family">Que</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Yarowsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Wiktionary is a large-scale resource for cross-lingual lexical information with great potential utility for machine translation (MT) and many other NLP tasks, especially automatic morphological analysis and generation. However, it is designed primarily for human viewing rather than machine readability, and presents numerous challenges for generalized parsing and extraction due to a lack of standardized formatting and grammatical descriptor definitions. This paper describes a large-scale effort to automatically extract and standardize the data in Wiktionary and make it available for use by the NLP research community. The methodological innovations include a multidimensional table parsing algorithm, a cross-lexeme, token-frequency-based method of separating inflectional form data from grammatical descriptors, the normalization of grammatical descriptors to a unified annotation scheme that accounts for cross-linguistic diversity, and a verification and correction process that exploits within-language, cross-lexeme table format consistency to minimize human effort. The effort described here resulted in the extraction of a uniquely large normalized resource of nearly 1,000,000 inflectional paradigms across 350 languages. Evaluation shows that even though the data is extracted using a language-independent approach, it is comparable in quantity and quality to data extracted using hand-tuned, language-specific approaches.</abstract>
<identifier type="citekey">kirov-etal-2016-large</identifier>
<location>
<url>https://aclanthology.org/L16-1498</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>3121</start>
<end>3126</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Very-large Scale Parsing and Normalization of Wiktionary Morphological Paradigms
%A Kirov, Christo
%A Sylak-Glassman, John
%A Que, Roger
%A Yarowsky, David
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F kirov-etal-2016-large
%X Wiktionary is a large-scale resource for cross-lingual lexical information with great potential utility for machine translation (MT) and many other NLP tasks, especially automatic morphological analysis and generation. However, it is designed primarily for human viewing rather than machine readability, and presents numerous challenges for generalized parsing and extraction due to a lack of standardized formatting and grammatical descriptor definitions. This paper describes a large-scale effort to automatically extract and standardize the data in Wiktionary and make it available for use by the NLP research community. The methodological innovations include a multidimensional table parsing algorithm, a cross-lexeme, token-frequency-based method of separating inflectional form data from grammatical descriptors, the normalization of grammatical descriptors to a unified annotation scheme that accounts for cross-linguistic diversity, and a verification and correction process that exploits within-language, cross-lexeme table format consistency to minimize human effort. The effort described here resulted in the extraction of a uniquely large normalized resource of nearly 1,000,000 inflectional paradigms across 350 languages. Evaluation shows that even though the data is extracted using a language-independent approach, it is comparable in quantity and quality to data extracted using hand-tuned, language-specific approaches.
%U https://aclanthology.org/L16-1498
%P 3121-3126
Markdown (Informal)
[Very-large Scale Parsing and Normalization of Wiktionary Morphological Paradigms](https://aclanthology.org/L16-1498) (Kirov et al., LREC 2016)
ACL