@inproceedings{pierrehumbert-granell-2018-hapax,
    title = "On Hapax Legomena and Morphological Productivity",
    author = "Pierrehumbert, Janet and
      Granell, Ramon",
    editor = "Kuebler, Sandra and
      Nicolai, Garrett",
    booktitle = "Proceedings of the Fifteenth Workshop on Computational Research in Phonetics, Phonology, and Morphology",
    month = oct,
    year = "2018",
    address = "Brussels, Belgium",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/W18-5814",
    doi = "10.18653/v1/W18-5814",
    pages = "125--130",
    abstract = "Quantifying and predicting morphological productivity is a long-standing challenge in corpus linguistics and psycholinguistics. The same challenge reappears in natural language processing in the context of handling words that were not seen in the training set (out-of-vocabulary, or OOV, words). Prior research showed that a good indicator of the productivity of a morpheme is the number of words involving it that occur exactly once (the \textit{hapax legomena}). A technical connection was adduced between this result and Good-Turing smoothing, which assigns probability mass to unseen events on the basis of the simplifying assumption that word frequencies are stationary. In a large-scale study of 133 affixes in Wikipedia, we develop evidence that success in fact depends on tapping the frequency range in which the assumptions of Good-Turing are violated.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="pierrehumbert-granell-2018-hapax">
    <titleInfo>
      <title>On Hapax Legomena and Morphological Productivity</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Janet</namePart>
      <namePart type="family">Pierrehumbert</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ramon</namePart>
      <namePart type="family">Granell</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2018-10</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifteenth Workshop on Computational Research in Phonetics, Phonology, and Morphology</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sandra</namePart>
        <namePart type="family">Kuebler</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Garrett</namePart>
        <namePart type="family">Nicolai</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Brussels, Belgium</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Quantifying and predicting morphological productivity is a long-standing challenge in corpus linguistics and psycholinguistics. The same challenge reappears in natural language processing in the context of handling words that were not seen in the training set (out-of-vocabulary, or OOV, words). Prior research showed that a good indicator of the productivity of a morpheme is the number of words involving it that occur exactly once (the hapax legomena). A technical connection was adduced between this result and Good-Turing smoothing, which assigns probability mass to unseen events on the basis of the simplifying assumption that word frequencies are stationary. In a large-scale study of 133 affixes in Wikipedia, we develop evidence that success in fact depends on tapping the frequency range in which the assumptions of Good-Turing are violated.</abstract>
    <identifier type="citekey">pierrehumbert-granell-2018-hapax</identifier>
    <identifier type="doi">10.18653/v1/W18-5814</identifier>
    <location>
      <url>https://aclanthology.org/W18-5814</url>
    </location>
    <part>
      <date>2018-10</date>
      <extent unit="page">
        <start>125</start>
        <end>130</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T On Hapax Legomena and Morphological Productivity
%A Pierrehumbert, Janet
%A Granell, Ramon
%Y Kuebler, Sandra
%Y Nicolai, Garrett
%S Proceedings of the Fifteenth Workshop on Computational Research in Phonetics, Phonology, and Morphology
%D 2018
%8 October
%I Association for Computational Linguistics
%C Brussels, Belgium
%F pierrehumbert-granell-2018-hapax
%X Quantifying and predicting morphological productivity is a long-standing challenge in corpus linguistics and psycholinguistics. The same challenge reappears in natural language processing in the context of handling words that were not seen in the training set (out-of-vocabulary, or OOV, words). Prior research showed that a good indicator of the productivity of a morpheme is the number of words involving it that occur exactly once (the hapax legomena). A technical connection was adduced between this result and Good-Turing smoothing, which assigns probability mass to unseen events on the basis of the simplifying assumption that word frequencies are stationary. In a large-scale study of 133 affixes in Wikipedia, we develop evidence that success in fact depends on tapping the frequency range in which the assumptions of Good-Turing are violated.
%R 10.18653/v1/W18-5814
%U https://aclanthology.org/W18-5814
%U https://doi.org/10.18653/v1/W18-5814
%P 125-130
Markdown (Informal)
[On Hapax Legomena and Morphological Productivity](https://aclanthology.org/W18-5814) (Pierrehumbert & Granell, EMNLP 2018)
ACL
Janet Pierrehumbert and Ramon Granell. 2018. On Hapax Legomena and Morphological Productivity. In Proceedings of the Fifteenth Workshop on Computational Research in Phonetics, Phonology, and Morphology, pages 125–130, Brussels, Belgium. Association for Computational Linguistics.
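
The quantities named in the abstract can be written down explicitly. The sketch below is illustrative only and is not part of the bibliographic record or the paper; the notation (N, N_1, and the affix-conditioned counts) is assumed from the standard productivity and smoothing literature rather than taken from the entry above.

% Hapax-based productivity of an affix (Baayen-style ratio; notation assumed):
%   N_aff  = number of corpus tokens containing the affix
%   n1_aff = number of hapax legomena (frequency-1 types) containing the affix
\mathcal{P}_{\mathrm{aff}} = \frac{n_{1,\mathrm{aff}}}{N_{\mathrm{aff}}}

% Good-Turing estimate of the total probability mass of unseen types,
% derived under the assumption that word frequencies are stationary:
p_0 \approx \frac{N_1}{N}

The formal parallel between these two ratios is the technical connection the abstract refers to; the paper's finding is that the predictive value of the hapax count comes from a frequency range in which the stationarity assumption behind the Good-Turing estimate does not hold.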