@inproceedings{bergmanis-goldwater-2017-segmentation,
title = "From Segmentation to Analyses: a Probabilistic Model for Unsupervised Morphology Induction",
author = "Bergmanis, Toms and
Goldwater, Sharon",
editor = "Lapata, Mirella and
Blunsom, Phil and
Koller, Alexander",
booktitle = "Proceedings of the 15th Conference of the {E}uropean Chapter of the Association for Computational Linguistics: Volume 1, Long Papers",
month = apr,
year = "2017",
address = "Valencia, Spain",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/E17-1032",
pages = "337--346",
abstract = "A major motivation for unsupervised morphological analysis is to reduce the sparse data problem in under-resourced languages. Most previous work focus on segmenting surface forms into their constituent morphs (taking: tak +ing), but surface form segmentation does not solve the sparse data problem as the analyses of take and taking are not connected to each other. We present a system that adapts the MorphoChains system (Narasimhan et al., 2015) to provide morphological analyses that aim to abstract over spelling differences in functionally similar morphs. This results in analyses that are not compelled to use all the orthographic material of a word (stopping: stop +ing) or limited to only that material (acidified: acid +ify +ed). On average across six typologically varied languages our system has a similar or better F-score on EMMA (a measure of underlying morpheme accuracy) than three strong baselines; moreover, the total number of distinct morphemes identified by our system is on average 12.8{\%} lower than for Morfessor (Virpioja et al., 2013), a state-of-the-art surface segmentation system.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bergmanis-goldwater-2017-segmentation">
<titleInfo>
<title>From Segmentation to Analyses: a Probabilistic Model for Unsupervised Morphology Induction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Toms</namePart>
<namePart type="family">Bergmanis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharon</namePart>
<namePart type="family">Goldwater</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mirella</namePart>
<namePart type="family">Lapata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Phil</namePart>
<namePart type="family">Blunsom</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Koller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Valencia, Spain</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A major motivation for unsupervised morphological analysis is to reduce the sparse data problem in under-resourced languages. Most previous work focus on segmenting surface forms into their constituent morphs (taking: tak +ing), but surface form segmentation does not solve the sparse data problem as the analyses of take and taking are not connected to each other. We present a system that adapts the MorphoChains system (Narasimhan et al., 2015) to provide morphological analyses that aim to abstract over spelling differences in functionally similar morphs. This results in analyses that are not compelled to use all the orthographic material of a word (stopping: stop +ing) or limited to only that material (acidified: acid +ify +ed). On average across six typologically varied languages our system has a similar or better F-score on EMMA (a measure of underlying morpheme accuracy) than three strong baselines; moreover, the total number of distinct morphemes identified by our system is on average 12.8% lower than for Morfessor (Virpioja et al., 2013), a state-of-the-art surface segmentation system.</abstract>
<identifier type="citekey">bergmanis-goldwater-2017-segmentation</identifier>
<location>
<url>https://aclanthology.org/E17-1032</url>
</location>
<part>
<date>2017-04</date>
<extent unit="page">
<start>337</start>
<end>346</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From Segmentation to Analyses: a Probabilistic Model for Unsupervised Morphology Induction
%A Bergmanis, Toms
%A Goldwater, Sharon
%Y Lapata, Mirella
%Y Blunsom, Phil
%Y Koller, Alexander
%S Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers
%D 2017
%8 April
%I Association for Computational Linguistics
%C Valencia, Spain
%F bergmanis-goldwater-2017-segmentation
%X A major motivation for unsupervised morphological analysis is to reduce the sparse data problem in under-resourced languages. Most previous work focus on segmenting surface forms into their constituent morphs (taking: tak +ing), but surface form segmentation does not solve the sparse data problem as the analyses of take and taking are not connected to each other. We present a system that adapts the MorphoChains system (Narasimhan et al., 2015) to provide morphological analyses that aim to abstract over spelling differences in functionally similar morphs. This results in analyses that are not compelled to use all the orthographic material of a word (stopping: stop +ing) or limited to only that material (acidified: acid +ify +ed). On average across six typologically varied languages our system has a similar or better F-score on EMMA (a measure of underlying morpheme accuracy) than three strong baselines; moreover, the total number of distinct morphemes identified by our system is on average 12.8% lower than for Morfessor (Virpioja et al., 2013), a state-of-the-art surface segmentation system.
%U https://aclanthology.org/E17-1032
%P 337-346
Markdown (Informal)
[From Segmentation to Analyses: a Probabilistic Model for Unsupervised Morphology Induction](https://aclanthology.org/E17-1032) (Bergmanis & Goldwater, EACL 2017)
ACL