@inproceedings{rueter-etal-2025-mansi,
title = "A {M}ansi {FST} and spellchecker",
author = "Rueter, Jack and
Horv{\'a}th, Csilla and
Trosterud, Trond",
editor = "Trosterud, Trond and
Wiechetek, Linda and
Pirinen, Flammie",
booktitle = "Proceedings of the 9th Workshop on Constraint Grammar and Finite State NLP",
month = mar,
year = "2025",
address = "Tallinn, Estonia",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2025.cgmta-1.5/",
pages = "32--37",
ISBN = "978-9908-53-113-7",
abstract = "The article presents a finite state transducer and spellchecker for Mansi, an Ob-Ugric Uralic language spoken in northwestern Siberia. Mansi has a rich but mostly agglutinative morphology, with a morphophonology dominated by sandhi phenomena. With a small set of morphophonological rules (32 twolc rules) and a lexicon consisting of 12,000 Mansi entries and a larger set of proper nouns we were able to build a transducer covering 98.9 {\%} of a large (700k) newspaper corpus. Being a part of the GiellaLT infrastructure, the transducer was turned into a spellchecker. The most common spelling error in Mansi is the omission of length marks on vowels, and for the 1000 most common words containing long vowels, the spellchecker was able to give a correct suggestion as top-five in 98.3 {\%} of the cases, and as first suggestion in 91.3 {\%} of the cases."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rueter-etal-2025-mansi">
<titleInfo>
<title>A Mansi FST and spellchecker</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">Rueter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Csilla</namePart>
<namePart type="family">Horváth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trond</namePart>
<namePart type="family">Trosterud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 9th Workshop on Constraint Grammar and Finite State NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trond</namePart>
<namePart type="family">Trosterud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linda</namePart>
<namePart type="family">Wiechetek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flammie</namePart>
<namePart type="family">Pirinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tallinn, Estonia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-9908-53-113-7</identifier>
</relatedItem>
<abstract>The article presents a finite state transducer and spellchecker for Mansi, an Ob-Ugric Uralic language spoken in northwestern Siberia. Mansi has a rich but mostly agglutinative morphology, with a morphophonology dominated by sandhi phenomena. With a small set of morphophonological rules (32 twolc rules) and a lexicon consisting of 12,000 Mansi entries and a larger set of proper nouns we were able to build a transducer covering 98.9 % of a large (700k) newspaper corpus. Being a part of the GiellaLT infrastructure, the transducer was turned into a spellchecker. The most common spelling error in Mansi is the omission of length marks on vowels, and for the 1000 most common words containing long vowels, the spellchecker was able to give a correct suggestion as top-five in 98.3 % of the cases, and as first suggestion in 91.3 % of the cases.</abstract>
<identifier type="citekey">rueter-etal-2025-mansi</identifier>
<location>
<url>https://aclanthology.org/2025.cgmta-1.5/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>32</start>
<end>37</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Mansi FST and spellchecker
%A Rueter, Jack
%A Horváth, Csilla
%A Trosterud, Trond
%Y Trosterud, Trond
%Y Wiechetek, Linda
%Y Pirinen, Flammie
%S Proceedings of the 9th Workshop on Constraint Grammar and Finite State NLP
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-113-7
%F rueter-etal-2025-mansi
%X The article presents a finite state transducer and spellchecker for Mansi, an Ob-Ugric Uralic language spoken in northwestern Siberia. Mansi has a rich but mostly agglutinative morphology, with a morphophonology dominated by sandhi phenomena. With a small set of morphophonological rules (32 twolc rules) and a lexicon consisting of 12,000 Mansi entries and a larger set of proper nouns we were able to build a transducer covering 98.9 % of a large (700k) newspaper corpus. Being a part of the GiellaLT infrastructure, the transducer was turned into a spellchecker. The most common spelling error in Mansi is the omission of length marks on vowels, and for the 1000 most common words containing long vowels, the spellchecker was able to give a correct suggestion as top-five in 98.3 % of the cases, and as first suggestion in 91.3 % of the cases.
%U https://aclanthology.org/2025.cgmta-1.5/
%P 32-37
Markdown (Informal)
[A Mansi FST and spellchecker](https://aclanthology.org/2025.cgmta-1.5/) (Rueter et al., cgmta 2025)
ACL
- Jack Rueter, Csilla Horváth, and Trond Trosterud. 2025. A Mansi FST and spellchecker. In Proceedings of the 9th Workshop on Constraint Grammar and Finite State NLP, pages 32–37, Tallinn, Estonia. University of Tartu Library.