@inproceedings{poswiata-perelkiewicz-2019-numbers,
title = "Numbers Normalisation in the Inflected Languages: a Case Study of {P}olish",
author = "Po{\'s}wiata, Rafa{\l} and
Pere{\l}kiewicz, Micha{\l}",
editor = "Erjavec, Toma{\v{z}} and
Marci{\'n}czuk, Micha{\l} and
Nakov, Preslav and
Piskorski, Jakub and
Pivovarova, Lidia and
{\v{S}}najder, Jan and
Steinberger, Josef and
Yangarber, Roman",
booktitle = "Proceedings of the 7th Workshop on Balto-Slavic Natural Language Processing",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-3703",
doi = "10.18653/v1/W19-3703",
pages = "23--28",
abstract = "Text normalisation in Text-to-Speech systems is a process of converting written expressions to their spoken forms. This task is complicated because in many cases the normalised form depends on the context. Furthermore, when we analysed languages like Croatian, Lithuanian, Polish, Russian or Slovak there is additional difficulty related to their inflected nature. In this paper we want to show how to deal with this problem for one of these languages: Polish, without having a large dedicated data set and using solutions prepared for other NLP tasks. We limited our study to only numbers expressions, which are the most common non-standard words to normalise. The proposed solution is a combination of morphological tagger and transducer supported by a dictionary of numbers in their spoken forms. The data set used for evaluation is based on the part of 1-million word subset of the National Corpus of Polish. The accuracy of the described approach is presented with a comparison to a simple baseline and two commercial systems: Google Cloud Text-to-Speech and Amazon Polly.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="poswiata-perelkiewicz-2019-numbers">
<titleInfo>
<title>Numbers Normalisation in the Inflected Languages: a Case Study of Polish</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rafał</namePart>
<namePart type="family">Poświata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michał</namePart>
<namePart type="family">Perełkiewicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on Balto-Slavic Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tomaž</namePart>
<namePart type="family">Erjavec</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michał</namePart>
<namePart type="family">Marcińczuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jakub</namePart>
<namePart type="family">Piskorski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lidia</namePart>
<namePart type="family">Pivovarova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Šnajder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josef</namePart>
<namePart type="family">Steinberger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Yangarber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text normalisation in Text-to-Speech systems is a process of converting written expressions to their spoken forms. This task is complicated because in many cases the normalised form depends on the context. Furthermore, when we analysed languages like Croatian, Lithuanian, Polish, Russian or Slovak there is additional difficulty related to their inflected nature. In this paper we want to show how to deal with this problem for one of these languages: Polish, without having a large dedicated data set and using solutions prepared for other NLP tasks. We limited our study to only numbers expressions, which are the most common non-standard words to normalise. The proposed solution is a combination of morphological tagger and transducer supported by a dictionary of numbers in their spoken forms. The data set used for evaluation is based on the part of 1-million word subset of the National Corpus of Polish. The accuracy of the described approach is presented with a comparison to a simple baseline and two commercial systems: Google Cloud Text-to-Speech and Amazon Polly.</abstract>
<identifier type="citekey">poswiata-perelkiewicz-2019-numbers</identifier>
<identifier type="doi">10.18653/v1/W19-3703</identifier>
<location>
<url>https://aclanthology.org/W19-3703</url>
</location>
<part>
<date>2019-08</date>
<extent unit="page">
<start>23</start>
<end>28</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Numbers Normalisation in the Inflected Languages: a Case Study of Polish
%A Poświata, Rafał
%A Perełkiewicz, Michał
%Y Erjavec, Tomaž
%Y Marcińczuk, Michał
%Y Nakov, Preslav
%Y Piskorski, Jakub
%Y Pivovarova, Lidia
%Y Šnajder, Jan
%Y Steinberger, Josef
%Y Yangarber, Roman
%S Proceedings of the 7th Workshop on Balto-Slavic Natural Language Processing
%D 2019
%8 August
%I Association for Computational Linguistics
%C Florence, Italy
%F poswiata-perelkiewicz-2019-numbers
%X Text normalisation in Text-to-Speech systems is a process of converting written expressions to their spoken forms. This task is complicated because in many cases the normalised form depends on the context. Furthermore, when we analysed languages like Croatian, Lithuanian, Polish, Russian or Slovak there is additional difficulty related to their inflected nature. In this paper we want to show how to deal with this problem for one of these languages: Polish, without having a large dedicated data set and using solutions prepared for other NLP tasks. We limited our study to only numbers expressions, which are the most common non-standard words to normalise. The proposed solution is a combination of morphological tagger and transducer supported by a dictionary of numbers in their spoken forms. The data set used for evaluation is based on the part of 1-million word subset of the National Corpus of Polish. The accuracy of the described approach is presented with a comparison to a simple baseline and two commercial systems: Google Cloud Text-to-Speech and Amazon Polly.
%R 10.18653/v1/W19-3703
%U https://aclanthology.org/W19-3703
%U https://doi.org/10.18653/v1/W19-3703
%P 23-28
Markdown (Informal)
[Numbers Normalisation in the Inflected Languages: a Case Study of Polish](https://aclanthology.org/W19-3703) (Poświata & Perełkiewicz, BSNLP 2019)
ACL