% Workshop paper, ACL Anthology record 2022.deeplo-1.7 (pages 61--66 of the
% proceedings named in `booktitle`).
% Proper nouns in the title are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitals without splitting
% a word across a brace boundary.
% NOTE(review): `address` holds "Hybrid", which describes the event format
% rather than a city -- confirm whether a real location should be recorded.
@inproceedings{hamalainen-alnajjar-and-tuuli-tuisk-2022-help,
  title     = {Help from the Neighbors: {Estonian} Dialect Normalization Using a {Finnish} Dialect Generator},
  author    = {H{\"a}m{\"a}l{\"a}inen, Mika and
               Alnajjar, Khalid and
               Tuisk, Tuuli},
  editor    = {Cherry, Colin and
               Fan, Angela and
               Foster, George and
               Haffari, Gholamreza (Reza) and
               Khadivi, Shahram and
               Peng, Nanyun (Violet) and
               Ren, Xiang and
               Shareghi, Ehsan and
               Swayamdipta, Swabha},
  booktitle = {Proceedings of the Third Workshop on Deep Learning for Low-Resource Natural Language Processing},
  month     = jul,
  year      = {2022},
  address   = {Hybrid},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.deeplo-1.7},
  doi       = {10.18653/v1/2022.deeplo-1.7},
  pages     = {61--66},
  abstract  = {While standard Estonian is not a low-resourced language, the different dialects of the language are under-resourced from the point of view of NLP, given that there are no vast hand normalized resources available for training a machine learning model to normalize dialectal Estonian to standard Estonian. In this paper, we crawl a small corpus of parallel dialectal Estonian - standard Estonian sentences. In addition, we take a savvy approach of generating more synthetic training data for the normalization task by using an existing dialect generator model built for Finnish to {``}dialectalize{''} standard Estonian sentences from the Universal Dependencies tree banks. Our BERT based normalization model achieves a word error rate that is 26.49 points lower when using both the synthetic data and Estonian data in comparison to training the model with only the available Estonian data. Our results suggest that synthetic data generated by a model trained on a more resourced related language can indeed boost the results for a less resourced language.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record describing one workshop paper; the ID doubles as the citation key. -->
  <mods ID="hamalainen-alnajjar-and-tuuli-tuisk-2022-help">
    <titleInfo>
      <title>Help from the Neighbors: Estonian Dialect Normalization Using a Finnish Dialect Generator</title>
    </titleInfo>

    <!-- Authors of the paper, in the order they are listed. -->
    <name type="personal">
      <namePart type="given">Mika</namePart>
      <namePart type="family">Hämäläinen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Khalid</namePart>
      <namePart type="family">Alnajjar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tuuli</namePart>
      <namePart type="family">Tuisk</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper itself (year and month only). -->
    <originInfo>
      <dateIssued>2022-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume containing the paper, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Third Workshop on Deep Learning for Low-Resource Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Colin</namePart>
        <namePart type="family">Cherry</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Angela</namePart>
        <namePart type="family">Fan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">George</namePart>
        <namePart type="family">Foster</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <!-- NOTE(review): the parenthesised alias is carried as a second given-name part; confirm that consumers expect this. -->
      <name type="personal">
        <namePart type="given">Gholamreza</namePart>
        <namePart type="given">(Reza)</namePart>
        <namePart type="family">Haffari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Shahram</namePart>
        <namePart type="family">Khadivi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <!-- NOTE(review): same alias-as-given-name pattern as above. -->
      <name type="personal">
        <namePart type="given">Nanyun</namePart>
        <namePart type="given">(Violet)</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Xiang</namePart>
        <namePart type="family">Ren</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ehsan</namePart>
        <namePart type="family">Shareghi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Swabha</namePart>
        <namePart type="family">Swayamdipta</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Hybrid</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>While standard Estonian is not a low-resourced language, the different dialects of the language are under-resourced from the point of view of NLP, given that there are no vast hand normalized resources available for training a machine learning model to normalize dialectal Estonian to standard Estonian. In this paper, we crawl a small corpus of parallel dialectal Estonian - standard Estonian sentences. In addition, we take a savvy approach of generating more synthetic training data for the normalization task by using an existing dialect generator model built for Finnish to “dialectalize” standard Estonian sentences from the Universal Dependencies tree banks. Our BERT based normalization model achieves a word error rate that is 26.49 points lower when using both the synthetic data and Estonian data in comparison to training the model with only the available Estonian data. Our results suggest that synthetic data generated by a model trained on a more resourced related language can indeed boost the results for a less resourced language.</abstract>

    <!-- Identifiers and access location for the paper. -->
    <identifier type="citekey">hamalainen-alnajjar-and-tuuli-tuisk-2022-help</identifier>
    <identifier type="doi">10.18653/v1/2022.deeplo-1.7</identifier>
    <location>
      <url>https://aclanthology.org/2022.deeplo-1.7</url>
    </location>

    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2022-07</date>
      <extent unit="page">
        <start>61</start>
        <end>66</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Help from the Neighbors: Estonian Dialect Normalization Using a Finnish Dialect Generator
%A Hämäläinen, Mika
%A Alnajjar, Khalid
%A Tuisk, Tuuli
%Y Cherry, Colin
%Y Fan, Angela
%Y Foster, George
%Y Haffari, Gholamreza (Reza)
%Y Khadivi, Shahram
%Y Peng, Nanyun (Violet)
%Y Ren, Xiang
%Y Shareghi, Ehsan
%Y Swayamdipta, Swabha
%S Proceedings of the Third Workshop on Deep Learning for Low-Resource Natural Language Processing
%D 2022
%8 July
%I Association for Computational Linguistics
%C Hybrid
%F hamalainen-alnajjar-and-tuuli-tuisk-2022-help
%X While standard Estonian is not a low-resourced language, the different dialects of the language are under-resourced from the point of view of NLP, given that there are no vast hand normalized resources available for training a machine learning model to normalize dialectal Estonian to standard Estonian. In this paper, we crawl a small corpus of parallel dialectal Estonian - standard Estonian sentences. In addition, we take a savvy approach of generating more synthetic training data for the normalization task by using an existing dialect generator model built for Finnish to “dialectalize” standard Estonian sentences from the Universal Dependencies tree banks. Our BERT based normalization model achieves a word error rate that is 26.49 points lower when using both the synthetic data and Estonian data in comparison to training the model with only the available Estonian data. Our results suggest that synthetic data generated by a model trained on a more resourced related language can indeed boost the results for a less resourced language.
%R 10.18653/v1/2022.deeplo-1.7
%U https://aclanthology.org/2022.deeplo-1.7
%U https://doi.org/10.18653/v1/2022.deeplo-1.7
%P 61-66
Markdown (Informal)
[Help from the Neighbors: Estonian Dialect Normalization Using a Finnish Dialect Generator](https://aclanthology.org/2022.deeplo-1.7) (Hämäläinen et al., DeepLo 2022)
ACL