@inproceedings{millour-fort-2019-unsupervised,
title = "Unsupervised Data Augmentation for Less-Resourced Languages with no Standardized Spelling",
author = {Millour, Alice and
Fort, Kar{\"e}n},
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)",
month = sep,
year = "2019",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/R19-1090",
doi = "10.26615/978-954-452-056-4_090",
pages = "776--784",
abstract = "Building representative linguistic resources and NLP tools for non-standardized languages is challenging: when spelling is not determined by a norm, multiple written forms can be encountered for a given word, inducing a large proportion of out-of-vocabulary words. To embrace this diversity, we propose a methodology based on crowdsourced alternative spellings we use to extract rules applied to match OOV words with one of their spelling variants. This virtuous process enables the unsupervised augmentation of multi-variant lexicons without expert rule definition. We apply this multilingual methodology on Alsatian, a French regional language and provide an intrinsic evaluation of the correctness of the variants pairs, and an extrinsic evaluation on a downstream task. We show that in a low-resource scenario, 145 inital pairs can lead to the generation of 876 additional variant pairs, and a diminution of OOV words improving the part-of-speech tagging performance by 1 to 4{\%}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="millour-fort-2019-unsupervised">
<titleInfo>
<title>Unsupervised Data Augmentation for Less-Resourced Languages with no Standardized Spelling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alice</namePart>
<namePart type="family">Millour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karën</namePart>
<namePart type="family">Fort</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Building representative linguistic resources and NLP tools for non-standardized languages is challenging: when spelling is not determined by a norm, multiple written forms can be encountered for a given word, inducing a large proportion of out-of-vocabulary words. To embrace this diversity, we propose a methodology based on crowdsourced alternative spellings we use to extract rules applied to match OOV words with one of their spelling variants. This virtuous process enables the unsupervised augmentation of multi-variant lexicons without expert rule definition. We apply this multilingual methodology on Alsatian, a French regional language and provide an intrinsic evaluation of the correctness of the variants pairs, and an extrinsic evaluation on a downstream task. We show that in a low-resource scenario, 145 inital pairs can lead to the generation of 876 additional variant pairs, and a diminution of OOV words improving the part-of-speech tagging performance by 1 to 4%.</abstract>
<identifier type="citekey">millour-fort-2019-unsupervised</identifier>
<identifier type="doi">10.26615/978-954-452-056-4_090</identifier>
<location>
<url>https://aclanthology.org/R19-1090</url>
</location>
<part>
<date>2019-09</date>
<extent unit="page">
<start>776</start>
<end>784</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unsupervised Data Augmentation for Less-Resourced Languages with no Standardized Spelling
%A Millour, Alice
%A Fort, Karën
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)
%D 2019
%8 September
%I INCOMA Ltd.
%C Varna, Bulgaria
%F millour-fort-2019-unsupervised
%X Building representative linguistic resources and NLP tools for non-standardized languages is challenging: when spelling is not determined by a norm, multiple written forms can be encountered for a given word, inducing a large proportion of out-of-vocabulary words. To embrace this diversity, we propose a methodology based on crowdsourced alternative spellings we use to extract rules applied to match OOV words with one of their spelling variants. This virtuous process enables the unsupervised augmentation of multi-variant lexicons without expert rule definition. We apply this multilingual methodology on Alsatian, a French regional language and provide an intrinsic evaluation of the correctness of the variants pairs, and an extrinsic evaluation on a downstream task. We show that in a low-resource scenario, 145 inital pairs can lead to the generation of 876 additional variant pairs, and a diminution of OOV words improving the part-of-speech tagging performance by 1 to 4%.
%R 10.26615/978-954-452-056-4_090
%U https://aclanthology.org/R19-1090
%U https://doi.org/10.26615/978-954-452-056-4_090
%P 776-784
Markdown (Informal)
[Unsupervised Data Augmentation for Less-Resourced Languages with no Standardized Spelling](https://aclanthology.org/R19-1090) (Millour & Fort, RANLP 2019)
ACL