@inproceedings{matos-veliz-etal-2019-comparing,
title = "Comparing {MT} Approaches for Text Normalization",
author = "Matos Veliz, Claudia and
De Clercq, Orphee and
Hoste, Veronique",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)",
month = sep,
year = "2019",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/R19-1086/",
doi = "10.26615/978-954-452-056-4_086",
pages = "740--749",
abstract = "One of the main characteristics of social media data is the use of non-standard language. Since NLP tools have been trained on traditional text material their performance drops when applied to social media data. One way to overcome this is to first perform text normalization. In this work, we apply text normalization to noisy English and Dutch text coming from different social media genres: text messages, message board posts and tweets. We consider the normalization task as a Machine Translation problem and test the two leading paradigms: statistical and neural machine translation. For SMT we explore the added value of varying background corpora for training the language model. For NMT we have a look at data augmentation since the parallel datasets we are working with are limited in size. Our results reveal that when relying on SMT to perform the normalization it is beneficial to use a background corpus that is close to the genre you are normalizing. Regarding NMT, we find that the translations - or normalizations - coming out of this model are far from perfect and that for a low-resource language like Dutch adding additional training data works better than artificially augmenting the data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="matos-veliz-etal-2019-comparing">
<titleInfo>
<title>Comparing MT Approaches for Text Normalization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Matos Veliz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Orphee</namePart>
<namePart type="family">De Clercq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>One of the main characteristics of social media data is the use of non-standard language. Since NLP tools have been trained on traditional text material their performance drops when applied to social media data. One way to overcome this is to first perform text normalization. In this work, we apply text normalization to noisy English and Dutch text coming from different social media genres: text messages, message board posts and tweets. We consider the normalization task as a Machine Translation problem and test the two leading paradigms: statistical and neural machine translation. For SMT we explore the added value of varying background corpora for training the language model. For NMT we have a look at data augmentation since the parallel datasets we are working with are limited in size. Our results reveal that when relying on SMT to perform the normalization it is beneficial to use a background corpus that is close to the genre you are normalizing. Regarding NMT, we find that the translations - or normalizations - coming out of this model are far from perfect and that for a low-resource language like Dutch adding additional training data works better than artificially augmenting the data.</abstract>
<identifier type="citekey">matos-veliz-etal-2019-comparing</identifier>
<identifier type="doi">10.26615/978-954-452-056-4_086</identifier>
<location>
<url>https://aclanthology.org/R19-1086/</url>
</location>
<part>
<date>2019-09</date>
<extent unit="page">
<start>740</start>
<end>749</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Comparing MT Approaches for Text Normalization
%A Matos Veliz, Claudia
%A De Clercq, Orphee
%A Hoste, Veronique
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)
%D 2019
%8 September
%I INCOMA Ltd.
%C Varna, Bulgaria
%F matos-veliz-etal-2019-comparing
%X One of the main characteristics of social media data is the use of non-standard language. Since NLP tools have been trained on traditional text material their performance drops when applied to social media data. One way to overcome this is to first perform text normalization. In this work, we apply text normalization to noisy English and Dutch text coming from different social media genres: text messages, message board posts and tweets. We consider the normalization task as a Machine Translation problem and test the two leading paradigms: statistical and neural machine translation. For SMT we explore the added value of varying background corpora for training the language model. For NMT we have a look at data augmentation since the parallel datasets we are working with are limited in size. Our results reveal that when relying on SMT to perform the normalization it is beneficial to use a background corpus that is close to the genre you are normalizing. Regarding NMT, we find that the translations - or normalizations - coming out of this model are far from perfect and that for a low-resource language like Dutch adding additional training data works better than artificially augmenting the data.
%R 10.26615/978-954-452-056-4_086
%U https://aclanthology.org/R19-1086/
%U https://doi.org/10.26615/978-954-452-056-4_086
%P 740-749
Markdown (Informal)
[Comparing MT Approaches for Text Normalization](https://aclanthology.org/R19-1086/) (Matos Veliz et al., RANLP 2019)
ACL
- Claudia Matos Veliz, Orphee De Clercq, and Veronique Hoste. 2019. Comparing MT Approaches for Text Normalization. In Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019), pages 740–749, Varna, Bulgaria. INCOMA Ltd..