@inproceedings{rosales-nunez-etal-2021-noisy,
title = "Noisy {UGC} Translation at the Character Level: Revisiting Open-Vocabulary Capabilities and Robustness of Char-Based Models",
author = "Rosales N{\'u}{\~n}ez, Jos{\'e} Carlos and
Wisniewski, Guillaume and
Seddah, Djam{\'e}",
booktitle = "Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.wnut-1.23",
doi = "10.18653/v1/2021.wnut-1.23",
pages = "199--211",
abstract = "This work explores the capacities of character-based Neural Machine Translation to translate noisy User-Generated Content (UGC) with a strong focus on exploring the limits of such approaches to handle productive UGC phenomena, which almost by definition, cannot be seen at training time. Within a strict zero-shot scenario, we first study the detrimental impact on translation performance of various user-generated content phenomena on a small annotated dataset we developed and then show that such models are indeed incapable of handling unknown letters, which leads to catastrophic translation failure once such characters are encountered. We further confirm this behavior with a simple, yet insightful, copy task experiment and highlight the importance of reducing the vocabulary size hyper-parameter to increase the robustness of character-based models for machine translation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rosales-nunez-etal-2021-noisy">
<titleInfo>
<title>Noisy UGC Translation at the Character Level: Revisiting Open-Vocabulary Capabilities and Robustness of Char-Based Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">José</namePart>
<namePart type="given">Carlos</namePart>
<namePart type="family">Rosales Núñez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guillaume</namePart>
<namePart type="family">Wisniewski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Djamé</namePart>
<namePart type="family">Seddah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This work explores the capacities of character-based Neural Machine Translation to translate noisy User-Generated Content (UGC) with a strong focus on exploring the limits of such approaches to handle productive UGC phenomena, which almost by definition, cannot be seen at training time. Within a strict zero-shot scenario, we first study the detrimental impact on translation performance of various user-generated content phenomena on a small annotated dataset we developed and then show that such models are indeed incapable of handling unknown letters, which leads to catastrophic translation failure once such characters are encountered. We further confirm this behavior with a simple, yet insightful, copy task experiment and highlight the importance of reducing the vocabulary size hyper-parameter to increase the robustness of character-based models for machine translation.</abstract>
<identifier type="citekey">rosales-nunez-etal-2021-noisy</identifier>
<identifier type="doi">10.18653/v1/2021.wnut-1.23</identifier>
<location>
<url>https://aclanthology.org/2021.wnut-1.23</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>199</start>
<end>211</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Noisy UGC Translation at the Character Level: Revisiting Open-Vocabulary Capabilities and Robustness of Char-Based Models
%A Rosales Núñez, José Carlos
%A Wisniewski, Guillaume
%A Seddah, Djamé
%S Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F rosales-nunez-etal-2021-noisy
%X This work explores the capacities of character-based Neural Machine Translation to translate noisy User-Generated Content (UGC) with a strong focus on exploring the limits of such approaches to handle productive UGC phenomena, which almost by definition, cannot be seen at training time. Within a strict zero-shot scenario, we first study the detrimental impact on translation performance of various user-generated content phenomena on a small annotated dataset we developed and then show that such models are indeed incapable of handling unknown letters, which leads to catastrophic translation failure once such characters are encountered. We further confirm this behavior with a simple, yet insightful, copy task experiment and highlight the importance of reducing the vocabulary size hyper-parameter to increase the robustness of character-based models for machine translation.
%R 10.18653/v1/2021.wnut-1.23
%U https://aclanthology.org/2021.wnut-1.23
%U https://doi.org/10.18653/v1/2021.wnut-1.23
%P 199-211
Markdown (Informal)
[Noisy UGC Translation at the Character Level: Revisiting Open-Vocabulary Capabilities and Robustness of Char-Based Models](https://aclanthology.org/2021.wnut-1.23) (Rosales Núñez et al., WNUT 2021)
ACL