BibTeX

@inproceedings{samuel-straka-2021-ufal,
    title = "{{\'U}FAL} at {M}ulti{L}ex{N}orm 2021: Improving Multilingual Lexical Normalization by Fine-tuning {B}y{T}5",
    author = "Samuel, David and
      Straka, Milan",
    editor = "Xu, Wei and
      Ritter, Alan and
      Baldwin, Tim and
      Rahimi, Afshin",
    booktitle = "Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)",
    month = nov,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.wnut-1.54/",
    doi = "10.18653/v1/2021.wnut-1.54",
    pages = "483--492",
    abstract = "We present the winning entry to the Multilingual Lexical Normalization (MultiLexNorm) shared task at W-NUT 2021 (van der Goot et al., 2021a), which evaluates lexical-normalization systems on 12 social media datasets in 11 languages. We base our solution on a pre-trained byte-level language model, ByT5 (Xue et al., 2021a), which we further pre-train on synthetic data and then fine-tune on authentic normalization data. Our system achieves the best performance by a wide margin in intrinsic evaluation, and also the best performance in extrinsic evaluation through dependency parsing. The source code is released at \url{https://github.com/ufal/multilexnorm2021} and the fine-tuned models at \url{https://huggingface.co/ufal}."
}
MODS XML

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="samuel-straka-2021-ufal">
    <titleInfo>
      <title>ÚFAL at MultiLexNorm 2021: Improving Multilingual Lexical Normalization by Fine-tuning ByT5</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="family">Samuel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Milan</namePart>
      <namePart type="family">Straka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wei</namePart>
        <namePart type="family">Xu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tim</namePart>
        <namePart type="family">Baldwin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Afshin</namePart>
        <namePart type="family">Rahimi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We present the winning entry to the Multilingual Lexical Normalization (MultiLexNorm) shared task at W-NUT 2021 (van der Goot et al., 2021a), which evaluates lexical-normalization systems on 12 social media datasets in 11 languages. We base our solution on a pre-trained byte-level language model, ByT5 (Xue et al., 2021a), which we further pre-train on synthetic data and then fine-tune on authentic normalization data. Our system achieves the best performance by a wide margin in intrinsic evaluation, and also the best performance in extrinsic evaluation through dependency parsing. The source code is released at https://github.com/ufal/multilexnorm2021 and the fine-tuned models at https://huggingface.co/ufal.</abstract>
    <identifier type="citekey">samuel-straka-2021-ufal</identifier>
    <identifier type="doi">10.18653/v1/2021.wnut-1.54</identifier>
    <location>
      <url>https://aclanthology.org/2021.wnut-1.54/</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>483</start>
        <end>492</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote

%0 Conference Proceedings
%T ÚFAL at MultiLexNorm 2021: Improving Multilingual Lexical Normalization by Fine-tuning ByT5
%A Samuel, David
%A Straka, Milan
%Y Xu, Wei
%Y Ritter, Alan
%Y Baldwin, Tim
%Y Rahimi, Afshin
%S Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F samuel-straka-2021-ufal
%X We present the winning entry to the Multilingual Lexical Normalization (MultiLexNorm) shared task at W-NUT 2021 (van der Goot et al., 2021a), which evaluates lexical-normalization systems on 12 social media datasets in 11 languages. We base our solution on a pre-trained byte-level language model, ByT5 (Xue et al., 2021a), which we further pre-train on synthetic data and then fine-tune on authentic normalization data. Our system achieves the best performance by a wide margin in intrinsic evaluation, and also the best performance in extrinsic evaluation through dependency parsing. The source code is released at https://github.com/ufal/multilexnorm2021 and the fine-tuned models at https://huggingface.co/ufal.
%R 10.18653/v1/2021.wnut-1.54
%U https://aclanthology.org/2021.wnut-1.54/
%U https://doi.org/10.18653/v1/2021.wnut-1.54
%P 483-492
Markdown (Informal)
[ÚFAL at MultiLexNorm 2021: Improving Multilingual Lexical Normalization by Fine-tuning ByT5](https://aclanthology.org/2021.wnut-1.54/) (Samuel & Straka, WNUT 2021)
ACL

David Samuel and Milan Straka. 2021. [ÚFAL at MultiLexNorm 2021: Improving Multilingual Lexical Normalization by Fine-tuning ByT5](https://aclanthology.org/2021.wnut-1.54/). In *Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)*, pages 483–492, Online. Association for Computational Linguistics.
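The abstract notes that the fine-tuned models are published at https://huggingface.co/ufal. Below is a minimal sketch, not from the paper, of loading one of these checkpoints with the Hugging Face `transformers` library; the model id is a hypothetical placeholder (browse the `ufal` organization page for actual checkpoint names), and the task-specific input format the system expects is documented in the GitHub repository linked above.

```python
# A minimal sketch (not from the paper): loading a released fine-tuned
# ByT5 normalization model with Hugging Face transformers.
# NOTE: the model id below is a hypothetical placeholder; see
# https://huggingface.co/ufal for the actual checkpoint names, and
# https://github.com/ufal/multilexnorm2021 for the exact input format
# the system was trained on.
from transformers import AutoTokenizer, T5ForConditionalGeneration

model_id = "ufal/byt5-small-multilexnorm2021-en"  # hypothetical id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = T5ForConditionalGeneration.from_pretrained(model_id)

# ByT5 is byte-level: the tokenizer maps raw UTF-8 bytes to ids directly,
# with no subword vocabulary, which is what makes it robust to noisy input.
inputs = tokenizer("new yaaawk is cooool", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```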