@inproceedings{ronnqvist-etal-2021-multilingual,
title = "Multilingual and Zero-Shot is Closing in on Monolingual Web Register Classification",
author = {R{\"o}nnqvist, Samuel and
Skantsi, Valtteri and
Oinonen, Miika and
Laippala, Veronika},
editor = "Dobnik, Simon and
{\O}vrelid, Lilja",
booktitle = "Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)",
month = may # " 31--2 " # jun,
year = "2021",
address = "Reykjavik, Iceland (Online)",
publisher = {Link{\"o}ping University Electronic Press, Sweden},
url = "https://aclanthology.org/2021.nodalida-main.16",
pages = "157--165",
abstract = "This article studies register classification of documents from the unrestricted web, such as news articles or opinion blogs, in a multilingual setting, exploring both the benefit of training on multiple languages and the capabilities for zero-shot cross-lingual transfer. While the wide range of linguistic variation found on the web poses challenges for register classification, recent studies have shown that good levels of cross-lingual transfer from the extensive English CORE corpus to other languages can be achieved. In this study, we show that training on multiple languages 1) benefits languages with limited amounts of register-annotated data, 2) on average achieves performance on par with monolingual models, and 3) greatly improves upon previous zero-shot results in Finnish, French and Swedish. The best results are achieved with the multilingual XLM-R model. As data, we use the CORE corpus series featuring register annotated data from the unrestricted web.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ronnqvist-etal-2021-multilingual">
<titleInfo>
<title>Multilingual and Zero-Shot is Closing in on Monolingual Web Register Classification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Rönnqvist</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valtteri</namePart>
<namePart type="family">Skantsi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miika</namePart>
<namePart type="family">Oinonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronika</namePart>
<namePart type="family">Laippala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-may 31–2 jun</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lilja</namePart>
<namePart type="family">Øvrelid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Linköping University Electronic Press, Sweden</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This article studies register classification of documents from the unrestricted web, such as news articles or opinion blogs, in a multilingual setting, exploring both the benefit of training on multiple languages and the capabilities for zero-shot cross-lingual transfer. While the wide range of linguistic variation found on the web poses challenges for register classification, recent studies have shown that good levels of cross-lingual transfer from the extensive English CORE corpus to other languages can be achieved. In this study, we show that training on multiple languages 1) benefits languages with limited amounts of register-annotated data, 2) on average achieves performance on par with monolingual models, and 3) greatly improves upon previous zero-shot results in Finnish, French and Swedish. The best results are achieved with the multilingual XLM-R model. As data, we use the CORE corpus series featuring register annotated data from the unrestricted web.</abstract>
<identifier type="citekey">ronnqvist-etal-2021-multilingual</identifier>
<location>
<url>https://aclanthology.org/2021.nodalida-main.16</url>
</location>
<part>
<date>2021-may 31–2 jun</date>
<extent unit="page">
<start>157</start>
<end>165</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multilingual and Zero-Shot is Closing in on Monolingual Web Register Classification
%A Rönnqvist, Samuel
%A Skantsi, Valtteri
%A Oinonen, Miika
%A Laippala, Veronika
%Y Dobnik, Simon
%Y Øvrelid, Lilja
%S Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)
%D 2021
%8 may 31–2 jun
%I Linköping University Electronic Press, Sweden
%C Reykjavik, Iceland (Online)
%F ronnqvist-etal-2021-multilingual
%X This article studies register classification of documents from the unrestricted web, such as news articles or opinion blogs, in a multilingual setting, exploring both the benefit of training on multiple languages and the capabilities for zero-shot cross-lingual transfer. While the wide range of linguistic variation found on the web poses challenges for register classification, recent studies have shown that good levels of cross-lingual transfer from the extensive English CORE corpus to other languages can be achieved. In this study, we show that training on multiple languages 1) benefits languages with limited amounts of register-annotated data, 2) on average achieves performance on par with monolingual models, and 3) greatly improves upon previous zero-shot results in Finnish, French and Swedish. The best results are achieved with the multilingual XLM-R model. As data, we use the CORE corpus series featuring register annotated data from the unrestricted web.
%U https://aclanthology.org/2021.nodalida-main.16
%P 157-165
Markdown (Informal)
[Multilingual and Zero-Shot is Closing in on Monolingual Web Register Classification](https://aclanthology.org/2021.nodalida-main.16) (Rönnqvist et al., NoDaLiDa 2021)
ACL