% Whole-word braces in the title keep the proper noun capitalised under sentence-casing styles.
@inproceedings{repo-etal-2021-beyond,
    title     = {Beyond the {English} Web: Zero-Shot Cross-Lingual and Lightweight Monolingual Classification of Registers},
    author    = {Repo, Liina and
                 Skantsi, Valtteri and
                 R{\"o}nnqvist, Samuel and
                 Hellstr{\"o}m, Saara and
                 Oinonen, Miika and
                 Salmela, Anna and
                 Biber, Douglas and
                 Egbert, Jesse and
                 Pyysalo, Sampo and
                 Laippala, Veronika},
    editor    = {Sorodoc, Ionut-Teodor and
                 Sushil, Madhumita and
                 Takmaz, Ece and
                 Agirre, Eneko},
    booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop},
    month     = apr,
    year      = {2021},
    address   = {Online},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2021.eacl-srw.24},
    doi       = {10.18653/v1/2021.eacl-srw.24},
    pages     = {183--191},
    abstract  = {We explore cross-lingual transfer of register classification for web documents. Registers, that is, text varieties such as blogs or news are one of the primary predictors of linguistic variation and thus affect the automatic processing of language. We introduce two new register-annotated corpora, FreCORE and SweCORE, for French and Swedish. We demonstrate that deep pre-trained language models perform strongly in these languages and outperform previous state-of-the-art in English and Finnish. Specifically, we show 1) that zero-shot cross-lingual transfer from the large English CORE corpus can match or surpass previously published monolingual models, and 2) that lightweight monolingual classification requiring very little training data can reach or surpass our zero-shot performance. We further analyse classification results finding that certain registers continue to pose challenges in particular for cross-lingual transfer.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="repo-etal-2021-beyond">
    <titleInfo>
      <title>Beyond the English Web: Zero-Shot Cross-Lingual and Lightweight Monolingual Classification of Registers</title>
    </titleInfo>
    <!-- Authors, one <name> element each, in citation order. -->
    <name type="personal">
      <namePart type="given">Liina</namePart>
      <namePart type="family">Repo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Valtteri</namePart>
      <namePart type="family">Skantsi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Samuel</namePart>
      <namePart type="family">Rönnqvist</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saara</namePart>
      <namePart type="family">Hellström</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Miika</namePart>
      <namePart type="family">Oinonen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anna</namePart>
      <namePart type="family">Salmela</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Douglas</namePart>
      <namePart type="family">Biber</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jesse</namePart>
      <namePart type="family">Egbert</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sampo</namePart>
      <namePart type="family">Pyysalo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Veronika</namePart>
      <namePart type="family">Laippala</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ionut-Teodor</namePart>
        <namePart type="family">Sorodoc</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Madhumita</namePart>
        <namePart type="family">Sushil</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ece</namePart>
        <namePart type="family">Takmaz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Eneko</namePart>
        <namePart type="family">Agirre</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We explore cross-lingual transfer of register classification for web documents. Registers, that is, text varieties such as blogs or news are one of the primary predictors of linguistic variation and thus affect the automatic processing of language. We introduce two new register-annotated corpora, FreCORE and SweCORE, for French and Swedish. We demonstrate that deep pre-trained language models perform strongly in these languages and outperform previous state-of-the-art in English and Finnish. Specifically, we show 1) that zero-shot cross-lingual transfer from the large English CORE corpus can match or surpass previously published monolingual models, and 2) that lightweight monolingual classification requiring very little training data can reach or surpass our zero-shot performance. We further analyse classification results finding that certain registers continue to pose challenges in particular for cross-lingual transfer.</abstract>
    <identifier type="citekey">repo-etal-2021-beyond</identifier>
    <identifier type="doi">10.18653/v1/2021.eacl-srw.24</identifier>
    <location>
      <url>https://aclanthology.org/2021.eacl-srw.24</url>
    </location>
    <!-- Position of the paper within the host volume (page range). -->
    <part>
      <date>2021-04</date>
      <extent unit="page">
        <start>183</start>
        <end>191</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Beyond the English Web: Zero-Shot Cross-Lingual and Lightweight Monolingual Classification of Registers
%A Repo, Liina
%A Skantsi, Valtteri
%A Rönnqvist, Samuel
%A Hellström, Saara
%A Oinonen, Miika
%A Salmela, Anna
%A Biber, Douglas
%A Egbert, Jesse
%A Pyysalo, Sampo
%A Laippala, Veronika
%Y Sorodoc, Ionut-Teodor
%Y Sushil, Madhumita
%Y Takmaz, Ece
%Y Agirre, Eneko
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F repo-etal-2021-beyond
%X We explore cross-lingual transfer of register classification for web documents. Registers, that is, text varieties such as blogs or news are one of the primary predictors of linguistic variation and thus affect the automatic processing of language. We introduce two new register-annotated corpora, FreCORE and SweCORE, for French and Swedish. We demonstrate that deep pre-trained language models perform strongly in these languages and outperform previous state-of-the-art in English and Finnish. Specifically, we show 1) that zero-shot cross-lingual transfer from the large English CORE corpus can match or surpass previously published monolingual models, and 2) that lightweight monolingual classification requiring very little training data can reach or surpass our zero-shot performance. We further analyse classification results finding that certain registers continue to pose challenges in particular for cross-lingual transfer.
%R 10.18653/v1/2021.eacl-srw.24
%U https://aclanthology.org/2021.eacl-srw.24
%U https://doi.org/10.18653/v1/2021.eacl-srw.24
%P 183-191
Markdown (Informal)
[Beyond the English Web: Zero-Shot Cross-Lingual and Lightweight Monolingual Classification of Registers](https://aclanthology.org/2021.eacl-srw.24) (Repo et al., EACL 2021)
ACL
- Liina Repo, Valtteri Skantsi, Samuel Rönnqvist, Saara Hellström, Miika Oinonen, Anna Salmela, Douglas Biber, Jesse Egbert, Sampo Pyysalo, and Veronika Laippala. 2021. Beyond the English Web: Zero-Shot Cross-Lingual and Lightweight Monolingual Classification of Registers. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, pages 183–191, Online. Association for Computational Linguistics.