@inproceedings{laippala-etal-2022-towards,
title = "Towards better structured and less noisy Web data: Oscar with Register annotations",
author = {Laippala, Veronika and
Salmela, Anna and
R{\"o}nnqvist, Samuel and
Aji, Alham Fikri and
Chang, Li-Hsin and
Dhifallah, Asma and
Goulart, Larissa and
Kortelainen, Henna and
P{\`a}mies, Marc and
Prina Dutra, Deise and
Skantsi, Valtteri and
Sutawika, Lintang and
Pyysalo, Sampo},
booktitle = "Proceedings of the Eighth Workshop on Noisy User-generated Text (W-NUT 2022)",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.wnut-1.23",
pages = "215--221",
abstract = {Web-crawled datasets are known to be noisy, as they feature a wide range of language use covering both user-generated and professionally edited content as well as noise originating from the crawling process. This article presents one solution to reduce this noise by using automatic register (genre) identification -whether the texts are, e.g., forum discussions, lyrical or how-to pages. We apply the multilingual register identification model by R{\"o}nnqvist et al. (2021) and label the widely used Oscar dataset. Additionally, we evaluate the model against eight new languages, showing that the performance is comparable to previous findings on a restricted set of languages. Finally, we present and apply a machine learning method for further cleaning text files originating from Web crawls from remains of boilerplate and other elements not belonging to the main text of the Web page. The register labeled and cleaned dataset covers 351 million documents in 14 languages and is available at https://huggingface.co/datasets/TurkuNLP/register{\_}oscar.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="laippala-etal-2022-towards">
<titleInfo>
<title>Towards better structured and less noisy Web data: Oscar with Register annotations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Veronika</namePart>
<namePart type="family">Laippala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Salmela</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Rönnqvist</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li-Hsin</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asma</namePart>
<namePart type="family">Dhifallah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Goulart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Henna</namePart>
<namePart type="family">Kortelainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Pàmies</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deise</namePart>
<namePart type="family">Prina Dutra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valtteri</namePart>
<namePart type="family">Skantsi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lintang</namePart>
<namePart type="family">Sutawika</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sampo</namePart>
<namePart type="family">Pyysalo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth Workshop on Noisy User-generated Text (W-NUT 2022)</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Web-crawled datasets are known to be noisy, as they feature a wide range of language use covering both user-generated and professionally edited content as well as noise originating from the crawling process. This article presents one solution to reduce this noise by using automatic register (genre) identification -whether the texts are, e.g., forum discussions, lyrical or how-to pages. We apply the multilingual register identification model by Rönnqvist et al. (2021) and label the widely used Oscar dataset. Additionally, we evaluate the model against eight new languages, showing that the performance is comparable to previous findings on a restricted set of languages. Finally, we present and apply a machine learning method for further cleaning text files originating from Web crawls from remains of boilerplate and other elements not belonging to the main text of the Web page. The register labeled and cleaned dataset covers 351 million documents in 14 languages and is available at https://huggingface.co/datasets/TurkuNLP/register_oscar.</abstract>
<identifier type="citekey">laippala-etal-2022-towards</identifier>
<location>
<url>https://aclanthology.org/2022.wnut-1.23</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>215</start>
<end>221</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards better structured and less noisy Web data: Oscar with Register annotations
%A Laippala, Veronika
%A Salmela, Anna
%A Rönnqvist, Samuel
%A Aji, Alham Fikri
%A Chang, Li-Hsin
%A Dhifallah, Asma
%A Goulart, Larissa
%A Kortelainen, Henna
%A Pàmies, Marc
%A Prina Dutra, Deise
%A Skantsi, Valtteri
%A Sutawika, Lintang
%A Pyysalo, Sampo
%S Proceedings of the Eighth Workshop on Noisy User-generated Text (W-NUT 2022)
%D 2022
%8 October
%I Association for Computational Linguistics
%C Gyeongju, Republic of Korea
%F laippala-etal-2022-towards
%X Web-crawled datasets are known to be noisy, as they feature a wide range of language use covering both user-generated and professionally edited content as well as noise originating from the crawling process. This article presents one solution to reduce this noise by using automatic register (genre) identification -whether the texts are, e.g., forum discussions, lyrical or how-to pages. We apply the multilingual register identification model by Rönnqvist et al. (2021) and label the widely used Oscar dataset. Additionally, we evaluate the model against eight new languages, showing that the performance is comparable to previous findings on a restricted set of languages. Finally, we present and apply a machine learning method for further cleaning text files originating from Web crawls from remains of boilerplate and other elements not belonging to the main text of the Web page. The register labeled and cleaned dataset covers 351 million documents in 14 languages and is available at https://huggingface.co/datasets/TurkuNLP/register_oscar.
%U https://aclanthology.org/2022.wnut-1.23
%P 215-221
Markdown (Informal)
[Towards better structured and less noisy Web data: Oscar with Register annotations](https://aclanthology.org/2022.wnut-1.23) (Laippala et al., WNUT 2022)
ACL
- Veronika Laippala, Anna Salmela, Samuel Rönnqvist, Alham Fikri Aji, Li-Hsin Chang, Asma Dhifallah, Larissa Goulart, Henna Kortelainen, Marc Pàmies, Deise Prina Dutra, Valtteri Skantsi, Lintang Sutawika, and Sampo Pyysalo. 2022. Towards better structured and less noisy Web data: Oscar with Register annotations. In Proceedings of the Eighth Workshop on Noisy User-generated Text (W-NUT 2022), pages 215–221, Gyeongju, Republic of Korea. Association for Computational Linguistics.