@inproceedings{kuzman-etal-2023-get,
title = "Get to Know Your Parallel Data: Performing {E}nglish Variety and Genre Classification over {M}a{C}o{C}u Corpora",
author = "Kuzman, Taja and
Rupnik, Peter and
Ljube{\v{s}}i{\'c}, Nikola",
editor = {Scherrer, Yves and
Jauhiainen, Tommi and
Ljube{\v{s}}i{\'c}, Nikola and
Nakov, Preslav and
Tiedemann, J{\"o}rg and
Zampieri, Marcos},
booktitle = "Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023)",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.vardial-1.9",
doi = "10.18653/v1/2023.vardial-1.9",
pages = "91--103",
abstract = "Collecting texts from the web enables a rapid creation of monolingual and parallel corpora of unprecedented size. However, unlike manually-collected corpora, authors and end users do not know which texts make up the web collections. In this work, we analyse the content of seven European parallel web corpora, collected from national top-level domains, by analysing the English variety and genre distribution in them. We develop and provide a lexicon-based British-American variety classifier, which we use to identify the English variety. In addition, we apply a Transformer-based genre classifier to corpora to analyse genre distribution and the interplay between genres and English varieties. The results reveal significant differences among the seven corpora in terms of different genre distribution and different preference for English varieties.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kuzman-etal-2023-get">
<titleInfo>
<title>Get to Know Your Parallel Data: Performing English Variety and Genre Classification over MaCoCu Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Taja</namePart>
<namePart type="family">Kuzman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Rupnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yves</namePart>
<namePart type="family">Scherrer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tommi</namePart>
<namePart type="family">Jauhiainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Collecting texts from the web enables a rapid creation of monolingual and parallel corpora of unprecedented size. However, unlike manually-collected corpora, authors and end users do not know which texts make up the web collections. In this work, we analyse the content of seven European parallel web corpora, collected from national top-level domains, by analysing the English variety and genre distribution in them. We develop and provide a lexicon-based British-American variety classifier, which we use to identify the English variety. In addition, we apply a Transformer-based genre classifier to corpora to analyse genre distribution and the interplay between genres and English varieties. The results reveal significant differences among the seven corpora in terms of different genre distribution and different preference for English varieties.</abstract>
<identifier type="citekey">kuzman-etal-2023-get</identifier>
<identifier type="doi">10.18653/v1/2023.vardial-1.9</identifier>
<location>
<url>https://aclanthology.org/2023.vardial-1.9</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>91</start>
<end>103</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Get to Know Your Parallel Data: Performing English Variety and Genre Classification over MaCoCu Corpora
%A Kuzman, Taja
%A Rupnik, Peter
%A Ljubešić, Nikola
%Y Scherrer, Yves
%Y Jauhiainen, Tommi
%Y Ljubešić, Nikola
%Y Nakov, Preslav
%Y Tiedemann, Jörg
%Y Zampieri, Marcos
%S Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023)
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F kuzman-etal-2023-get
%X Collecting texts from the web enables a rapid creation of monolingual and parallel corpora of unprecedented size. However, unlike manually-collected corpora, authors and end users do not know which texts make up the web collections. In this work, we analyse the content of seven European parallel web corpora, collected from national top-level domains, by analysing the English variety and genre distribution in them. We develop and provide a lexicon-based British-American variety classifier, which we use to identify the English variety. In addition, we apply a Transformer-based genre classifier to corpora to analyse genre distribution and the interplay between genres and English varieties. The results reveal significant differences among the seven corpora in terms of different genre distribution and different preference for English varieties.
%R 10.18653/v1/2023.vardial-1.9
%U https://aclanthology.org/2023.vardial-1.9
%U https://doi.org/10.18653/v1/2023.vardial-1.9
%P 91-103
Markdown (Informal)
[Get to Know Your Parallel Data: Performing English Variety and Genre Classification over MaCoCu Corpora](https://aclanthology.org/2023.vardial-1.9) (Kuzman et al., VarDial 2023)
ACL