% ACL Anthology record (RAIL 2023). Braces in the title protect proper nouns and
% acronyms from sentence-casing styles; whole words are braced (not single
% letters, which can break kerning/hyphenation in some styles).
@inproceedings{lastrucci-etal-2023-preparing,
    title = "Preparing the Vuk{'}uzenzele and {ZA-gov-multilingual} {South African} multilingual corpora",
    author = "Lastrucci, Richard and
      Rajab, Jenalea and
      Shingange, Matimba and
      Njini, Daniel and
      Marivate, Vukosi",
    editor = "Mabuya, Rooweither and
      Mthobela, Don and
      Setaka, Mmasibidi and
      Van Zaanen, Menno",
    booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.rail-1.3",
    doi = "10.18653/v1/2023.rail-1.3",
    pages = "18--25",
    abstract = "This paper introduces two multilingual government themed corpora in various South African languages. The corpora were collected by gathering South African government speeches (ZA-gov-multilingual), as well as the South African Government newspaper (Vuk{'}uzenzele), that are translated into all 11 South African official languages. The corpora can be used for a myriad of downstream NLP tasks. The corpora were created to allow researchers to study the language used in South African government publications, with a focus on understanding how South African government officials communicate with their constituents. In this paper we highlight the process of gathering, cleaning and making available the corpora. We create parallel sentence corpora for Neural Machine Translation tasks using Language-Agnostic Sentence Representations (LASER) embeddings. With these aligned sentences we then provide NMT benchmarks for 9 indigenous languages by fine-tuning massively multilingual pre-trained language model.",
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 bibliographic record for the same paper as the BibTeX entry
     in this file (ACL Anthology export; ID matches the BibTeX citekey). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lastrucci-etal-2023-preparing">
<titleInfo>
<title>Preparing the Vuk’uzenzele and ZA-gov-multilingual South African multilingual corpora</title>
</titleInfo>
<!-- Five authors, given/family name parts split per MODS convention. -->
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Lastrucci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jenalea</namePart>
<namePart type="family">Rajab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matimba</namePart>
<namePart type="family">Shingange</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Njini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vukosi</namePart>
<namePart type="family">Marivate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume this paper appears in, with its
     editors, publisher, and place of publication. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rooweither</namePart>
<namePart type="family">Mabuya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Don</namePart>
<namePart type="family">Mthobela</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mmasibidi</namePart>
<namePart type="family">Setaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Menno</namePart>
<namePart type="family">Van Zaanen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces two multilingual government themed corpora in various South African languages. The corpora were collected by gathering South African government speeches (ZA-gov-multilingual), as well as the South African Government newspaper (Vuk’uzenzele), that are translated into all 11 South African official languages. The corpora can be used for a myriad of downstream NLP tasks. The corpora were created to allow researchers to study the language used in South African government publications, with a focus on understanding how South African government officials communicate with their constituents. In this paper we highlight the process of gathering, cleaning and making available the corpora. We create parallel sentence corpora for Neural Machine Translation tasks using Language-Agnostic Sentence Representations (LASER) embeddings. With these aligned sentences we then provide NMT benchmarks for 9 indigenous languages by fine-tuning massively multilingual pre-trained language model.</abstract>
<identifier type="citekey">lastrucci-etal-2023-preparing</identifier>
<identifier type="doi">10.18653/v1/2023.rail-1.3</identifier>
<location>
<url>https://aclanthology.org/2023.rail-1.3</url>
</location>
<!-- Issue date and page extent within the host proceedings. -->
<part>
<date>2023-05</date>
<extent unit="page">
<start>18</start>
<end>25</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Preparing the Vuk’uzenzele and ZA-gov-multilingual South African multilingual corpora
%A Lastrucci, Richard
%A Rajab, Jenalea
%A Shingange, Matimba
%A Njini, Daniel
%A Marivate, Vukosi
%Y Mabuya, Rooweither
%Y Mthobela, Don
%Y Setaka, Mmasibidi
%Y Van Zaanen, Menno
%S Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F lastrucci-etal-2023-preparing
%X This paper introduces two multilingual government themed corpora in various South African languages. The corpora were collected by gathering South African government speeches (ZA-gov-multilingual), as well as the South African Government newspaper (Vuk’uzenzele), that are translated into all 11 South African official languages. The corpora can be used for a myriad of downstream NLP tasks. The corpora were created to allow researchers to study the language used in South African government publications, with a focus on understanding how South African government officials communicate with their constituents. In this paper we highlight the process of gathering, cleaning and making available the corpora. We create parallel sentence corpora for Neural Machine Translation tasks using Language-Agnostic Sentence Representations (LASER) embeddings. With these aligned sentences we then provide NMT benchmarks for 9 indigenous languages by fine-tuning massively multilingual pre-trained language model.
%R 10.18653/v1/2023.rail-1.3
%U https://aclanthology.org/2023.rail-1.3
%U https://doi.org/10.18653/v1/2023.rail-1.3
%P 18-25
Markdown (Informal)
[Preparing the Vuk’uzenzele and ZA-gov-multilingual South African multilingual corpora](https://aclanthology.org/2023.rail-1.3) (Lastrucci et al., RAIL 2023)
ACL