@inproceedings{non-etal-2022-macocu,
title = "{M}a{C}o{C}u: Massive collection and curation of monolingual and bilingual data: focus on under-resourced languages",
author = "Ba{\~n}{\'o}n, Marta and
Espl{\`a}-Gomis, Miquel and
Forcada, Mikel L. and
Garc{\'\i}a-Romero, Cristian and
Kuzman, Taja and
Ljube{\v{s}}i{\'c}, Nikola and
van Noord, Rik and
Sempere, Leopoldo Pla and
Ram{\'\i}rez-S{\'a}nchez, Gema and
Rupnik, Peter and
Suchomel, V{\'\i}t and
Toral, Antonio and
van der Werff, Tobias and
Zaragoza, Jaume",
booktitle = "Proceedings of the 23rd Annual Conference of the European Association for Machine Translation",
month = jun,
year = "2022",
address = "Ghent, Belgium",
publisher = "European Association for Machine Translation",
url = "https://aclanthology.org/2022.eamt-1.41",
pages = "303--304",
abstract = "We introduce the project {``}MaCoCu: Massive collection and curation of monolingual and bilingual data: focus on under-resourced languages{''}, funded by the Connecting Europe Facility, which is aimed at building monolingual and parallel corpora for under-resourced European languages. The approach followed consists of crawling large amounts of textual data from carefully selected top-level domains of the Internet, and then applying a curation and enrichment pipeline. In addition to corpora, the project will release successive versions of the free/open-source web crawling and curation software used.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="non-etal-2022-macocu">
<titleInfo>
<title>MaCoCu: Massive collection and curation of monolingual and bilingual data: focus on under-resourced languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="family">Bañón</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cristian</namePart>
<namePart type="family">García-Romero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taja</namePart>
<namePart type="family">Kuzman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rik</namePart>
<namePart type="family">van Noord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leopoldo</namePart>
<namePart type="given">Pla</namePart>
<namePart type="family">Sempere</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gema</namePart>
<namePart type="family">Ramírez-Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Rupnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vít</namePart>
<namePart type="family">Suchomel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Toral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tobias</namePart>
<namePart type="family">van der Werff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaume</namePart>
<namePart type="family">Zaragoza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Annual Conference of the European Association for Machine Translation</title>
</titleInfo>
<originInfo>
<publisher>European Association for Machine Translation</publisher>
<place>
<placeTerm type="text">Ghent, Belgium</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce the project “MaCoCu: Massive collection and curation of monolingual and bilingual data: focus on under-resourced languages”, funded by the Connecting Europe Facility, which is aimed at building monolingual and parallel corpora for under-resourced European languages. The approach followed consists of crawling large amounts of textual data from carefully selected top-level domains of the Internet, and then applying a curation and enrichment pipeline. In addition to corpora, the project will release successive versions of the free/open-source web crawling and curation software used.</abstract>
<identifier type="citekey">non-etal-2022-macocu</identifier>
<location>
<url>https://aclanthology.org/2022.eamt-1.41</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>303</start>
<end>304</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MaCoCu: Massive collection and curation of monolingual and bilingual data: focus on under-resourced languages
%A Bañón, Marta
%A Esplà-Gomis, Miquel
%A Forcada, Mikel L.
%A García-Romero, Cristian
%A Kuzman, Taja
%A Ljubešić, Nikola
%A van Noord, Rik
%A Sempere, Leopoldo Pla
%A Ramírez-Sánchez, Gema
%A Rupnik, Peter
%A Suchomel, Vít
%A Toral, Antonio
%A van der Werff, Tobias
%A Zaragoza, Jaume
%S Proceedings of the 23rd Annual Conference of the European Association for Machine Translation
%D 2022
%8 June
%I European Association for Machine Translation
%C Ghent, Belgium
%F non-etal-2022-macocu
%X We introduce the project “MaCoCu: Massive collection and curation of monolingual and bilingual data: focus on under-resourced languages”, funded by the Connecting Europe Facility, which is aimed at building monolingual and parallel corpora for under-resourced European languages. The approach followed consists of crawling large amounts of textual data from carefully selected top-level domains of the Internet, and then applying a curation and enrichment pipeline. In addition to corpora, the project will release successive versions of the free/open-source web crawling and curation software used.
%U https://aclanthology.org/2022.eamt-1.41
%P 303-304
Markdown (Informal)
[MaCoCu: Massive collection and curation of monolingual and bilingual data: focus on under-resourced languages](https://aclanthology.org/2022.eamt-1.41) (Bañón et al., EAMT 2022)
ACL
- Marta Bañón, Miquel Esplà-Gomis, Mikel L. Forcada, Cristian García-Romero, Taja Kuzman, Nikola Ljubešić, Rik van Noord, Leopoldo Pla Sempere, Gema Ramírez-Sánchez, Peter Rupnik, Vít Suchomel, Antonio Toral, Tobias van der Werff, and Jaume Zaragoza. 2022. MaCoCu: Massive collection and curation of monolingual and bilingual data: focus on under-resourced languages. In Proceedings of the 23rd Annual Conference of the European Association for Machine Translation, pages 303–304, Ghent, Belgium. European Association for Machine Translation.