@inproceedings{gonzalez-agirre-etal-2024-building-data,
title = "Building a Data Infrastructure for a Mid-Resource Language: The Case of {C}atalan",
author = "Gonzalez-Agirre, Aitor and
Marimon, Montserrat and
Rodriguez-Penagos, Carlos and
Aula-Blasco, Javier and
Baucells, Irene and
Armentano-Oller, Carme and
Palomar-Giner, Jorge and
Kulebi, Baybars and
Villegas, Marta",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.231",
pages = "2556--2566",
abstract = "Current LLM-based applications are becoming steadily available for everyone with a reliable access to technology and the internet. These applications offer benefits to their users that leave those without access to them at a serious disadvantage. Given the vastly large amount of data needed to train LLMs, the gap between languages with access to such quantity of data and those without it is currently larger than ever. Aimed at saving this gap, the Aina Project was created to provide Catalan with the necessary resources to keep being relevant in the context of AI/NLP applications based on LLMs. We thus present a set of strategies to consider when improving technology support for a mid- or low-resource language, specially addressing sustainability of high-quality data acquisition and the challenges involved in the process. We also introduce a large amount of new annotated data for Catalan. Our hope is that those interested in replicating this work for another language can learn from what worked for us, the challenges that we faced, and the sometimes disheartening truth of working with mid- and low-resource languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gonzalez-agirre-etal-2024-building-data">
<titleInfo>
<title>Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aitor</namePart>
<namePart type="family">Gonzalez-Agirre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Montserrat</namePart>
<namePart type="family">Marimon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carlos</namePart>
<namePart type="family">Rodriguez-Penagos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Javier</namePart>
<namePart type="family">Aula-Blasco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irene</namePart>
<namePart type="family">Baucells</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carme</namePart>
<namePart type="family">Armentano-Oller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="family">Palomar-Giner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baybars</namePart>
<namePart type="family">Kulebi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="family">Villegas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Current LLM-based applications are becoming steadily available for everyone with a reliable access to technology and the internet. These applications offer benefits to their users that leave those without access to them at a serious disadvantage. Given the vastly large amount of data needed to train LLMs, the gap between languages with access to such quantity of data and those without it is currently larger than ever. Aimed at saving this gap, the Aina Project was created to provide Catalan with the necessary resources to keep being relevant in the context of AI/NLP applications based on LLMs. We thus present a set of strategies to consider when improving technology support for a mid- or low-resource language, specially addressing sustainability of high-quality data acquisition and the challenges involved in the process. We also introduce a large amount of new annotated data for Catalan. Our hope is that those interested in replicating this work for another language can learn from what worked for us, the challenges that we faced, and the sometimes disheartening truth of working with mid- and low-resource languages.</abstract>
<identifier type="citekey">gonzalez-agirre-etal-2024-building-data</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.231</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>2556</start>
<end>2566</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan
%A Gonzalez-Agirre, Aitor
%A Marimon, Montserrat
%A Rodriguez-Penagos, Carlos
%A Aula-Blasco, Javier
%A Baucells, Irene
%A Armentano-Oller, Carme
%A Palomar-Giner, Jorge
%A Kulebi, Baybars
%A Villegas, Marta
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F gonzalez-agirre-etal-2024-building-data
%X Current LLM-based applications are becoming steadily available for everyone with a reliable access to technology and the internet. These applications offer benefits to their users that leave those without access to them at a serious disadvantage. Given the vastly large amount of data needed to train LLMs, the gap between languages with access to such quantity of data and those without it is currently larger than ever. Aimed at saving this gap, the Aina Project was created to provide Catalan with the necessary resources to keep being relevant in the context of AI/NLP applications based on LLMs. We thus present a set of strategies to consider when improving technology support for a mid- or low-resource language, specially addressing sustainability of high-quality data acquisition and the challenges involved in the process. We also introduce a large amount of new annotated data for Catalan. Our hope is that those interested in replicating this work for another language can learn from what worked for us, the challenges that we faced, and the sometimes disheartening truth of working with mid- and low-resource languages.
%U https://aclanthology.org/2024.lrec-main.231
%P 2556-2566
Markdown (Informal)
[Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231) (Gonzalez-Agirre et al., LREC-COLING 2024)
ACL
- Aitor Gonzalez-Agirre, Montserrat Marimon, Carlos Rodriguez-Penagos, Javier Aula-Blasco, Irene Baucells, Carme Armentano-Oller, Jorge Palomar-Giner, Baybars Kulebi, and Marta Villegas. 2024. Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 2556–2566, Torino, Italia. ELRA and ICCL.