@inproceedings{wiechetek-etal-2024-ethical,
title = "The Ethical Question {--} Use of Indigenous Corpora for Large Language Models",
author = "Wiechetek, Linda and
Pirinen, Flammie A. and
Gaup, B{\o}rre and
Trosterud, Trond and
Kappfjell, Maja Lisa and
Moshagen, Sjur",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1383",
pages = "15922--15931",
abstract = "Creating language technology based on language data has become very popular with the recent advances of large language models and neural network technologies. This makes language resources very valuable, and especially in case of indigenous languages, the scarce resources are even more precious. Given the good results of simply fetching everything you can from the internet and feeding it to neural networks in English, there has been more work on doing the same for all languages. However, indigenous language resources as they are on the web are not comparable in that they would encode the most recent normativised language in all its aspects. This problematic is further due to not understanding the texts input to models or output by models by the people who work on them. Corpora also have intelligent property rights and copyrights that are not respected. Furthermore, the web is filled with the result of language model -generated texts. In this article we describe an ethical and sustainable way to work with indigenous languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wiechetek-etal-2024-ethical">
<titleInfo>
<title>The Ethical Question – Use of Indigenous Corpora for Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Linda</namePart>
<namePart type="family">Wiechetek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flammie</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Pirinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Børre</namePart>
<namePart type="family">Gaup</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trond</namePart>
<namePart type="family">Trosterud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="given">Lisa</namePart>
<namePart type="family">Kappfjell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sjur</namePart>
<namePart type="family">Moshagen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Creating language technology based on language data has become very popular with the recent advances of large language models and neural network technologies. This makes language resources very valuable, and especially in case of indigenous languages, the scarce resources are even more precious. Given the good results of simply fetching everything you can from the internet and feeding it to neural networks in English, there has been more work on doing the same for all languages. However, indigenous language resources as they are on the web are not comparable in that they would encode the most recent normativised language in all its aspects. This problematic is further due to not understanding the texts input to models or output by models by the people who work on them. Corpora also have intelligent property rights and copyrights that are not respected. Furthermore, the web is filled with the result of language model -generated texts. In this article we describe an ethical and sustainable way to work with indigenous languages.</abstract>
<identifier type="citekey">wiechetek-etal-2024-ethical</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1383</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>15922</start>
<end>15931</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Ethical Question – Use of Indigenous Corpora for Large Language Models
%A Wiechetek, Linda
%A Pirinen, Flammie A.
%A Gaup, Børre
%A Trosterud, Trond
%A Kappfjell, Maja Lisa
%A Moshagen, Sjur
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F wiechetek-etal-2024-ethical
%X Creating language technology based on language data has become very popular with the recent advances of large language models and neural network technologies. This makes language resources very valuable, and especially in case of indigenous languages, the scarce resources are even more precious. Given the good results of simply fetching everything you can from the internet and feeding it to neural networks in English, there has been more work on doing the same for all languages. However, indigenous language resources as they are on the web are not comparable in that they would encode the most recent normativised language in all its aspects. This problematic is further due to not understanding the texts input to models or output by models by the people who work on them. Corpora also have intelligent property rights and copyrights that are not respected. Furthermore, the web is filled with the result of language model -generated texts. In this article we describe an ethical and sustainable way to work with indigenous languages.
%U https://aclanthology.org/2024.lrec-main.1383
%P 15922-15931
Markdown (Informal)
[The Ethical Question – Use of Indigenous Corpora for Large Language Models](https://aclanthology.org/2024.lrec-main.1383) (Wiechetek et al., LREC-COLING 2024)
ACL
- Linda Wiechetek, Flammie A. Pirinen, Børre Gaup, Trond Trosterud, Maja Lisa Kappfjell, and Sjur Moshagen. 2024. The Ethical Question – Use of Indigenous Corpora for Large Language Models. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 15922–15931, Torino, Italia. ELRA and ICCL.