@inproceedings{a-pirinen-2024-keeping,
title = "Keeping Up Appearances{---}or how to get all {U}ralic languages included into bleeding edge research and software: generate, convert, and {LLM} your way into multilingual datasets",
author = "Pirinen, Flammie A",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
Pirinen, Flammie and
Macias, Melany and
Crespo Avila, Mario},
booktitle = "Proceedings of the 9th International Workshop on Computational Linguistics for Uralic Languages",
month = nov,
year = "2024",
address = "Helsinki, Finland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.iwclul-1.16",
pages = "123--131",
abstract = "The current trends in natural language processing strongly favor large language models and generative AIs as the basis for everything. For Uralic languages that are not largely present in publicly available data on the Internet, this can be problematic. In the current computational linguistic scene, it is very important to have representation of your language in popular datasets. Languages that are included in well-known datasets are also included in shared tasks, products by large technology corporations, and so forth. This inclusion will become especially important for under-resourced, under-studied minority, and Indigenous languages, which will otherwise be easily forgotten. In this article, we present the resources that are often deemed necessary for digital presence of a language in the large language model obsessed world of today. We show that there are methods and tricks available to alleviate the problems with a lack of data and a lack of creators and annotators of the data, some more successful than others.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="a-pirinen-2024-keeping">
<titleInfo>
<title>Keeping Up Appearances—or how to get all Uralic languages included into bleeding edge research and software: generate, convert, and LLM your way into multilingual datasets</title>
</titleInfo>
<name type="personal">
<namePart type="given">Flammie A</namePart>
<namePart type="family">Pirinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 9th International Workshop on Computational Linguistics for Uralic Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flammie</namePart>
<namePart type="family">Pirinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Melany</namePart>
<namePart type="family">Macias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mario</namePart>
<namePart type="family">Crespo Avila</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Helsinki, Finland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The current trends in natural language processing strongly favor large language models and generative AIs as the basis for everything. For Uralic languages that are not largely present in publicly available data on the Internet, this can be problematic. In the current computational linguistic scene, it is very important to have representation of your language in popular datasets. Languages that are included in well-known datasets are also included in shared tasks, products by large technology corporations, and so forth. This inclusion will become especially important for under-resourced, under-studied minority, and Indigenous languages, which will otherwise be easily forgotten. In this article, we present the resources that are often deemed necessary for digital presence of a language in the large language model obsessed world of today. We show that there are methods and tricks available to alleviate the problems with a lack of data and a lack of creators and annotators of the data, some more successful than others.</abstract>
<identifier type="citekey">a-pirinen-2024-keeping</identifier>
<location>
<url>https://aclanthology.org/2024.iwclul-1.16</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>123</start>
<end>131</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Keeping Up Appearances—or how to get all Uralic languages included into bleeding edge research and software: generate, convert, and LLM your way into multilingual datasets
%A Pirinen, Flammie A
%Y Hämäläinen, Mika
%Y Pirinen, Flammie
%Y Macias, Melany
%Y Crespo Avila, Mario
%S Proceedings of the 9th International Workshop on Computational Linguistics for Uralic Languages
%D 2024
%8 November
%I Association for Computational Linguistics
%C Helsinki, Finland
%F a-pirinen-2024-keeping
%X The current trends in natural language processing strongly favor large language models and generative AIs as the basis for everything. For Uralic languages that are not largely present in publicly available data on the Internet, this can be problematic. In the current computational linguistic scene, it is very important to have representation of your language in popular datasets. Languages that are included in well-known datasets are also included in shared tasks, products by large technology corporations, and so forth. This inclusion will become especially important for under-resourced, under-studied minority, and Indigenous languages, which will otherwise be easily forgotten. In this article, we present the resources that are often deemed necessary for digital presence of a language in the large language model obsessed world of today. We show that there are methods and tricks available to alleviate the problems with a lack of data and a lack of creators and annotators of the data, some more successful than others.
%U https://aclanthology.org/2024.iwclul-1.16
%P 123-131
Markdown (Informal)
[Keeping Up Appearances—or how to get all Uralic languages included into bleeding edge research and software: generate, convert, and LLM your way into multilingual datasets](https://aclanthology.org/2024.iwclul-1.16) (Pirinen, IWCLUL 2024)
ACL