% Conference paper published in the Findings of NAACL 2025 proceedings
% (URL path 2025.findings-naacl.373 on aclanthology.org).
% Case-sensitive words in the title are brace-protected as whole words so that
% sentence-casing bibliography styles leave them intact.
@inproceedings{purason-etal-2025-llms,
  title     = {{LLMs} for Extremely Low-Resource {Finno-Ugric} Languages},
  author    = {Purason, Taido and
               Kuulmets, Hele-Andra and
               Fishel, Mark},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Findings of the Association for Computational Linguistics: {NAACL} 2025},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-naacl.373/},
  doi       = {10.18653/v1/2025.findings-naacl.373},
  pages     = {6677--6697},
  isbn      = {979-8-89176-195-7},
  abstract  = {The advancement of large language models (LLMs) has predominantly focused on high-resource languages, leaving low-resource languages, such as those in the Finno-Ugric family, significantly underrepresented. This paper addresses this gap by focusing on V{\~o}ro, Livonian, and Komi. We cover almost the entire cycle of LLM creation, from data collection to instruction tuning and evaluation. Our contributions include developing multilingual base and instruction-tuned models; creating evaluation benchmarks, including the smugri-MT-bench multi-turn conversational benchmark; and conducting human evaluation. We intend for this work to promote linguistic diversity, ensuring that lesser-resourced languages can benefit from advancements in NLP.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey purason-etal-2025-llms.
     The relatedItem of type "host" describes the proceedings volume
     (title, editors, publisher, place, ISBN) that contains the paper. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="purason-etal-2025-llms">
    <titleInfo>
      <title>LLMs for Extremely Low-Resource Finno-Ugric Languages</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Taido</namePart>
      <namePart type="family">Purason</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hele-Andra</namePart>
      <namePart type="family">Kuulmets</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mark</namePart>
      <namePart type="family">Fishel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-195-7</identifier>
    </relatedItem>
    <abstract>The advancement of large language models (LLMs) has predominantly focused on high-resource languages, leaving low-resource languages, such as those in the Finno-Ugric family, significantly underrepresented. This paper addresses this gap by focusing on Võro, Livonian, and Komi. We cover almost the entire cycle of LLM creation, from data collection to instruction tuning and evaluation. Our contributions include developing multilingual base and instruction-tuned models; creating evaluation benchmarks, including the smugri-MT-bench multi-turn conversational benchmark; and conducting human evaluation. We intend for this work to promote linguistic diversity, ensuring that lesser-resourced languages can benefit from advancements in NLP.</abstract>
    <identifier type="citekey">purason-etal-2025-llms</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-naacl.373</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-naacl.373/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>6677</start>
        <end>6697</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T LLMs for Extremely Low-Resource Finno-Ugric Languages
%A Purason, Taido
%A Kuulmets, Hele-Andra
%A Fishel, Mark
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F purason-etal-2025-llms
%X The advancement of large language models (LLMs) has predominantly focused on high-resource languages, leaving low-resource languages, such as those in the Finno-Ugric family, significantly underrepresented. This paper addresses this gap by focusing on Võro, Livonian, and Komi. We cover almost the entire cycle of LLM creation, from data collection to instruction tuning and evaluation. Our contributions include developing multilingual base and instruction-tuned models; creating evaluation benchmarks, including the smugri-MT-bench multi-turn conversational benchmark; and conducting human evaluation. We intend for this work to promote linguistic diversity, ensuring that lesser-resourced languages can benefit from advancements in NLP.
%R 10.18653/v1/2025.findings-naacl.373
%U https://aclanthology.org/2025.findings-naacl.373/
%U https://doi.org/10.18653/v1/2025.findings-naacl.373
%P 6677-6697
Markdown (Informal)
[LLMs for Extremely Low-Resource Finno-Ugric Languages](https://aclanthology.org/2025.findings-naacl.373/) (Purason et al., Findings 2025)
ACL
- Taido Purason, Hele-Andra Kuulmets, and Mark Fishel. 2025. LLMs for Extremely Low-Resource Finno-Ugric Languages. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 6677–6697, Albuquerque, New Mexico. Association for Computational Linguistics.