@inproceedings{getman-etal-2025-towards,
title = "Towards large-scale speech foundation models for a low-resource minority language",
author = "Getman, Yaroslav and
Gr{\'o}sz, Tam{\'a}s and
Hiovain-Asikainen, Katri and
Lehtonen, Tommi and
Kurimo, Mikko",
editor = "Johansson, Richard and
Stymne, Sara",
booktitle = "Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)",
month = mar,
year = "2025",
address = "Tallinn, Estonia",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2025.nodalida-1.19/",
pages = "192--200",
    isbn = "978-9908-53-109-0",
abstract = "Modern ASR systems require massive amounts of training data. While ASR training data for most languages are scarce and expensive to transcribe, a practical solution is to collect huge amounts of raw untranscribed speech and pre-train the ASR model in a self-supervised manner. Unfortunately, for many low-resource minority languages, even untranscribed speech data are scarce. In this paper, we propose a solution for the Northern S{\'a}mi language with 22,400 hours of speech extracted from the Finnish radio and television archives. We evaluated the model performance with different decoding algorithms and examined the models' internal behavior with interpretation-based techniques."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="getman-etal-2025-towards">
<titleInfo>
<title>Towards large-scale speech foundation models for a low-resource minority language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaroslav</namePart>
<namePart type="family">Getman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tamás</namePart>
<namePart type="family">Grósz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katri</namePart>
<namePart type="family">Hiovain-Asikainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tommi</namePart>
<namePart type="family">Lehtonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikko</namePart>
<namePart type="family">Kurimo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Johansson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Stymne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tallinn, Estonia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-9908-53-109-0</identifier>
</relatedItem>
<abstract>Modern ASR systems require massive amounts of training data. While ASR training data for most languages are scarce and expensive to transcribe, a practical solution is to collect huge amounts of raw untranscribed speech and pre-train the ASR model in a self-supervised manner. Unfortunately, for many low-resource minority languages, even untranscribed speech data are scarce. In this paper, we propose a solution for the Northern Sámi language with 22,400 hours of speech extracted from the Finnish radio and television archives. We evaluated the model performance with different decoding algorithms and examined the models’ internal behavior with interpretation-based techniques.</abstract>
<identifier type="citekey">getman-etal-2025-towards</identifier>
<location>
<url>https://aclanthology.org/2025.nodalida-1.19/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>192</start>
<end>200</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards large-scale speech foundation models for a low-resource minority language
%A Getman, Yaroslav
%A Grósz, Tamás
%A Hiovain-Asikainen, Katri
%A Lehtonen, Tommi
%A Kurimo, Mikko
%Y Johansson, Richard
%Y Stymne, Sara
%S Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-109-0
%F getman-etal-2025-towards
%X Modern ASR systems require massive amounts of training data. While ASR training data for most languages are scarce and expensive to transcribe, a practical solution is to collect huge amounts of raw untranscribed speech and pre-train the ASR model in a self-supervised manner. Unfortunately, for many low-resource minority languages, even untranscribed speech data are scarce. In this paper, we propose a solution for the Northern Sámi language with 22,400 hours of speech extracted from the Finnish radio and television archives. We evaluated the model performance with different decoding algorithms and examined the models’ internal behavior with interpretation-based techniques.
%U https://aclanthology.org/2025.nodalida-1.19/
%P 192-200
Markdown (Informal)
[Towards large-scale speech foundation models for a low-resource minority language](https://aclanthology.org/2025.nodalida-1.19/) (Getman et al., NoDaLiDa 2025)
ACL
- Yaroslav Getman, Tamás Grósz, Katri Hiovain-Asikainen, Tommi Lehtonen, and Mikko Kurimo. 2025. Towards large-scale speech foundation models for a low-resource minority language. In Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), pages 192–200, Tallinn, Estonia. University of Tartu Library.