% Conference paper in the UNLP 2026 proceedings; the record URL points to aclanthology.org.
% Proper nouns and acronyms in title/booktitle are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitalisation.
@inproceedings{orlovskyi-etal-2026-scaling,
  title     = {Scaling {ASR} for {Hutsul} Dialect: Multi-Speaker Data Collection, Enhanced Transcription and Cross-Speaker Evaluation},
  author    = {Orlovskyi, Artem and
               Guzii, Zakhar and
               Onyshchenko, Bohdan and
               Kyslyi, Roman and
               Khomenko, Pavlo},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Fifth {Ukrainian} Natural Language Processing Conference ({UNLP} 2026)},
  month     = may,
  year      = {2026},
  address   = {Lviv, Ukraine},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.unlp-1.16/},
  pages     = {184--198},
  isbn      = {979-8-89176-359-3},
  abstract  = {We present a significant expansion of ASR resources for the Hutsul dialect of Ukrainian, building on prior work that established the first aligned speech corpus from a single literary source. In this work, we scale the dataset from a single speaker to a multi-speaker corpus comprising 40 speakers and 60.63 hours of audio drawn from diverse sources: YouTube channels (with author permissions), field recordings from native speakers, linguist student recordings, and regional radio broadcasts. To obtain reference transcriptions for audio without existing text, we introduce a novel RAG-enhanced correction pipeline: audio is first transcribed using ElevenLabs, then corrected through a RAG pipeline backed by a dialect-aware language model. We evaluate a fine-tuned ASR model across five distinct speaker datasets, demonstrating that while the model achieves strong performance on in-domain speakers (CER 3.24{\%}), cross-speaker generalization remains challenging, with CER ranging from 5.33{\%} to 17.24{\%} depending on speaker characteristics. All data, code, and models are released publicly to support further research on Ukrainian dialect speech technologies.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the conference paper; the host relatedItem below describes the proceedings volume. -->
  <mods ID="orlovskyi-etal-2026-scaling">
    <titleInfo>
      <title>Scaling ASR for Hutsul Dialect: Multi-Speaker Data Collection, Enhanced Transcription and Cross-Speaker Evaluation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Artem</namePart>
      <namePart type="family">Orlovskyi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zakhar</namePart>
      <namePart type="family">Guzii</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bohdan</namePart>
      <namePart type="family">Onyshchenko</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Roman</namePart>
      <namePart type="family">Kyslyi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pavlo</namePart>
      <namePart type="family">Khomenko</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings in which the paper appears, with its editor, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mariana</namePart>
        <namePart type="family">Romanyshyn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Lviv, Ukraine</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-359-3</identifier>
    </relatedItem>
    <abstract>We present a significant expansion of ASR resources for the Hutsul dialect of Ukrainian, building on prior work that established the first aligned speech corpus from a single literary source. In this work, we scale the dataset from a single speaker to a multi-speaker corpus comprising 40 speakers and 60.63 hours of audio drawn from diverse sources: YouTube channels (with author permissions), field recordings from native speakers, linguist student recordings, and regional radio broadcasts. To obtain reference transcriptions for audio without existing text, we introduce a novel RAG-enhanced correction pipeline: audio is first transcribed using ElevenLabs, then corrected through a RAG pipeline backed by a dialect-aware language model. We evaluate a fine-tuned ASR model across five distinct speaker datasets, demonstrating that while the model achieves strong performance on in-domain speakers (CER 3.24%), cross-speaker generalization remains challenging, with CER ranging from 5.33% to 17.24% depending on speaker characteristics. All data, code, and models are released publicly to support further research on Ukrainian dialect speech technologies.</abstract>
    <identifier type="citekey">orlovskyi-etal-2026-scaling</identifier>
    <location>
      <url>https://aclanthology.org/2026.unlp-1.16/</url>
    </location>
    <part>
      <date>2026-05</date>
      <extent unit="page">
        <start>184</start>
        <end>198</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Scaling ASR for Hutsul Dialect: Multi-Speaker Data Collection, Enhanced Transcription and Cross-Speaker Evaluation
%A Orlovskyi, Artem
%A Guzii, Zakhar
%A Onyshchenko, Bohdan
%A Kyslyi, Roman
%A Khomenko, Pavlo
%Y Romanyshyn, Mariana
%S Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)
%D 2026
%8 May
%I Association for Computational Linguistics
%C Lviv, Ukraine
%@ 979-8-89176-359-3
%F orlovskyi-etal-2026-scaling
%X We present a significant expansion of ASR resources for the Hutsul dialect of Ukrainian, building on prior work that established the first aligned speech corpus from a single literary source. In this work, we scale the dataset from a single speaker to a multi-speaker corpus comprising 40 speakers and 60.63 hours of audio drawn from diverse sources: YouTube channels (with author permissions), field recordings from native speakers, linguist student recordings, and regional radio broadcasts. To obtain reference transcriptions for audio without existing text, we introduce a novel RAG-enhanced correction pipeline: audio is first transcribed using ElevenLabs, then corrected through a RAG pipeline backed by a dialect-aware language model. We evaluate a fine-tuned ASR model across five distinct speaker datasets, demonstrating that while the model achieves strong performance on in-domain speakers (CER 3.24%), cross-speaker generalization remains challenging, with CER ranging from 5.33% to 17.24% depending on speaker characteristics. All data, code, and models are released publicly to support further research on Ukrainian dialect speech technologies.
%U https://aclanthology.org/2026.unlp-1.16/
%P 184-198
Markdown (Informal)
[Scaling ASR for Hutsul Dialect: Multi-Speaker Data Collection, Enhanced Transcription and Cross-Speaker Evaluation](https://aclanthology.org/2026.unlp-1.16/) (Orlovskyi et al., UNLP 2026)
ACL