@inproceedings{jones-etal-2024-comparing,
title = "Comparing {K}aldi-Based Pipeline Elpis and Whisper for {\v{C}}akavian Transcription",
author = "Jones, Austin and
Zhang, Shulin and
Hale, John and
Renwick, Margaret and
Vrzic, Zvjezdana and
Langston, Keith",
editor = "Serikov, Oleg and
Voloshina, Ekaterina and
Postnikova, Anna and
Muradoglu, Saliha and
Le Ferrand, Eric and
Klyachko, Elena and
Vylomova, Ekaterina and
Shavrina, Tatiana and
Tyers, Francis",
booktitle = "Proceedings of the 3rd Workshop on NLP Applications to Field Linguistics (Field Matters 2024)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.fieldmatters-1.8/",
doi = "10.18653/v1/2024.fieldmatters-1.8",
pages = "61--68",
abstract = "Automatic speech recognition (ASR) has the potential to accelerate the documentation of endangered languages, but the dearth of resources poses a major obstacle. {\v{C}}akavian, an endangered variety spoken primarily in Croatia, is a case in point, lacking transcription tools that could aid documentation efforts. We compare training a new ASR model on a limited dataset using the Kaldi-based ASR pipeline Elpis to using the same dataset to adapt the transformer-based pretrained multilingual model Whisper, to determine which is more practical in the documentation context. Results show that Whisper outperformed Elpis, achieving the lowest average Word Error Rate (WER) of 57.3{\%} and median WER of 35.48{\%}. While Elpis offers a less computationally expensive model and friendlier user experience, Whisper appears better at adapting to our collected {\v{C}}akavian data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jones-etal-2024-comparing">
<titleInfo>
<title>Comparing Kaldi-Based Pipeline Elpis and Whisper for Čakavian Transcription</title>
</titleInfo>
<name type="personal">
<namePart type="given">Austin</namePart>
<namePart type="family">Jones</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shulin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Hale</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Margaret</namePart>
<namePart type="family">Renwick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zvjezdana</namePart>
<namePart type="family">Vrzic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keith</namePart>
<namePart type="family">Langston</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on NLP Applications to Field Linguistics (Field Matters 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Serikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Voloshina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Postnikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saliha</namePart>
<namePart type="family">Muradoglu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Le Ferrand</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Klyachko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatiana</namePart>
<namePart type="family">Shavrina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Tyers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic speech recognition (ASR) has the potential to accelerate the documentation of endangered languages, but the dearth of resources poses a major obstacle. Čakavian, an endangered variety spoken primarily in Croatia, is a case in point, lacking transcription tools that could aid documentation efforts. We compare training a new ASR model on a limited dataset using the Kaldi-based ASR pipeline Elpis to using the same dataset to adapt the transformer-based pretrained multilingual model Whisper, to determine which is more practical in the documentation context. Results show that Whisper outperformed Elpis, achieving the lowest average Word Error Rate (WER) of 57.3% and median WER of 35.48%. While Elpis offers a less computationally expensive model and friendlier user experience, Whisper appears better at adapting to our collected Čakavian data.</abstract>
<identifier type="citekey">jones-etal-2024-comparing</identifier>
<identifier type="doi">10.18653/v1/2024.fieldmatters-1.8</identifier>
<location>
<url>https://aclanthology.org/2024.fieldmatters-1.8/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>61</start>
<end>68</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Comparing Kaldi-Based Pipeline Elpis and Whisper for Čakavian Transcription
%A Jones, Austin
%A Zhang, Shulin
%A Hale, John
%A Renwick, Margaret
%A Vrzic, Zvjezdana
%A Langston, Keith
%Y Serikov, Oleg
%Y Voloshina, Ekaterina
%Y Postnikova, Anna
%Y Muradoglu, Saliha
%Y Le Ferrand, Eric
%Y Klyachko, Elena
%Y Vylomova, Ekaterina
%Y Shavrina, Tatiana
%Y Tyers, Francis
%S Proceedings of the 3rd Workshop on NLP Applications to Field Linguistics (Field Matters 2024)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F jones-etal-2024-comparing
%X Automatic speech recognition (ASR) has the potential to accelerate the documentation of endangered languages, but the dearth of resources poses a major obstacle. Čakavian, an endangered variety spoken primarily in Croatia, is a case in point, lacking transcription tools that could aid documentation efforts. We compare training a new ASR model on a limited dataset using the Kaldi-based ASR pipeline Elpis to using the same dataset to adapt the transformer-based pretrained multilingual model Whisper, to determine which is more practical in the documentation context. Results show that Whisper outperformed Elpis, achieving the lowest average Word Error Rate (WER) of 57.3% and median WER of 35.48%. While Elpis offers a less computationally expensive model and friendlier user experience, Whisper appears better at adapting to our collected Čakavian data.
%R 10.18653/v1/2024.fieldmatters-1.8
%U https://aclanthology.org/2024.fieldmatters-1.8/
%U https://doi.org/10.18653/v1/2024.fieldmatters-1.8
%P 61-68
Markdown (Informal)
[Comparing Kaldi-Based Pipeline Elpis and Whisper for Čakavian Transcription](https://aclanthology.org/2024.fieldmatters-1.8/) (Jones et al., FieldMatters 2024)
ACL