@inproceedings{lan-etal-2025-challenges,
title = "Challenges and Limitations of the Multilingual Pre-trained Model Whisper on Low-Resource Languages: A Case Study of {H}akka Speech Recognition",
author = "Lan, Pei-Chi and
Chiang, Hsin-Tien and
Lin, Ting-Chun and
Su, Ming-Hsiang",
editor = "Chang, Kai-Wei and
Lu, Ke-Han and
Yang, Chih-Kai and
Tam, Zhi-Rui and
Chang, Wen-Yu and
Wang, Chung-Che",
booktitle = "Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025)",
month = nov,
year = "2025",
address = "National Taiwan University, Taipei City, Taiwan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.rocling-main.62/",
pages = "512--517",
ISBN = "979-8-89176-379-1",
abstract = "This study investigates the practical performance and limitations of the multilingual pre-trained model Whisper in low-resource language settings, using a Hakka speech recognition challenge as a case study. In the preliminary phase, our team (Group G) achieved official scores of 75.58{\%} in Character Error Rate (CER) and 100.97{\%} in Syllable Error Rate (SER). However, in the final phase, both CER and Word Error Rate (WER) reached 100{\%}. Through a retrospective analysis of system design and implementation, we identified three major sources of failure: (1) improper handling of long utterances, where only the first segment was decoded, causing content truncation; (2) inconsistent language prompting, fixed to ``Chinese'' instead of the Hakka target; and (3) lack of systematic verification in data alignment and submission generation, combined with inadequate evaluation setup.Based on these findings, we propose a set of practical guidelines covering long-utterance processing, language consistency checking, and data submission validation. The results highlight that in low-resource speech recognition tasks, poor data quality or flawed workflow design can cause severe degradation of model performance. This study underscores the importance of robust data and process management in ASR system development and provides concrete insights for future improvements and reproducibility."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lan-etal-2025-challenges">
<titleInfo>
<title>Challenges and Limitations of the Multilingual Pre-trained Model Whisper on Low-Resource Languages: A Case Study of Hakka Speech Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pei-Chi</namePart>
<namePart type="family">Lan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Tien</namePart>
<namePart type="family">Chiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ting-Chun</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming-Hsiang</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ke-Han</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chih-Kai</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhi-Rui</namePart>
<namePart type="family">Tam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wen-Yu</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chung-Che</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">National Taiwan University, Taipei City, Taiwan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-379-1</identifier>
</relatedItem>
<abstract>This study investigates the practical performance and limitations of the multilingual pre-trained model Whisper in low-resource language settings, using a Hakka speech recognition challenge as a case study. In the preliminary phase, our team (Group G) achieved official scores of 75.58% in Character Error Rate (CER) and 100.97% in Syllable Error Rate (SER). However, in the final phase, both CER and Word Error Rate (WER) reached 100%. Through a retrospective analysis of system design and implementation, we identified three major sources of failure: (1) improper handling of long utterances, where only the first segment was decoded, causing content truncation; (2) inconsistent language prompting, fixed to “Chinese” instead of the Hakka target; and (3) lack of systematic verification in data alignment and submission generation, combined with inadequate evaluation setup. Based on these findings, we propose a set of practical guidelines covering long-utterance processing, language consistency checking, and data submission validation. The results highlight that in low-resource speech recognition tasks, poor data quality or flawed workflow design can cause severe degradation of model performance. This study underscores the importance of robust data and process management in ASR system development and provides concrete insights for future improvements and reproducibility.</abstract>
<identifier type="citekey">lan-etal-2025-challenges</identifier>
<location>
<url>https://aclanthology.org/2025.rocling-main.62/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>512</start>
<end>517</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Challenges and Limitations of the Multilingual Pre-trained Model Whisper on Low-Resource Languages: A Case Study of Hakka Speech Recognition
%A Lan, Pei-Chi
%A Chiang, Hsin-Tien
%A Lin, Ting-Chun
%A Su, Ming-Hsiang
%Y Chang, Kai-Wei
%Y Lu, Ke-Han
%Y Yang, Chih-Kai
%Y Tam, Zhi-Rui
%Y Chang, Wen-Yu
%Y Wang, Chung-Che
%S Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025)
%D 2025
%8 November
%I Association for Computational Linguistics
%C National Taiwan University, Taipei City, Taiwan
%@ 979-8-89176-379-1
%F lan-etal-2025-challenges
%X This study investigates the practical performance and limitations of the multilingual pre-trained model Whisper in low-resource language settings, using a Hakka speech recognition challenge as a case study. In the preliminary phase, our team (Group G) achieved official scores of 75.58% in Character Error Rate (CER) and 100.97% in Syllable Error Rate (SER). However, in the final phase, both CER and Word Error Rate (WER) reached 100%. Through a retrospective analysis of system design and implementation, we identified three major sources of failure: (1) improper handling of long utterances, where only the first segment was decoded, causing content truncation; (2) inconsistent language prompting, fixed to “Chinese” instead of the Hakka target; and (3) lack of systematic verification in data alignment and submission generation, combined with inadequate evaluation setup. Based on these findings, we propose a set of practical guidelines covering long-utterance processing, language consistency checking, and data submission validation. The results highlight that in low-resource speech recognition tasks, poor data quality or flawed workflow design can cause severe degradation of model performance. This study underscores the importance of robust data and process management in ASR system development and provides concrete insights for future improvements and reproducibility.
%U https://aclanthology.org/2025.rocling-main.62/
%P 512-517
Markdown (Informal)
[Challenges and Limitations of the Multilingual Pre-trained Model Whisper on Low-Resource Languages: A Case Study of Hakka Speech Recognition](https://aclanthology.org/2025.rocling-main.62/) (Lan et al., ROCLING 2025)