@inproceedings{mengke-etal-2024-tandem,
title = "Tandem Long-Short Duration-based Modeling for Automatic Speech Recognition",
author = "Mengke, Dalai and
Meng, Yan and
Mihajlik, Peter",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.sigul-1.40",
pages = "331--336",
abstract = "This study outlines our duration-dependent modeling experiments on limited-resource Hungarian speech recognition tasks. As it is well known, very short utterances pose significant challenges in automatic speech recognition due to the lack of context and other phenomena. In particular, we found that that the exclusion of shorter speech samples from fine-tuning for longer duration test data significantly improves the recognition rate measured on public Hungarian datasets, BEA-Base and CommonVoice (CV). Therefore we apply a tandem modeling approach, separate models are used for short and long duration test data. Our strategy improved the ability to recognize short utterances while maintaining recognition of long utterances efficiently, which led to a significant increase in overall recognition accuracy.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mengke-etal-2024-tandem">
<titleInfo>
<title>Tandem Long-Short Duration-based Modeling for Automatic Speech Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dalai</namePart>
<namePart type="family">Mengke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Meng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Mihajlik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maite</namePart>
<namePart type="family">Melero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study outlines our duration-dependent modeling experiments on limited-resource Hungarian speech recognition tasks. As it is well known, very short utterances pose significant challenges in automatic speech recognition due to the lack of context and other phenomena. In particular, we found that that the exclusion of shorter speech samples from fine-tuning for longer duration test data significantly improves the recognition rate measured on public Hungarian datasets, BEA-Base and CommonVoice (CV). Therefore we apply a tandem modeling approach, separate models are used for short and long duration test data. Our strategy improved the ability to recognize short utterances while maintaining recognition of long utterances efficiently, which led to a significant increase in overall recognition accuracy.</abstract>
<identifier type="citekey">mengke-etal-2024-tandem</identifier>
<location>
<url>https://aclanthology.org/2024.sigul-1.40</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>331</start>
<end>336</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tandem Long-Short Duration-based Modeling for Automatic Speech Recognition
%A Mengke, Dalai
%A Meng, Yan
%A Mihajlik, Peter
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F mengke-etal-2024-tandem
%X This study outlines our duration-dependent modeling experiments on limited-resource Hungarian speech recognition tasks. As it is well known, very short utterances pose significant challenges in automatic speech recognition due to the lack of context and other phenomena. In particular, we found that that the exclusion of shorter speech samples from fine-tuning for longer duration test data significantly improves the recognition rate measured on public Hungarian datasets, BEA-Base and CommonVoice (CV). Therefore we apply a tandem modeling approach, separate models are used for short and long duration test data. Our strategy improved the ability to recognize short utterances while maintaining recognition of long utterances efficiently, which led to a significant increase in overall recognition accuracy.
%U https://aclanthology.org/2024.sigul-1.40
%P 331-336
Markdown (Informal)
[Tandem Long-Short Duration-based Modeling for Automatic Speech Recognition](https://aclanthology.org/2024.sigul-1.40) (Mengke et al., SIGUL-WS 2024)
ACL