@inproceedings{jie-etal-2024-dialectmoe,
title = "{D}ialect{M}o{E}: An End-to-End Multi-Dialect Speech Recognition Model with Mixture-of-Experts",
author = "Jie, Zhou and
Shengxiang, Gao and
Zhengtao, Yu and
Ling, Dong and
Wenjun, Wang",
editor = "Sun, Maosong and
Liang, Jiye and
Han, Xianpei and
Liu, Zhiyuan and
He, Yulan",
booktitle = "Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)",
month = jul,
year = "2024",
address = "Taiyuan, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2024.ccl-1.89/",
pages = "1148--1159",
language = "eng",
abstract = "{\textquotedblleft}Dialect speech recognition has always been one of the challenges in Automatic Speech Recog-nition (ASR) systems. While lots of ASR systems perform well in Mandarin, their performancesignificantly drops when handling dialect speech. This is mainly due to the obvious differencesbetween dialects and Mandarin in pronunciation and the data scarcity of dialect speech. In thispaper, we propose DialectMoE, a Chinese multi-dialects speech recognition model based onMixture-of-Experts (MoE) in a low-resource conditions. Specifically, DialectMoE assigns inputsequences to a set of experts using a dynamic routing algorithm, with each expert potentiallytrained for a specific dialect. Subsequently, the outputs of these experts are combined to derivethe final output. Due to the similarities among dialects, distinct experts may offer assistance inrecognizing other dialects as well. Experimental results on the Datatang dialect public datasetshow that, compared with the baseline model, DialectMoE reduces Character Error Rate (CER)for Sichuan, Yunnan, Hubei and Henan dialects by 23.6{\%}, 32.6{\%}, 39.2{\%} and 35.09{\%} respec-tively. The proposed DialectMoE model demonstrates outstanding performance in multi-dialectsspeech recognition.{\textquotedblright}"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jie-etal-2024-dialectmoe">
<titleInfo>
<title>DialectMoE: An End-to-End Multi-Dialect Speech Recognition Model with Mixture-of-Experts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhou</namePart>
<namePart type="family">Jie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gao</namePart>
<namePart type="family">Shengxiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Zhengtao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dong</namePart>
<namePart type="family">Ling</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wang</namePart>
<namePart type="family">Wenjun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiye</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Taiyuan, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“Dialect speech recognition has always been one of the challenges in Automatic Speech Recog-nition (ASR) systems. While lots of ASR systems perform well in Mandarin, their performancesignificantly drops when handling dialect speech. This is mainly due to the obvious differencesbetween dialects and Mandarin in pronunciation and the data scarcity of dialect speech. In thispaper, we propose DialectMoE, a Chinese multi-dialects speech recognition model based onMixture-of-Experts (MoE) in a low-resource conditions. Specifically, DialectMoE assigns inputsequences to a set of experts using a dynamic routing algorithm, with each expert potentiallytrained for a specific dialect. Subsequently, the outputs of these experts are combined to derivethe final output. Due to the similarities among dialects, distinct experts may offer assistance inrecognizing other dialects as well. Experimental results on the Datatang dialect public datasetshow that, compared with the baseline model, DialectMoE reduces Character Error Rate (CER)for Sichuan, Yunnan, Hubei and Henan dialects by 23.6%, 32.6%, 39.2% and 35.09% respec-tively. The proposed DialectMoE model demonstrates outstanding performance in multi-dialectsspeech recognition.”</abstract>
<identifier type="citekey">jie-etal-2024-dialectmoe</identifier>
<location>
<url>https://aclanthology.org/2024.ccl-1.89/</url>
</location>
<part>
<date>2024-07</date>
<extent unit="page">
<start>1148</start>
<end>1159</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DialectMoE: An End-to-End Multi-Dialect Speech Recognition Model with Mixture-of-Experts
%A Jie, Zhou
%A Shengxiang, Gao
%A Zhengtao, Yu
%A Ling, Dong
%A Wenjun, Wang
%Y Sun, Maosong
%Y Liang, Jiye
%Y Han, Xianpei
%Y Liu, Zhiyuan
%Y He, Yulan
%S Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)
%D 2024
%8 July
%I Chinese Information Processing Society of China
%C Taiyuan, China
%G eng
%F jie-etal-2024-dialectmoe
%X “Dialect speech recognition has always been one of the challenges in Automatic Speech Recog-nition (ASR) systems. While lots of ASR systems perform well in Mandarin, their performancesignificantly drops when handling dialect speech. This is mainly due to the obvious differencesbetween dialects and Mandarin in pronunciation and the data scarcity of dialect speech. In thispaper, we propose DialectMoE, a Chinese multi-dialects speech recognition model based onMixture-of-Experts (MoE) in a low-resource conditions. Specifically, DialectMoE assigns inputsequences to a set of experts using a dynamic routing algorithm, with each expert potentiallytrained for a specific dialect. Subsequently, the outputs of these experts are combined to derivethe final output. Due to the similarities among dialects, distinct experts may offer assistance inrecognizing other dialects as well. Experimental results on the Datatang dialect public datasetshow that, compared with the baseline model, DialectMoE reduces Character Error Rate (CER)for Sichuan, Yunnan, Hubei and Henan dialects by 23.6%, 32.6%, 39.2% and 35.09% respec-tively. The proposed DialectMoE model demonstrates outstanding performance in multi-dialectsspeech recognition.”
%U https://aclanthology.org/2024.ccl-1.89/
%P 1148-1159
Markdown (Informal)
[DialectMoE: An End-to-End Multi-Dialect Speech Recognition Model with Mixture-of-Experts](https://aclanthology.org/2024.ccl-1.89/) (Jie et al., CCL 2024)
ACL