@inproceedings{yuqing-etal-2024-ji,
title = "基于字节对编码的端到端藏语语音识别研究(End-to-End {T}ibetan Speech Recognition Study Based on Byte Pair Coding)",
author = "Yuqing, Cai and
Chao, Wang and
Duojie, Renzeng and
Yulei, Zhu and
Jin, Zhang and
Tashi, Nyima",
editor = "Sun, Maosong and
Liang, Jiye and
Han, Xianpei and
Liu, Zhiyuan and
He, Yulan",
booktitle = "Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)",
month = jul,
year = "2024",
address = "Taiyuan, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2024.ccl-1.23/",
pages = "305--313",
language = "zho",
abstract = "{\textquotedblleft}针对藏语端到端语音识别研究中存在的建模单元不统一和识别效果不理想的问题,本文提出了一种BPE-Conformer-CTC/Attention端到端藏语语音识别方法。首先,该方法采用了字节对编码算法进行语音建模,通过反复合并出现频率最高的字符对,将文本分割成易于管理、有意义的单元,平衡建模单元的粒度,从而解决藏语语音识别中建模单元不统一的问题。其 次 , 使 用 了Conformer编码器 , 有效地融合了音频序列的全局和局部依赖关系,从而增强了模型的表征能力。最后,通过CTC/Attention联合解码策略,加速了对齐和解码过程,进而提高了识别效果的准确性和效率。在开源数据集XBMU-AMDO31和TIBMD@MUCI上的实验结果表明,所提出的BPE-Conformer-CTC/Attention模型分别取得了9.0{\%}和4.6{\%}的词错误率,相较于基线模型Transformer-CTC/Attention,词错误率分别相对降低了14.2{\%}和30.3{\%}。该研究方法为藏语端到端语音识别任务提供了一种有效的解决方案。{\textquotedblright}"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yuqing-etal-2024-ji">
<titleInfo>
<title>基于字节对编码的端到端藏语语音识别研究(End-to-End Tibetan Speech Recognition Study Based on Byte Pair Coding)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cai</namePart>
<namePart type="family">Yuqing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wang</namePart>
<namePart type="family">Chao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Renzeng</namePart>
<namePart type="family">Duojie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhu</namePart>
<namePart type="family">Yulei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhang</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nyima</namePart>
<namePart type="family">Tashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">zho</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiye</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Taiyuan, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“针对藏语端到端语音识别研究中存在的建模单元不统一和识别效果不理想的问题,本文提出了一种BPE-Conformer-CTC/Attention端到端藏语语音识别方法。首先,该方法采用了字节对编码算法进行语音建模,通过反复合并出现频率最高的字符对,将文本分割成易于管理、有意义的单元,平衡建模单元的粒度,从而解决藏语语音识别中建模单元不统一的问题。其 次 , 使 用 了Conformer编码器 , 有效地融合了音频序列的全局和局部依赖关系,从而增强了模型的表征能力。最后,通过CTC/Attention联合解码策略,加速了对齐和解码过程,进而提高了识别效果的准确性和效率。在开源数据集XBMU-AMDO31和TIBMD@MUCI上的实验结果表明,所提出的BPE-Conformer-CTC/Attention模型分别取得了9.0%和4.6%的词错误率,相较于基线模型Transformer-CTC/Attention,词错误率分别相对降低了14.2%和30.3%。该研究方法为藏语端到端语音识别任务提供了一种有效的解决方案。”</abstract>
<identifier type="citekey">yuqing-etal-2024-ji</identifier>
<location>
<url>https://aclanthology.org/2024.ccl-1.23/</url>
</location>
<part>
<date>2024-07</date>
<extent unit="page">
<start>305</start>
<end>313</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T 基于字节对编码的端到端藏语语音识别研究(End-to-End Tibetan Speech Recognition Study Based on Byte Pair Coding)
%A Yuqing, Cai
%A Chao, Wang
%A Duojie, Renzeng
%A Yulei, Zhu
%A Jin, Zhang
%A Tashi, Nyima
%Y Sun, Maosong
%Y Liang, Jiye
%Y Han, Xianpei
%Y Liu, Zhiyuan
%Y He, Yulan
%S Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)
%D 2024
%8 July
%I Chinese Information Processing Society of China
%C Taiyuan, China
%G zho
%F yuqing-etal-2024-ji
%X “针对藏语端到端语音识别研究中存在的建模单元不统一和识别效果不理想的问题,本文提出了一种BPE-Conformer-CTC/Attention端到端藏语语音识别方法。首先,该方法采用了字节对编码算法进行语音建模,通过反复合并出现频率最高的字符对,将文本分割成易于管理、有意义的单元,平衡建模单元的粒度,从而解决藏语语音识别中建模单元不统一的问题。其 次 , 使 用 了Conformer编码器 , 有效地融合了音频序列的全局和局部依赖关系,从而增强了模型的表征能力。最后,通过CTC/Attention联合解码策略,加速了对齐和解码过程,进而提高了识别效果的准确性和效率。在开源数据集XBMU-AMDO31和TIBMD@MUCI上的实验结果表明,所提出的BPE-Conformer-CTC/Attention模型分别取得了9.0%和4.6%的词错误率,相较于基线模型Transformer-CTC/Attention,词错误率分别相对降低了14.2%和30.3%。该研究方法为藏语端到端语音识别任务提供了一种有效的解决方案。”
%U https://aclanthology.org/2024.ccl-1.23/
%P 305-313
Markdown (Informal)
[基于字节对编码的端到端藏语语音识别研究(End-to-End Tibetan Speech Recognition Study Based on Byte Pair Coding)](https://aclanthology.org/2024.ccl-1.23/) (Yuqing et al., CCL 2024)
ACL