@inproceedings{chen-etal-2023-ji,
title = "基于多尺度建模的端到端自动语音识别方法(An End-to-End Automatic Speech Recognition Method Based on Multiscale Modeling)",
author = "Chen, Hao and
Zhang, Runlai and
Zhang, Yuhao and
Gao, Chenghao and
Xu, Chen and
Ma, Anxiang and
Xiao, Tong and
Zhu, Jingbo",
editor = "Sun, Maosong and
Qin, Bing and
Qiu, Xipeng and
Jiang, Jing and
Han, Xianpei",
booktitle = "Proceedings of the 22nd Chinese National Conference on Computational Linguistics",
month = aug,
year = "2023",
address = "Harbin, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2023.ccl-1.41",
pages = "468--479",
abstract = "{``}近年来,基于深度学习的端到端自动语音识别模型直接对语音和文本进行建模,结构简单且性能上也具有显著优势,逐渐成为主流。然而,由于连续的语音信号与离散的文本在长度及表示尺度上存在巨大差异,二者间的模态鸿沟问题是该类任务一直存在的困扰。为解决该问题,本文提出了多尺度语音识别建模方法,该方法从利用细粒度分布知识的角度出发,建立多个不同尺度形式的文本信息,将特征序列从细粒度的低层次序列逐步对齐预测出文本序列。这种逐级预测的方式能够有效降低预测难度,缓解模态鸿沟带来的影响,并通过融合不同尺度下特征,提高语料信息的丰富性与完整性,进一步增强模型推理能力。本文在LibriSpeech小规模、大规模和TEDLIUM2数据集上实验,相比基线系统词错误率平均降低1.7、0.45和0.76,验证了方法的有效性。{''}",
language = "Chinese",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2023-ji">
<titleInfo>
<title>基于多尺度建模的端到端自动语音识别方法(An End-to-End Automatic Speech Recognition Method Based on Multiscale Modeling)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Runlai</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuhao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenghao</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anxiang</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingbo</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">Chinese</languageTerm>
<languageTerm type="code" authority="iso639-2b">chi</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Chinese National Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bing</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xipeng</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Harbin, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“近年来,基于深度学习的端到端自动语音识别模型直接对语音和文本进行建模,结构简单且性能上也具有显著优势,逐渐成为主流。然而,由于连续的语音信号与离散的文本在长度及表示尺度上存在巨大差异,二者间的模态鸿沟问题是该类任务一直存在的困扰。为解决该问题,本文提出了多尺度语音识别建模方法,该方法从利用细粒度分布知识的角度出发,建立多个不同尺度形式的文本信息,将特征序列从细粒度的低层次序列逐步对齐预测出文本序列。这种逐级预测的方式能够有效降低预测难度,缓解模态鸿沟带来的影响,并通过融合不同尺度下特征,提高语料信息的丰富性与完整性,进一步增强模型推理能力。本文在LibriSpeech小规模、大规模和TEDLIUM2数据集上实验,相比基线系统词错误率平均降低1.7、0.45和0.76,验证了方法的有效性。”</abstract>
<identifier type="citekey">chen-etal-2023-ji</identifier>
<location>
<url>https://aclanthology.org/2023.ccl-1.41</url>
</location>
<part>
<date>2023-08</date>
<extent unit="page">
<start>468</start>
<end>479</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T 基于多尺度建模的端到端自动语音识别方法(An End-to-End Automatic Speech Recognition Method Based on Multiscale Modeling)
%A Chen, Hao
%A Zhang, Runlai
%A Zhang, Yuhao
%A Gao, Chenghao
%A Xu, Chen
%A Ma, Anxiang
%A Xiao, Tong
%A Zhu, Jingbo
%Y Sun, Maosong
%Y Qin, Bing
%Y Qiu, Xipeng
%Y Jiang, Jing
%Y Han, Xianpei
%S Proceedings of the 22nd Chinese National Conference on Computational Linguistics
%D 2023
%8 August
%I Chinese Information Processing Society of China
%C Harbin, China
%G Chinese
%F chen-etal-2023-ji
%X “近年来,基于深度学习的端到端自动语音识别模型直接对语音和文本进行建模,结构简单且性能上也具有显著优势,逐渐成为主流。然而,由于连续的语音信号与离散的文本在长度及表示尺度上存在巨大差异,二者间的模态鸿沟问题是该类任务一直存在的困扰。为解决该问题,本文提出了多尺度语音识别建模方法,该方法从利用细粒度分布知识的角度出发,建立多个不同尺度形式的文本信息,将特征序列从细粒度的低层次序列逐步对齐预测出文本序列。这种逐级预测的方式能够有效降低预测难度,缓解模态鸿沟带来的影响,并通过融合不同尺度下特征,提高语料信息的丰富性与完整性,进一步增强模型推理能力。本文在LibriSpeech小规模、大规模和TEDLIUM2数据集上实验,相比基线系统词错误率平均降低1.7、0.45和0.76,验证了方法的有效性。”
%U https://aclanthology.org/2023.ccl-1.41
%P 468-479
Markdown (Informal)
[基于多尺度建模的端到端自动语音识别方法(An End-to-End Automatic Speech Recognition Method Based on Multiscale Modeling)](https://aclanthology.org/2023.ccl-1.41) (Chen et al., CCL 2023)
ACL
- Hao Chen, Runlai Zhang, Yuhao Zhang, Chenghao Gao, Chen Xu, Anxiang Ma, Tong Xiao, and Jingbo Zhu. 2023. 基于多尺度建模的端到端自动语音识别方法(An End-to-End Automatic Speech Recognition Method Based on Multiscale Modeling). In Proceedings of the 22nd Chinese National Conference on Computational Linguistics, pages 468–479, Harbin, China. Chinese Information Processing Society of China.