@inproceedings{feng-etal-2023-ji,
title = "基于离散化自监督表征增强的老挝语非自回归语音合成方法(A Discretized Self-Supervised Representation Enhancement based Non-Autoregressive Speech Synthesis Method for {L}ao Language)",
author = "Feng, Zijian and
Wang, Linqin and
Gao, Shengxaing and
Yu, Zhengtao and
Dong, Ling",
editor = "Sun, Maosong and
Qin, Bing and
Qiu, Xipeng and
Jiang, Jing and
Han, Xianpei",
booktitle = "Proceedings of the 22nd Chinese National Conference on Computational Linguistics",
month = aug,
year = "2023",
address = "Harbin, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2023.ccl-1.8",
pages = "90--101",
abstract = "{``}老挝语的语音合成对中老两国合作与交流意义重大,但老挝语语音发音复杂,存在声调、音节及音素等发音特性,现有语音合成方法在老挝语上效果不尽人意。基于注意力机制建模的自回归模型难以拟合复杂的老挝语语音,模型泛化能力差,容易出现漏字、跳字等灾难性错误,合成音频缺乏自然性和流畅性。本文提出基于离散化自监督表征增强的老挝语非自回归语音合成方法,结合老挝语的语言语音特点,使用老挝语音素粒度的标注时长信息构建非自回归架构声学模型,通过自监督学习的预训练语音模型来提取语音内容和声调信息的离散化表征,融入到声学模型中增强模型的语音生成能力,增强合成音频的流畅性和自然性。实验证明,本文方法合成音频达到了4.03的MOS评分,基于离散化自监督表征增强的非自回归建模方法,能更好的在声调、音素时长、音高等细粒度层面刻画老挝语的语音特性。{''}",
language = "Chinese",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="feng-etal-2023-ji">
<titleInfo>
<title>基于离散化自监督表征增强的老挝语非自回归语音合成方法(A Discretized Self-Supervised Representation Enhancement based Non-Autoregressive Speech Synthesis Method for Lao Language)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zijian</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linqin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shengxaing</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengtao</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ling</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">Chinese</languageTerm>
<languageTerm type="code" authority="iso639-2b">chi</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Chinese National Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bing</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xipeng</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Harbin, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“老挝语的语音合成对中老两国合作与交流意义重大,但老挝语语音发音复杂,存在声调、音节及音素等发音特性,现有语音合成方法在老挝语上效果不尽人意。基于注意力机制建模的自回归模型难以拟合复杂的老挝语语音,模型泛化能力差,容易出现漏字、跳字等灾难性错误,合成音频缺乏自然性和流畅性。本文提出基于离散化自监督表征增强的老挝语非自回归语音合成方法,结合老挝语的语言语音特点,使用老挝语音素粒度的标注时长信息构建非自回归架构声学模型,通过自监督学习的预训练语音模型来提取语音内容和声调信息的离散化表征,融入到声学模型中增强模型的语音生成能力,增强合成音频的流畅性和自然性。实验证明,本文方法合成音频达到了4.03的MOS评分,基于离散化自监督表征增强的非自回归建模方法,能更好的在声调、音素时长、音高等细粒度层面刻画老挝语的语音特性。”</abstract>
<identifier type="citekey">feng-etal-2023-ji</identifier>
<location>
<url>https://aclanthology.org/2023.ccl-1.8</url>
</location>
<part>
<date>2023-08</date>
<extent unit="page">
<start>90</start>
<end>101</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T 基于离散化自监督表征增强的老挝语非自回归语音合成方法(A Discretized Self-Supervised Representation Enhancement based Non-Autoregressive Speech Synthesis Method for Lao Language)
%A Feng, Zijian
%A Wang, Linqin
%A Gao, Shengxaing
%A Yu, Zhengtao
%A Dong, Ling
%Y Sun, Maosong
%Y Qin, Bing
%Y Qiu, Xipeng
%Y Jiang, Jing
%Y Han, Xianpei
%S Proceedings of the 22nd Chinese National Conference on Computational Linguistics
%D 2023
%8 August
%I Chinese Information Processing Society of China
%C Harbin, China
%G Chinese
%F feng-etal-2023-ji
%X “老挝语的语音合成对中老两国合作与交流意义重大,但老挝语语音发音复杂,存在声调、音节及音素等发音特性,现有语音合成方法在老挝语上效果不尽人意。基于注意力机制建模的自回归模型难以拟合复杂的老挝语语音,模型泛化能力差,容易出现漏字、跳字等灾难性错误,合成音频缺乏自然性和流畅性。本文提出基于离散化自监督表征增强的老挝语非自回归语音合成方法,结合老挝语的语言语音特点,使用老挝语音素粒度的标注时长信息构建非自回归架构声学模型,通过自监督学习的预训练语音模型来提取语音内容和声调信息的离散化表征,融入到声学模型中增强模型的语音生成能力,增强合成音频的流畅性和自然性。实验证明,本文方法合成音频达到了4.03的MOS评分,基于离散化自监督表征增强的非自回归建模方法,能更好的在声调、音素时长、音高等细粒度层面刻画老挝语的语音特性。”
%U https://aclanthology.org/2023.ccl-1.8
%P 90-101
Markdown (Informal)
[基于离散化自监督表征增强的老挝语非自回归语音合成方法(A Discretized Self-Supervised Representation Enhancement based Non-Autoregressive Speech Synthesis Method for Lao Language)](https://aclanthology.org/2023.ccl-1.8) (Feng et al., CCL 2023)
ACL