% Conference paper in the CCL 2024 main-conference proceedings (ACL Anthology ID 2024.ccl-1.90).
% The citation key is kept exactly as published so existing \cite commands keep resolving;
% author names are stored in "Family, Given" form.
@inproceedings{chuyuan-etal-2024-distinguishing,
    title = "Distinguishing Neural Speech Synthesis Models Through Fingerprints in Speech Waveforms",
    author = "Zhang, ChuYuan and
      Yi, Jiangyan and
      Tao, Jianhua and
      Wang, Chenglong and
      Yan, Xinrui",
    editor = "Sun, Maosong and
      Liang, Jiye and
      Han, Xianpei and
      Liu, Zhiyuan and
      He, Yulan",
    booktitle = "Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)",
    month = jul,
    year = "2024",
    address = "Taiyuan, China",
    publisher = "Chinese Information Processing Society of China",
    url = "https://aclanthology.org/2024.ccl-1.90/",
    pages = "1160--1171",
    language = "eng",
    abstract = "Recent advancements in neural speech synthesis technologies have brought about widespread applications but have also raised concerns about potential misuse and abuse. Addressing these challenges is crucial, particularly in the realms of forensics and intellectual property protection. While previous research on source attribution of synthesized speech has its limitations, our study aims to fill these gaps by investigating the identification of sources in synthesized speech. We focus on analyzing speech synthesis model fingerprints in generated speech waveforms, emphasizing the roles of the acoustic model and vocoder. Our research, based on the multi-speaker LibriTTS dataset, reveals two key insights: (1) both vocoders and acoustic models leave distinct, model-specific fingerprints on generated waveforms, and (2) vocoder fingerprints, being more dominant, may obscure those from the acoustic model. These findings underscore the presence of model-specific fingerprints in both components, suggesting their potential significance in source identification applications."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chuyuan-etal-2024-distinguishing">
<titleInfo>
<title>Distinguishing Neural Speech Synthesis Models Through Fingerprints in Speech Waveforms</title>
</titleInfo>
<name type="personal">
<namePart type="given">ChuYuan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiangyan</namePart>
<namePart type="family">Yi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianhua</namePart>
<namePart type="family">Tao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenglong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinrui</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiye</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Taiyuan, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent advancements in neural speech synthesis technologies have brought about widespread applications but have also raised concerns about potential misuse and abuse. Addressing these challenges is crucial, particularly in the realms of forensics and intellectual property protection. While previous research on source attribution of synthesized speech has its limitations, our study aims to fill these gaps by investigating the identification of sources in synthesized speech. We focus on analyzing speech synthesis model fingerprints in generated speech waveforms, emphasizing the roles of the acoustic model and vocoder. Our research, based on the multi-speaker LibriTTS dataset, reveals two key insights: (1) both vocoders and acoustic models leave distinct, model-specific fingerprints on generated waveforms, and (2) vocoder fingerprints, being more dominant, may obscure those from the acoustic model. These findings underscore the presence of model-specific fingerprints in both components, suggesting their potential significance in source identification applications.</abstract>
<identifier type="citekey">chuyuan-etal-2024-distinguishing</identifier>
<location>
<url>https://aclanthology.org/2024.ccl-1.90/</url>
</location>
<part>
<date>2024-07</date>
<extent unit="page">
<start>1160</start>
<end>1171</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Distinguishing Neural Speech Synthesis Models Through Fingerprints in Speech Waveforms
%A Zhang, ChuYuan
%A Yi, Jiangyan
%A Tao, Jianhua
%A Wang, Chenglong
%A Yan, Xinrui
%Y Sun, Maosong
%Y Liang, Jiye
%Y Han, Xianpei
%Y Liu, Zhiyuan
%Y He, Yulan
%S Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)
%D 2024
%8 July
%I Chinese Information Processing Society of China
%C Taiyuan, China
%G eng
%F chuyuan-etal-2024-distinguishing
%X Recent advancements in neural speech synthesis technologies have brought about widespread applications but have also raised concerns about potential misuse and abuse. Addressing these challenges is crucial, particularly in the realms of forensics and intellectual property protection. While previous research on source attribution of synthesized speech has its limitations, our study aims to fill these gaps by investigating the identification of sources in synthesized speech. We focus on analyzing speech synthesis model fingerprints in generated speech waveforms, emphasizing the roles of the acoustic model and vocoder. Our research, based on the multi-speaker LibriTTS dataset, reveals two key insights: (1) both vocoders and acoustic models leave distinct, model-specific fingerprints on generated waveforms, and (2) vocoder fingerprints, being more dominant, may obscure those from the acoustic model. These findings underscore the presence of model-specific fingerprints in both components, suggesting their potential significance in source identification applications.
%U https://aclanthology.org/2024.ccl-1.90/
%P 1160-1171
Markdown (Informal)
[Distinguishing Neural Speech Synthesis Models Through Fingerprints in Speech Waveforms](https://aclanthology.org/2024.ccl-1.90/) (Zhang et al., CCL 2024)
ACL