@inproceedings{chen-etal-2021-employing,
title = "Employing low-pass filtered temporal speech features for the training of ideal ratio mask in speech enhancement",
author = "Chen, Yan-Tong and
Lin, Zi-Qiang and
Hung, Jeih-Weih",
editor = "Lee, Lung-Hao and
Chang, Chia-Hui and
Chen, Kuan-Yu",
booktitle = "Proceedings of the 33rd Conference on Computational Linguistics and Speech Processing (ROCLING 2021)",
month = oct,
year = "2021",
address = "Taoyuan, Taiwan",
publisher = "The Association for Computational Linguistics and Chinese Language Processing (ACLCLP)",
url = "https://aclanthology.org/2021.rocling-1.30",
pages = "236--242",
abstract = "The masking-based speech enhancement method pursues a multiplicative mask that applies to the spectrogram of input noise-corrupted utterance, and a deep neural network (DNN) is often used to learn the mask. In particular, the features commonly used for automatic speech recognition can serve as the input of the DNN to learn the well-behaved mask that significantly reduce the noise distortion of processed utterances. This study proposes to preprocess the input speech features for the ideal ratio mask (IRM)-based DNN by lowpass filtering in order to alleviate the noise components. In particular, we employ the discrete wavelet transform (DWT) to decompose the temporal speech feature sequence and scale down the detail coefficients, which correspond to the high-pass portion of the sequence. Preliminary experiments conducted on a subset of TIMIT corpus reveal that the proposed method can make the resulting IRM achieve higher speech quality and intelligibility for the babble noise-corrupted signals compared with the original IRM, indicating that the lowpass filtered temporal feature sequence can learn a superior IRM network for speech enhancement.",
}
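The abstract above describes a DWT-based low-pass filtering of the temporal speech feature trajectories: each trajectory is decomposed, its detail (high-pass) coefficients are scaled down, and the sequence is reconstructed before being fed to the IRM network. The following is a minimal sketch of that preprocessing idea, assuming PyWavelets with a Haar wavelet, a single decomposition level, and an illustrative scaling factor; the paper's exact wavelet, level, and factor are not specified here.

```python
# Sketch of DWT-based low-pass filtering of temporal feature trajectories,
# as outlined in the abstract. Wavelet choice, decomposition level, and
# detail-scaling factor are illustrative assumptions, not the paper's settings.
import numpy as np
import pywt  # PyWavelets


def lowpass_filter_features(features, wavelet="haar", level=1, detail_scale=0.5):
    """Attenuate the DWT detail (high-pass) coefficients of each feature
    trajectory and reconstruct the sequence.

    features: (num_frames, num_dims) array of frame-wise speech features,
              e.g. log-Mel or MFCC vectors.
    """
    filtered = np.empty_like(features, dtype=float)
    for d in range(features.shape[1]):
        trajectory = features[:, d]
        # Decompose the temporal trajectory: [approx, detail_level, ..., detail_1]
        coeffs = pywt.wavedec(trajectory, wavelet, level=level)
        # Keep the approximation (low-pass) part, scale down the detail parts.
        coeffs = [coeffs[0]] + [detail_scale * c for c in coeffs[1:]]
        rec = pywt.waverec(coeffs, wavelet)
        filtered[:, d] = rec[: len(trajectory)]  # trim possible padding
    return filtered


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    feats = rng.standard_normal((200, 40))  # 200 frames of 40-dim features
    smoothed = lowpass_filter_features(feats)
    print(smoothed.shape)  # (200, 40)
```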
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2021-employing">
<titleInfo>
<title>Employing low-pass filtered temporal speech features for the training of ideal ratio mask in speech enhancement</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yan-Tong</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zi-Qiang</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jeih-Weih</namePart>
<namePart type="family">Hung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 33rd Conference on Computational Linguistics and Speech Processing (ROCLING 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lung-Hao</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chia-Hui</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kuan-Yu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The Association for Computational Linguistics and Chinese Language Processing (ACLCLP)</publisher>
<place>
<placeTerm type="text">Taoyuan, Taiwan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Masking-based speech enhancement pursues a multiplicative mask that is applied to the spectrogram of the input noise-corrupted utterance, and a deep neural network (DNN) is often used to learn the mask. In particular, the features commonly used for automatic speech recognition can serve as the input of the DNN to learn a well-behaved mask that significantly reduces the noise distortion of the processed utterances. This study proposes to preprocess the input speech features for the ideal ratio mask (IRM)-based DNN by low-pass filtering in order to alleviate the noise components. Specifically, we employ the discrete wavelet transform (DWT) to decompose the temporal speech feature sequence and scale down the detail coefficients, which correspond to the high-pass portion of the sequence. Preliminary experiments conducted on a subset of the TIMIT corpus reveal that the proposed method enables the resulting IRM to achieve higher speech quality and intelligibility for babble noise-corrupted signals than the original IRM, indicating that the low-pass filtered temporal feature sequence helps to learn a superior IRM network for speech enhancement.</abstract>
<identifier type="citekey">chen-etal-2021-employing</identifier>
<location>
<url>https://aclanthology.org/2021.rocling-1.30</url>
</location>
<part>
<date>2021-10</date>
<extent unit="page">
<start>236</start>
<end>242</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Employing low-pass filtered temporal speech features for the training of ideal ratio mask in speech enhancement
%A Chen, Yan-Tong
%A Lin, Zi-Qiang
%A Hung, Jeih-Weih
%Y Lee, Lung-Hao
%Y Chang, Chia-Hui
%Y Chen, Kuan-Yu
%S Proceedings of the 33rd Conference on Computational Linguistics and Speech Processing (ROCLING 2021)
%D 2021
%8 October
%I The Association for Computational Linguistics and Chinese Language Processing (ACLCLP)
%C Taoyuan, Taiwan
%F chen-etal-2021-employing
%X Masking-based speech enhancement pursues a multiplicative mask that is applied to the spectrogram of the input noise-corrupted utterance, and a deep neural network (DNN) is often used to learn the mask. In particular, the features commonly used for automatic speech recognition can serve as the input of the DNN to learn a well-behaved mask that significantly reduces the noise distortion of the processed utterances. This study proposes to preprocess the input speech features for the ideal ratio mask (IRM)-based DNN by low-pass filtering in order to alleviate the noise components. Specifically, we employ the discrete wavelet transform (DWT) to decompose the temporal speech feature sequence and scale down the detail coefficients, which correspond to the high-pass portion of the sequence. Preliminary experiments conducted on a subset of the TIMIT corpus reveal that the proposed method enables the resulting IRM to achieve higher speech quality and intelligibility for babble noise-corrupted signals than the original IRM, indicating that the low-pass filtered temporal feature sequence helps to learn a superior IRM network for speech enhancement.
%U https://aclanthology.org/2021.rocling-1.30
%P 236-242
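For context, the ideal ratio mask named in the abstract is a per time-frequency-bin training target computed from the clean-speech and noise spectrograms. The sketch below follows the common definition IRM = (|S|^2 / (|S|^2 + |N|^2))^0.5; the exponent and epsilon smoothing are standard choices assumed here, not values taken from the paper.

```python
# Sketch of the conventional ideal ratio mask (IRM) training target and its
# use as a multiplicative mask on the noisy spectrogram; beta and eps are
# common assumed choices, not settings reported by the paper.
import numpy as np


def ideal_ratio_mask(clean_mag, noise_mag, beta=0.5, eps=1e-8):
    """Per time-frequency bin: IRM = (|S|^2 / (|S|^2 + |N|^2))**beta.

    clean_mag, noise_mag: magnitude spectrograms, shape (num_frames, num_bins).
    """
    clean_pow = clean_mag ** 2
    noise_pow = noise_mag ** 2
    return (clean_pow / (clean_pow + noise_pow + eps)) ** beta


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    clean = np.abs(rng.standard_normal((100, 257)))
    noise = np.abs(rng.standard_normal((100, 257)))
    mask = ideal_ratio_mask(clean, noise)
    noisy = clean + noise        # rough magnitude approximation of the mixture
    enhanced = noisy * mask      # mask applied multiplicatively, as in the abstract
    print(mask.min() >= 0.0, mask.max() <= 1.0, enhanced.shape)
```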
Markdown (Informal)
[Employing low-pass filtered temporal speech features for the training of ideal ratio mask in speech enhancement](https://aclanthology.org/2021.rocling-1.30) (Chen et al., ROCLING 2021)
ACL
Yan-Tong Chen, Zi-Qiang Lin, and Jeih-Weih Hung. 2021. Employing low-pass filtered temporal speech features for the training of ideal ratio mask in speech enhancement. In Proceedings of the 33rd Conference on Computational Linguistics and Speech Processing (ROCLING 2021), pages 236–242, Taoyuan, Taiwan. The Association for Computational Linguistics and Chinese Language Processing (ACLCLP).