@inproceedings{gao-etal-2024-dlm,
title = "{DLM}: A Decoupled Learning Model for Long-tailed Polyphone Disambiguation in {M}andarin",
author = "Gao, Beibei and
Zhang, Yangsen and
Xiang, Ga and
Jiang, Yushan",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.294",
doi = "10.18653/v1/2024.naacl-long.294",
pages = "5252--5262",
abstract = "Grapheme-to-phoneme conversion (G2P) is a critical component of the text-to-speech system (TTS), where polyphone disambiguation is the most crucial task. However, polyphone disambiguation datasets often suffer from the long-tail problem, and context learning for polyphonic characters commonly stems from a single dimension. In this paper, we propose a novel model DLM: a Decoupled Learning Model for long-tailed polyphone disambiguation in Mandarin. Firstly, DLM decouples representation and classification learnings. It can apply different data samplers for each stage to obtain an optimal training data distribution. This can mitigate the long-tail problem. Secondly, two improved attention mechanisms and a gradual conversion strategy are integrated into the DLM, which achieve transition learning of context from local to global. Finally, to evaluate the effectiveness of DLM, we construct a balanced polyphone disambiguation corpus via in-context learning. Experiments on the benchmark CPP dataset demonstrate that DLM achieves a boosted accuracy of 99.07{\%}. Moreover, DLM improves the disambiguation performance of long-tailed polyphonic characters. For many long-tailed characters, DLM even achieves an accuracy of 100{\%}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gao-etal-2024-dlm">
<titleInfo>
<title>DLM: A Decoupled Learning Model for Long-tailed Polyphone Disambiguation in Mandarin</title>
</titleInfo>
<name type="personal">
<namePart type="given">Beibei</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangsen</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ga</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yushan</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Grapheme-to-phoneme conversion (G2P) is a critical component of the text-to-speech system (TTS), where polyphone disambiguation is the most crucial task. However, polyphone disambiguation datasets often suffer from the long-tail problem, and context learning for polyphonic characters commonly stems from a single dimension. In this paper, we propose a novel model DLM: a Decoupled Learning Model for long-tailed polyphone disambiguation in Mandarin. Firstly, DLM decouples representation and classification learnings. It can apply different data samplers for each stage to obtain an optimal training data distribution. This can mitigate the long-tail problem. Secondly, two improved attention mechanisms and a gradual conversion strategy are integrated into the DLM, which achieve transition learning of context from local to global. Finally, to evaluate the effectiveness of DLM, we construct a balanced polyphone disambiguation corpus via in-context learning. Experiments on the benchmark CPP dataset demonstrate that DLM achieves a boosted accuracy of 99.07%. Moreover, DLM improves the disambiguation performance of long-tailed polyphonic characters. For many long-tailed characters, DLM even achieves an accuracy of 100%.</abstract>
<identifier type="citekey">gao-etal-2024-dlm</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.294</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.294</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>5252</start>
<end>5262</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DLM: A Decoupled Learning Model for Long-tailed Polyphone Disambiguation in Mandarin
%A Gao, Beibei
%A Zhang, Yangsen
%A Xiang, Ga
%A Jiang, Yushan
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F gao-etal-2024-dlm
%X Grapheme-to-phoneme conversion (G2P) is a critical component of the text-to-speech system (TTS), where polyphone disambiguation is the most crucial task. However, polyphone disambiguation datasets often suffer from the long-tail problem, and context learning for polyphonic characters commonly stems from a single dimension. In this paper, we propose a novel model DLM: a Decoupled Learning Model for long-tailed polyphone disambiguation in Mandarin. Firstly, DLM decouples representation and classification learnings. It can apply different data samplers for each stage to obtain an optimal training data distribution. This can mitigate the long-tail problem. Secondly, two improved attention mechanisms and a gradual conversion strategy are integrated into the DLM, which achieve transition learning of context from local to global. Finally, to evaluate the effectiveness of DLM, we construct a balanced polyphone disambiguation corpus via in-context learning. Experiments on the benchmark CPP dataset demonstrate that DLM achieves a boosted accuracy of 99.07%. Moreover, DLM improves the disambiguation performance of long-tailed polyphonic characters. For many long-tailed characters, DLM even achieves an accuracy of 100%.
%R 10.18653/v1/2024.naacl-long.294
%U https://aclanthology.org/2024.naacl-long.294
%U https://doi.org/10.18653/v1/2024.naacl-long.294
%P 5252-5262
Markdown (Informal)
[DLM: A Decoupled Learning Model for Long-tailed Polyphone Disambiguation in Mandarin](https://aclanthology.org/2024.naacl-long.294) (Gao et al., NAACL 2024)
ACL