@inproceedings{sun-etal-2024-two,
title = "Two Issues with {C}hinese Spelling Correction and A Refinement Solution",
author = "Sun, Changxuan and
She, Linlin and
Lu, Xuesong",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-short.19/",
doi = "10.18653/v1/2024.acl-short.19",
pages = "196--204",
abstract = "The Chinese Spelling Correction (CSC) task aims to detect and correct misspelled characters in Chinese text, and has received lots of attention in the past few years. Most recent studies adopt a Transformer-based model and leverage different features of characters such as pronunciation, glyph and contextual information to enhance the model`s ability to complete the task. Despite their state-of-the-art performance, we observe two issues that should be addressed to further advance the CSC task. First, the widely-used benchmark datasets SIGHAN13, SIGHAN14 and SIGHAN15, contain many mistakes. Hence the performance of existing models is not accurate and should be re-evaluated. Second, existing models seem to have reached a performance bottleneck, where the improvements on the SIGHAN`s testing sets are increasingly smaller and unstable. To deal with the two issues, we make two contributions: (1) we manually fix the SIGHAN datasets and re-evaluate four representative CSC models using the fixed datasets; (2) we analyze the new results to identify the spelling errors that none of the four models successfully corrects, based on which we propose a simple yet effective refinement solution. Experimental results show that our solution improves the four models in all metrics by notable margins."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sun-etal-2024-two">
<titleInfo>
<title>Two Issues with Chinese Spelling Correction and A Refinement Solution</title>
</titleInfo>
<name type="personal">
<namePart type="given">Changxuan</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linlin</namePart>
<namePart type="family">She</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuesong</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The Chinese Spelling Correction (CSC) task aims to detect and correct misspelled characters in Chinese text, and has received lots of attention in the past few years. Most recent studies adopt a Transformer-based model and leverage different features of characters such as pronunciation, glyph and contextual information to enhance the model‘s ability to complete the task. Despite their state-of-the-art performance, we observe two issues that should be addressed to further advance the CSC task. First, the widely-used benchmark datasets SIGHAN13, SIGHAN14 and SIGHAN15, contain many mistakes. Hence the performance of existing models is not accurate and should be re-evaluated. Second, existing models seem to have reached a performance bottleneck, where the improvements on the SIGHAN‘s testing sets are increasingly smaller and unstable. To deal with the two issues, we make two contributions: (1) we manually fix the SIGHAN datasets and re-evaluate four representative CSC models using the fixed datasets; (2) we analyze the new results to identify the spelling errors that none of the four models successfully corrects, based on which we propose a simple yet effective refinement solution. Experimental results show that our solution improves the four models in all metrics by notable margins.</abstract>
<identifier type="citekey">sun-etal-2024-two</identifier>
<identifier type="doi">10.18653/v1/2024.acl-short.19</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-short.19/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>196</start>
<end>204</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Two Issues with Chinese Spelling Correction and A Refinement Solution
%A Sun, Changxuan
%A She, Linlin
%A Lu, Xuesong
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F sun-etal-2024-two
%X The Chinese Spelling Correction (CSC) task aims to detect and correct misspelled characters in Chinese text, and has received lots of attention in the past few years. Most recent studies adopt a Transformer-based model and leverage different features of characters such as pronunciation, glyph and contextual information to enhance the model‘s ability to complete the task. Despite their state-of-the-art performance, we observe two issues that should be addressed to further advance the CSC task. First, the widely-used benchmark datasets SIGHAN13, SIGHAN14 and SIGHAN15, contain many mistakes. Hence the performance of existing models is not accurate and should be re-evaluated. Second, existing models seem to have reached a performance bottleneck, where the improvements on the SIGHAN‘s testing sets are increasingly smaller and unstable. To deal with the two issues, we make two contributions: (1) we manually fix the SIGHAN datasets and re-evaluate four representative CSC models using the fixed datasets; (2) we analyze the new results to identify the spelling errors that none of the four models successfully corrects, based on which we propose a simple yet effective refinement solution. Experimental results show that our solution improves the four models in all metrics by notable margins.
%R 10.18653/v1/2024.acl-short.19
%U https://aclanthology.org/2024.luhme-short.19/
%U https://doi.org/10.18653/v1/2024.acl-short.19
%P 196-204
Markdown (Informal)
[Two Issues with Chinese Spelling Correction and A Refinement Solution](https://aclanthology.org/2024.luhme-short.19/) (Sun et al., ACL 2024)
ACL