@inproceedings{dong-etal-2025-retrieval,
title = "Retrieval-Augmented Generation for Large Language Model based Few-shot {C}hinese Spell Checking",
author = "Dong, Ming and
Cheng, Zhiwei and
Luo, Changyin and
He, Tingting",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.717/",
pages = "10767--10780",
abstract = "Large language models (LLMs) are naturally suitable for Chinese spelling check (CSC) task in few-shot scenarios due to their powerful semantic understanding and few-shot learning capabilities. Recent CSC research has begun to use LLMs as foundational models. However, most current datasets are primarily focused on errors generated during the text generation process, with little attention given to errors occurring in the modal conversion process. Furthermore, existing LLM-based CSC methods often rely on fixed prompt samples, which limits the performance of LLMs. Therefore, we propose a framework named RagID (Retrieval-Augment Generation and Iterative Discriminator Strategy). By utilizing semantic-based similarity search and an iterative discriminator mechanism, RagID can provide well-chosen prompt samples and reduce over-correction issues in LLM-based CSC. RagID demonstrates excellent effectiveness in few-shot scenarios. We conducted comprehensive experiments, and the results show that RagID achieves the best performance on dataset that include data from multiple domains and dataset containing modal conversion spelling errors. The dataset and method are available online."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dong-etal-2025-retrieval">
<titleInfo>
<title>Retrieval-Augmented Generation for Large Language Model based Few-shot Chinese Spell Checking</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiwei</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changyin</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tingting</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) are naturally suitable for Chinese spelling check (CSC) task in few-shot scenarios due to their powerful semantic understanding and few-shot learning capabilities. Recent CSC research has begun to use LLMs as foundational models. However, most current datasets are primarily focused on errors generated during the text generation process, with little attention given to errors occurring in the modal conversion process. Furthermore, existing LLM-based CSC methods often rely on fixed prompt samples, which limits the performance of LLMs. Therefore, we propose a framework named RagID (Retrieval-Augment Generation and Iterative Discriminator Strategy). By utilizing semantic-based similarity search and an iterative discriminator mechanism, RagID can provide well-chosen prompt samples and reduce over-correction issues in LLM-based CSC. RagID demonstrates excellent effectiveness in few-shot scenarios. We conducted comprehensive experiments, and the results show that RagID achieves the best performance on dataset that include data from multiple domains and dataset containing modal conversion spelling errors. The dataset and method are available online.</abstract>
<identifier type="citekey">dong-etal-2025-retrieval</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.717/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>10767</start>
<end>10780</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Retrieval-Augmented Generation for Large Language Model based Few-shot Chinese Spell Checking
%A Dong, Ming
%A Cheng, Zhiwei
%A Luo, Changyin
%A He, Tingting
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F dong-etal-2025-retrieval
%X Large language models (LLMs) are naturally suitable for Chinese spelling check (CSC) task in few-shot scenarios due to their powerful semantic understanding and few-shot learning capabilities. Recent CSC research has begun to use LLMs as foundational models. However, most current datasets are primarily focused on errors generated during the text generation process, with little attention given to errors occurring in the modal conversion process. Furthermore, existing LLM-based CSC methods often rely on fixed prompt samples, which limits the performance of LLMs. Therefore, we propose a framework named RagID (Retrieval-Augment Generation and Iterative Discriminator Strategy). By utilizing semantic-based similarity search and an iterative discriminator mechanism, RagID can provide well-chosen prompt samples and reduce over-correction issues in LLM-based CSC. RagID demonstrates excellent effectiveness in few-shot scenarios. We conducted comprehensive experiments, and the results show that RagID achieves the best performance on dataset that include data from multiple domains and dataset containing modal conversion spelling errors. The dataset and method are available online.
%U https://aclanthology.org/2025.coling-main.717/
%P 10767-10780
Markdown (Informal)
[Retrieval-Augmented Generation for Large Language Model based Few-shot Chinese Spell Checking](https://aclanthology.org/2025.coling-main.717/) (Dong et al., COLING 2025)
ACL