@inproceedings{su-etal-2025-racqc,
title = "{RACQC}: Advanced Retrieval-Augmented Generation for {C}hinese Query Correction",
author = "Su, Jinbo and
Gao, Lingzhe and
Li, Wei and
Liu, Shihao and
Lei, Haojie and
Wang, Xinyi and
Guo, Yuanzhao and
Wang, Ke and
Shi, Daiting and
Yin, Dawei",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.36/",
pages = "675--689",
ISBN = "979-8-89176-335-7",
abstract = "In web search scenarios, erroneous queries frequently degrade users' experience through irrelevant results, underscoring the pivotal role of Chinese Spelling Check (CSC) systems. Although large language models (LLMs) exhibit remarkable capabilities across many tasks, they face critical challenges in the CSC scenario: (1) poor generalization to rare entities in open-domain searches, and (2) failure to adapt to temporal entity variations due to static parameters, resulting in serious over-correction issues. To tackle this, we present RACQC, a **C**hinese **Q**uery **C**orrection system with **R**etrieval-**A**ugmented Generation (RAG) and multi-task learning. Specifically, our approach (1) integrates dynamic knowledge retrieval through entity-centric RAG to address rare entities and innovatively proposes an entity-title collaborative corpus, and (2) employs contrastive correction tasks to mitigate LLM over-correction tendencies. Furthermore, we propose MDCQC, a **M**ulti-**D**omain **C**hinese **Q**uery **C**orrection benchmark to test the model{'}s entity correction capabilities. Extensive experiments on several datasets show that RACQC significantly outperforms existing baselines in CSC tasks. Specifically, RACQC achieves a maximum improvement of +9.92{\%} on the search scenario benchmark and +3.2{\%} on the general-domain dataset under the $F_1$ metric."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="su-etal-2025-racqc">
<titleInfo>
<title>RACQC: Advanced Retrieval-Augmented Generation for Chinese Query Correction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jinbo</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lingzhe</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shihao</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haojie</namePart>
<namePart type="family">Lei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyi</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuanzhao</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ke</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daiting</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dawei</namePart>
<namePart type="family">Yin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>In web search scenarios, erroneous queries frequently degrade users’ experience through irrelevant results, underscoring the pivotal role of Chinese Spelling Check (CSC) systems. Although large language models (LLMs) exhibit remarkable capabilities across many tasks, they face critical challenges in the CSC scenario: (1) poor generalization to rare entities in open-domain searches, and (2) failure to adapt to temporal entity variations due to static parameters, resulting in serious over-correction issues. To tackle this, we present RACQC, a **C**hinese **Q**uery **C**orrection system with **R**etrieval-**A**ugmented Generation (RAG) and multi-task learning. Specifically, our approach (1) integrates dynamic knowledge retrieval through entity-centric RAG to address rare entities and innovatively proposes an entity-title collaborative corpus, and (2) employs contrastive correction tasks to mitigate LLM over-correction tendencies. Furthermore, we propose MDCQC, a **M**ulti-**D**omain **C**hinese **Q**uery **C**orrection benchmark to test the model’s entity correction capabilities. Extensive experiments on several datasets show that RACQC significantly outperforms existing baselines in CSC tasks. Specifically, RACQC achieves a maximum improvement of +9.92% on the search scenario benchmark and +3.2% on the general-domain dataset under the F₁ metric.</abstract>
<identifier type="citekey">su-etal-2025-racqc</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.36/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>675</start>
<end>689</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T RACQC: Advanced Retrieval-Augmented Generation for Chinese Query Correction
%A Su, Jinbo
%A Gao, Lingzhe
%A Li, Wei
%A Liu, Shihao
%A Lei, Haojie
%A Wang, Xinyi
%A Guo, Yuanzhao
%A Wang, Ke
%A Shi, Daiting
%A Yin, Dawei
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F su-etal-2025-racqc
%X In web search scenarios, erroneous queries frequently degrade users’ experience through irrelevant results, underscoring the pivotal role of Chinese Spelling Check (CSC) systems. Although large language models (LLMs) exhibit remarkable capabilities across many tasks, they face critical challenges in the CSC scenario: (1) poor generalization to rare entities in open-domain searches, and (2) failure to adapt to temporal entity variations due to static parameters, resulting in serious over-correction issues. To tackle this, we present RACQC, a **C**hinese **Q**uery **C**orrection system with **R**etrieval-**A**ugmented Generation (RAG) and multi-task learning. Specifically, our approach (1) integrates dynamic knowledge retrieval through entity-centric RAG to address rare entities and innovatively proposes an entity-title collaborative corpus, and (2) employs contrastive correction tasks to mitigate LLM over-correction tendencies. Furthermore, we propose MDCQC, a **M**ulti-**D**omain **C**hinese **Q**uery **C**orrection benchmark to test the model’s entity correction capabilities. Extensive experiments on several datasets show that RACQC significantly outperforms existing baselines in CSC tasks. Specifically, RACQC achieves a maximum improvement of +9.92% on the search scenario benchmark and +3.2% on the general-domain dataset under the F₁ metric.
%U https://aclanthology.org/2025.findings-emnlp.36/
%P 675-689
Markdown (Informal)
[RACQC: Advanced Retrieval-Augmented Generation for Chinese Query Correction](https://aclanthology.org/2025.findings-emnlp.36/) (Su et al., Findings 2025)
ACL
- Jinbo Su, Lingzhe Gao, Wei Li, Shihao Liu, Haojie Lei, Xinyi Wang, Yuanzhao Guo, Ke Wang, Daiting Shi, and Dawei Yin. 2025. RACQC: Advanced Retrieval-Augmented Generation for Chinese Query Correction. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 675–689, Suzhou, China. Association for Computational Linguistics.