@inproceedings{ye-etal-2023-system,
title = "System Report for {CCL}23-Eval Task 7: {THU} {KEL}ab (sz) - Exploring Data Augmentation and Denoising for {C}hinese Grammatical Error Correction",
author = "Ye, Jingheng and
Li, Yinghui and
Zheng, Haitao",
editor = "Sun, Maosong and
Qin, Bing and
Qiu, Xipeng and
Jiang, Jing and
Han, Xianpei",
booktitle = "Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 3: Evaluations)",
month = aug,
year = "2023",
address = "Harbin, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2023.ccl-3.29",
pages = "262--270",
abstract = "{``}This paper explains our GEC system submitted by THU KELab (sz) in the CCL2023-Eval Task7 CLTC (Chinese Learner Text Correction) Track 1: Multidimensional Chinese Learner TextCorrection. Recent studies have demonstrate GEC performance can be improved by increasingthe amount of training data. However, high-quality public GEC data is much less abundant. To address this issue, we propose two data-driven techniques, data augmentation and data de-noising, to improve the GEC performance. Data augmentation creates pseudo data to enhancegeneralization, while data denoising removes noise from the realistic training data. The resultson the official evaluation dataset YACLC demonstrate the effectiveness of our approach. Finally,our GEC system ranked second in both close and open tasks. All of our datasets and codes areavailabel at \url{https://github.com/THUKElab/CCL2023-CLTC-THU_KELab}.{''}",
language = "English",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ye-etal-2023-system">
<titleInfo>
<title>System Report for CCL23-Eval Task 7: THU KELab (sz) - Exploring Data Augmentation and Denoising for Chinese Grammatical Error Correction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jingheng</namePart>
<namePart type="family">Ye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinghui</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haitao</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 3: Evaluations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bing</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xipeng</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Harbin, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“This paper explains our GEC system submitted by THU KELab (sz) in the CCL2023-Eval Task7 CLTC (Chinese Learner Text Correction) Track 1: Multidimensional Chinese Learner TextCorrection. Recent studies have demonstrate GEC performance can be improved by increasingthe amount of training data. However, high-quality public GEC data is much less abundant. To address this issue, we propose two data-driven techniques, data augmentation and data de-noising, to improve the GEC performance. Data augmentation creates pseudo data to enhancegeneralization, while data denoising removes noise from the realistic training data. The resultson the official evaluation dataset YACLC demonstrate the effectiveness of our approach. Finally,our GEC system ranked second in both close and open tasks. All of our datasets and codes areavailabel at https://github.com/THUKElab/CCL2023-CLTC-THU_KELab.”</abstract>
<identifier type="citekey">ye-etal-2023-system</identifier>
<location>
<url>https://aclanthology.org/2023.ccl-3.29</url>
</location>
<part>
<date>2023-08</date>
<extent unit="page">
<start>262</start>
<end>270</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T System Report for CCL23-Eval Task 7: THU KELab (sz) - Exploring Data Augmentation and Denoising for Chinese Grammatical Error Correction
%A Ye, Jingheng
%A Li, Yinghui
%A Zheng, Haitao
%Y Sun, Maosong
%Y Qin, Bing
%Y Qiu, Xipeng
%Y Jiang, Jing
%Y Han, Xianpei
%S Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 3: Evaluations)
%D 2023
%8 August
%I Chinese Information Processing Society of China
%C Harbin, China
%G English
%F ye-etal-2023-system
%X “This paper explains our GEC system submitted by THU KELab (sz) in the CCL2023-Eval Task7 CLTC (Chinese Learner Text Correction) Track 1: Multidimensional Chinese Learner TextCorrection. Recent studies have demonstrate GEC performance can be improved by increasingthe amount of training data. However, high-quality public GEC data is much less abundant. To address this issue, we propose two data-driven techniques, data augmentation and data de-noising, to improve the GEC performance. Data augmentation creates pseudo data to enhancegeneralization, while data denoising removes noise from the realistic training data. The resultson the official evaluation dataset YACLC demonstrate the effectiveness of our approach. Finally,our GEC system ranked second in both close and open tasks. All of our datasets and codes areavailabel at https://github.com/THUKElab/CCL2023-CLTC-THU_KELab.”
%U https://aclanthology.org/2023.ccl-3.29
%P 262-270
Markdown (Informal)
[System Report for CCL23-Eval Task 7: THU KELab (sz) - Exploring Data Augmentation and Denoising for Chinese Grammatical Error Correction](https://aclanthology.org/2023.ccl-3.29) (Ye et al., CCL 2023)
ACL