@inproceedings{palma-gomez-rozovskaya-2025-low,
title = "Low-Resource Grammatical Error Correction: Selective Data Augmentation with Round-Trip Machine Translation",
author = "Palma Gomez, Frank and
Rozovskaya, Alla",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1322/",
doi = "10.18653/v1/2025.findings-acl.1322",
pages = "25749--25770",
ISBN = "979-8-89176-256-5",
abstract = "Supervised state-of-the-art methods for grammatical error correction require large amounts of parallel data for training. Due to lack of gold-labeled data, techniques that create synthetic training data have become popular. We show that models trained on synthetic data tend tocorrect a limited range of grammar and spelling mistakes that involve character-level changes, but perform poorly on (more complex) phenomena that require word-level changes. We propose to address the performance gap on such errors by generating synthetic data through selective data augmentation via round-trip machine translation. We show that the proposed technique, SeLex-RT, is capable of generating mistakes that are similar to those observed with language learners. Using the approach with two types of state-of-the-art learning frameworks and two low-resource languages (Russian and Ukrainian), we achieve substantial improvements, compared to training on synthetic data produced with standard techniques. Analysis of the output reveals that models trained on data noisified with the SeLex-RT approach are capable of making word-level changes and correct lexical errors common with language learners."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="palma-gomez-rozovskaya-2025-low">
<titleInfo>
<title>Low-Resource Grammatical Error Correction: Selective Data Augmentation with Round-Trip Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Frank</namePart>
<namePart type="family">Palma Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alla</namePart>
<namePart type="family">Rozovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Supervised state-of-the-art methods for grammatical error correction require large amounts of parallel data for training. Due to lack of gold-labeled data, techniques that create synthetic training data have become popular. We show that models trained on synthetic data tend to correct a limited range of grammar and spelling mistakes that involve character-level changes, but perform poorly on (more complex) phenomena that require word-level changes. We propose to address the performance gap on such errors by generating synthetic data through selective data augmentation via round-trip machine translation. We show that the proposed technique, SeLex-RT, is capable of generating mistakes that are similar to those observed with language learners. Using the approach with two types of state-of-the-art learning frameworks and two low-resource languages (Russian and Ukrainian), we achieve substantial improvements, compared to training on synthetic data produced with standard techniques. Analysis of the output reveals that models trained on data noisified with the SeLex-RT approach are capable of making word-level changes and correcting lexical errors common with language learners.</abstract>
<identifier type="citekey">palma-gomez-rozovskaya-2025-low</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1322</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1322/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>25749</start>
<end>25770</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Low-Resource Grammatical Error Correction: Selective Data Augmentation with Round-Trip Machine Translation
%A Palma Gomez, Frank
%A Rozovskaya, Alla
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F palma-gomez-rozovskaya-2025-low
%X Supervised state-of-the-art methods for grammatical error correction require large amounts of parallel data for training. Due to lack of gold-labeled data, techniques that create synthetic training data have become popular. We show that models trained on synthetic data tend to correct a limited range of grammar and spelling mistakes that involve character-level changes, but perform poorly on (more complex) phenomena that require word-level changes. We propose to address the performance gap on such errors by generating synthetic data through selective data augmentation via round-trip machine translation. We show that the proposed technique, SeLex-RT, is capable of generating mistakes that are similar to those observed with language learners. Using the approach with two types of state-of-the-art learning frameworks and two low-resource languages (Russian and Ukrainian), we achieve substantial improvements, compared to training on synthetic data produced with standard techniques. Analysis of the output reveals that models trained on data noisified with the SeLex-RT approach are capable of making word-level changes and correcting lexical errors common with language learners.
%R 10.18653/v1/2025.findings-acl.1322
%U https://aclanthology.org/2025.findings-acl.1322/
%U https://doi.org/10.18653/v1/2025.findings-acl.1322
%P 25749-25770
Markdown (Informal)
[Low-Resource Grammatical Error Correction: Selective Data Augmentation with Round-Trip Machine Translation](https://aclanthology.org/2025.findings-acl.1322/) (Palma Gomez & Rozovskaya, Findings 2025)