@inproceedings{deoghare-etal-2025-refer,
title = "Refer to the Reference: Reference-focused Synthetic Automatic Post-Editing Data Generation",
author = "Deoghare, Sourabh Dattatray and
Kanojia, Diptesh and
Bhattacharyya, Pushpak",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.344/",
pages = "5123--5135",
abstract = "A prevalent approach to synthetic APE data generation uses source (src) sentences in a parallel corpus to obtain translations (mt) through an MT system and treats corresponding reference (ref) sentences as post-edits (pe). While effective, due to independence between {\textquoteleft}mt' and {\textquoteleft}pe,' these translations do not adequately reflect errors to be corrected by a human post-editor. Thus, we introduce a novel and simple yet effective reference-focused synthetic APE data generation technique that uses {\textquoteleft}ref' instead of src' sentences to obtain corrupted translations (mt{\_}new). The experimental results across English-German, English-Russian, English-Marathi, English-Hindi, and English-Tamil language pairs demonstrate the superior performance of APE systems trained using the newly generated synthetic data compared to those trained using existing synthetic data. Further, APE models trained using a balanced mix of existing and newly generated synthetic data achieve improvements of 0.37, 0.19, 1.01, 2.42, and 2.60 TER points, respectively. We will release the generated synthetic APE data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="deoghare-etal-2025-refer">
<titleInfo>
<title>Refer to the Reference: Reference-focused Synthetic Automatic Post-Editing Data Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sourabh</namePart>
<namePart type="given">Dattatray</namePart>
<namePart type="family">Deoghare</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diptesh</namePart>
<namePart type="family">Kanojia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A prevalent approach to synthetic APE data generation uses source (src) sentences in a parallel corpus to obtain translations (mt) through an MT system and treats corresponding reference (ref) sentences as post-edits (pe). While effective, due to independence between ‘mt’ and ‘pe,’ these translations do not adequately reflect errors to be corrected by a human post-editor. Thus, we introduce a novel and simple yet effective reference-focused synthetic APE data generation technique that uses ‘ref’ instead of src’ sentences to obtain corrupted translations (mt_new). The experimental results across English-German, English-Russian, English-Marathi, English-Hindi, and English-Tamil language pairs demonstrate the superior performance of APE systems trained using the newly generated synthetic data compared to those trained using existing synthetic data. Further, APE models trained using a balanced mix of existing and newly generated synthetic data achieve improvements of 0.37, 0.19, 1.01, 2.42, and 2.60 TER points, respectively. We will release the generated synthetic APE data.</abstract>
<identifier type="citekey">deoghare-etal-2025-refer</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.344/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>5123</start>
<end>5135</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Refer to the Reference: Reference-focused Synthetic Automatic Post-Editing Data Generation
%A Deoghare, Sourabh Dattatray
%A Kanojia, Diptesh
%A Bhattacharyya, Pushpak
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F deoghare-etal-2025-refer
%X A prevalent approach to synthetic APE data generation uses source (src) sentences in a parallel corpus to obtain translations (mt) through an MT system and treats corresponding reference (ref) sentences as post-edits (pe). While effective, due to independence between ‘mt’ and ‘pe,’ these translations do not adequately reflect errors to be corrected by a human post-editor. Thus, we introduce a novel and simple yet effective reference-focused synthetic APE data generation technique that uses ‘ref’ instead of src’ sentences to obtain corrupted translations (mt_new). The experimental results across English-German, English-Russian, English-Marathi, English-Hindi, and English-Tamil language pairs demonstrate the superior performance of APE systems trained using the newly generated synthetic data compared to those trained using existing synthetic data. Further, APE models trained using a balanced mix of existing and newly generated synthetic data achieve improvements of 0.37, 0.19, 1.01, 2.42, and 2.60 TER points, respectively. We will release the generated synthetic APE data.
%U https://aclanthology.org/2025.coling-main.344/
%P 5123-5135
Markdown (Informal)
[Refer to the Reference: Reference-focused Synthetic Automatic Post-Editing Data Generation](https://aclanthology.org/2025.coling-main.344/) (Deoghare et al., COLING 2025)
ACL