@inproceedings{moon-etal-2022-empirical,
title = "Empirical Analysis of Noising Scheme based Synthetic Data Generation for Automatic Post-editing",
author = "Moon, Hyeonseok and
Park, Chanjun and
Lee, Seolhwa and
Seo, Jaehyung and
Lee, Jungseob and
Eo, Sugyeong and
Lim, Heuiseok",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.93",
pages = "883--891",
abstract = "Automatic post-editing (APE) refers to a research field that aims to automatically correct errors included in the translation sentences derived by the machine translation system. This study has several limitations, considering the data acquisition, because there is no official dataset for most language pairs. Moreover, the amount of data is restricted even for language pairs in which official data has been released, such as WMT. To solve this problem and promote universal APE research regardless of APE data existence, this study proposes a method for automatically generating APE data based on a noising scheme from a parallel corpus. Particularly, we propose a human mimicking errors-based noising scheme that considers a practical correction process at the human level. We propose a precise inspection to attain high performance, and we derived the optimal noising schemes that show substantial effectiveness. Through these, we also demonstrate that depending on the type of noise, the noising scheme-based APE data generation may lead to inferior performance. In addition, we propose a dynamic noise injection strategy that enables the acquisition of a robust error correction capability and demonstrated its effectiveness by comparative analysis. This study enables obtaining a high performance APE model without human-generated data and can promote universal APE research for all language pairs targeting English.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="moon-etal-2022-empirical">
<titleInfo>
<title>Empirical Analysis of Noising Scheme based Synthetic Data Generation for Automatic Post-editing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hyeonseok</namePart>
<namePart type="family">Moon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chanjun</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seolhwa</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaehyung</namePart>
<namePart type="family">Seo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jungseob</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sugyeong</namePart>
<namePart type="family">Eo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heuiseok</namePart>
<namePart type="family">Lim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic post-editing (APE) refers to a research field that aims to automatically correct errors included in the translation sentences derived by the machine translation system. This study has several limitations, considering the data acquisition, because there is no official dataset for most language pairs. Moreover, the amount of data is restricted even for language pairs in which official data has been released, such as WMT. To solve this problem and promote universal APE research regardless of APE data existence, this study proposes a method for automatically generating APE data based on a noising scheme from a parallel corpus. Particularly, we propose a human mimicking errors-based noising scheme that considers a practical correction process at the human level. We propose a precise inspection to attain high performance, and we derived the optimal noising schemes that show substantial effectiveness. Through these, we also demonstrate that depending on the type of noise, the noising scheme-based APE data generation may lead to inferior performance. In addition, we propose a dynamic noise injection strategy that enables the acquisition of a robust error correction capability and demonstrated its effectiveness by comparative analysis. This study enables obtaining a high performance APE model without human-generated data and can promote universal APE research for all language pairs targeting English.</abstract>
<identifier type="citekey">moon-etal-2022-empirical</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.93</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>883</start>
<end>891</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Empirical Analysis of Noising Scheme based Synthetic Data Generation for Automatic Post-editing
%A Moon, Hyeonseok
%A Park, Chanjun
%A Lee, Seolhwa
%A Seo, Jaehyung
%A Lee, Jungseob
%A Eo, Sugyeong
%A Lim, Heuiseok
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F moon-etal-2022-empirical
%X Automatic post-editing (APE) refers to a research field that aims to automatically correct errors included in the translation sentences derived by the machine translation system. This study has several limitations, considering the data acquisition, because there is no official dataset for most language pairs. Moreover, the amount of data is restricted even for language pairs in which official data has been released, such as WMT. To solve this problem and promote universal APE research regardless of APE data existence, this study proposes a method for automatically generating APE data based on a noising scheme from a parallel corpus. Particularly, we propose a human mimicking errors-based noising scheme that considers a practical correction process at the human level. We propose a precise inspection to attain high performance, and we derived the optimal noising schemes that show substantial effectiveness. Through these, we also demonstrate that depending on the type of noise, the noising scheme-based APE data generation may lead to inferior performance. In addition, we propose a dynamic noise injection strategy that enables the acquisition of a robust error correction capability and demonstrated its effectiveness by comparative analysis. This study enables obtaining a high performance APE model without human-generated data and can promote universal APE research for all language pairs targeting English.
%U https://aclanthology.org/2022.lrec-1.93
%P 883-891
Markdown (Informal)
[Empirical Analysis of Noising Scheme based Synthetic Data Generation for Automatic Post-editing](https://aclanthology.org/2022.lrec-1.93) (Moon et al., LREC 2022)
ACL