% CycleGN paper, pp. 165--175 of the Proceedings of the Ninth Conference on
% Machine Translation (2024).
% NOTE(review): `address` looks like the conference venue rather than the
% publisher's city -- confirm this is what the target style expects.
@inproceedings{dreano-etal-2024-cyclegn,
    title     = {{CycleGN}: A Cycle Consistent Approach for Neural Machine Translation},
    author    = {Dreano, S{\"o}ren and Molloy, Derek and Murphy, Noel},
    editor    = {Haddow, Barry and Kocmi, Tom and Koehn, Philipp and Monz, Christof},
    booktitle = {Proceedings of the Ninth Conference on Machine Translation},
    month     = nov,
    year      = {2024},
    address   = {Miami, Florida, USA},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2024.wmt-1.10},
    pages     = {165--175},
    abstract  = {CycleGN is a fully self-supervised Neural Machine Translation framework relying on the Transformer architecture that does not require parallel data. Its approach is similar to a Discriminator-less CycleGAN, hence the {``}non-adversarial{''} name, specifically tailored for non-parallel text datasets. The foundational concept of our research posits that in an ideal scenario, retro-translations of generated translations should revert to the original source sentences. Consequently, a pair of models can be trained using a Cycle Consistency Loss (CCL) only, with one model translating in one direction and the second model in the opposite direction. In the context of this research, two sub-categories of non-parallel datasets are introduced. A {``}permuted{''} dataset is defined as a parallel dataset wherein the sentences of one language have been systematically rearranged. Consequently, this results in a non-parallel corpus where it is guaranteed that each sentence has a corresponding translation located at an unspecified index within the dataset. A {``}non-intersecting{''} dataset is a non-parallel dataset for which it is guaranteed that no sentence has an exact translation. Masked Language Modeling (MLM) is a pre-training strategy implemented in BERT, where a specified proportion of the input tokens are substituted with a unique {\$}mask{\$} token. The objective of the neural network under this paradigm is to accurately reconstruct the original sentence from this degraded input. In inference mode, Transformers are able to generate sentences without labels. Thus, the first step is to generate pseudo-labels in inference, that are then used as labels during training. However, the models consistently converge towards a trivial solution in which the input, the generated pseudo-labels and the output are identical, achieving an optimal outcome on the CCL function, registering a value of zero. CycleGN demonstrates how MLM pre-training can be leveraged to move away from this trivial path and perform actual text translation. As a contribution to the WMT24 challenge, this study explores the efficacy of the CycleGN architectural framework in learning translation tasks across eleven language pairs under the permuted condition and four under the non-intersecting condition. Moreover, two additional language pairs from the previous WMT edition were trained and the evaluations demonstrate the robust adaptability of CycleGN in learning translation tasks.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dreano-etal-2024-cyclegn">
<titleInfo>
<title>CycleGN: A Cycle Consistent Approach for Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sören</namePart>
<namePart type="family">Dreano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derek</namePart>
<namePart type="family">Molloy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noel</namePart>
<namePart type="family">Murphy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>CycleGN is a fully self-supervised Neural Machine Translation framework relying on the Transformer architecture that does not require parallel data. Its approach is similar to a Discriminator-less CycleGAN, hence the “non-adversarial” name, specifically tailored for non-parallel text datasets. The foundational concept of our research posits that in an ideal scenario, retro-translations of generated translations should revert to the original source sentences. Consequently, a pair of models can be trained using a Cycle Consistency Loss (CCL) only, with one model translating in one direction and the second model in the opposite direction. In the context of this research, two sub-categories of non-parallel datasets are introduced. A “permuted” dataset is defined as a parallel dataset wherein the sentences of one language have been systematically rearranged. Consequently, this results in a non-parallel corpus where it is guaranteed that each sentence has a corresponding translation located at an unspecified index within the dataset. A “non-intersecting” dataset is a non-parallel dataset for which it is guaranteed that no sentence has an exact translation. Masked Language Modeling (MLM) is a pre-training strategy implemented in BERT, where a specified proportion of the input tokens are substituted with a unique $mask$ token. The objective of the neural network under this paradigm is to accurately reconstruct the original sentence from this degraded input. In inference mode, Transformers are able to generate sentences without labels. Thus, the first step is to generate pseudo-labels in inference, that are then used as labels during training. However, the models consistently converge towards a trivial solution in which the input, the generated pseudo-labels and the output are identical, achieving an optimal outcome on the CCL function, registering a value of zero. CycleGN demonstrates how MLM pre-training can be leveraged to move away from this trivial path and perform actual text translation. As a contribution to the WMT24 challenge, this study explores the efficacy of the CycleGN architectural framework in learning translation tasks across eleven language pairs under the permuted condition and four under the non-intersecting condition. Moreover, two additional language pairs from the previous WMT edition were trained and the evaluations demonstrate the robust adaptability of CycleGN in learning translation tasks.</abstract>
<identifier type="citekey">dreano-etal-2024-cyclegn</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.10</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>165</start>
<end>175</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CycleGN: A Cycle Consistent Approach for Neural Machine Translation
%A Dreano, Sören
%A Molloy, Derek
%A Murphy, Noel
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F dreano-etal-2024-cyclegn
%X CycleGN is a fully self-supervised Neural Machine Translation framework relying on the Transformer architecture that does not require parallel data. Its approach is similar to a Discriminator-less CycleGAN, hence the “non-adversarial” name, specifically tailored for non-parallel text datasets. The foundational concept of our research posits that in an ideal scenario, retro-translations of generated translations should revert to the original source sentences. Consequently, a pair of models can be trained using a Cycle Consistency Loss (CCL) only, with one model translating in one direction and the second model in the opposite direction. In the context of this research, two sub-categories of non-parallel datasets are introduced. A “permuted” dataset is defined as a parallel dataset wherein the sentences of one language have been systematically rearranged. Consequently, this results in a non-parallel corpus where it is guaranteed that each sentence has a corresponding translation located at an unspecified index within the dataset. A “non-intersecting” dataset is a non-parallel dataset for which it is guaranteed that no sentence has an exact translation. Masked Language Modeling (MLM) is a pre-training strategy implemented in BERT, where a specified proportion of the input tokens are substituted with a unique $mask$ token. The objective of the neural network under this paradigm is to accurately reconstruct the original sentence from this degraded input. In inference mode, Transformers are able to generate sentences without labels. Thus, the first step is to generate pseudo-labels in inference, that are then used as labels during training. However, the models consistently converge towards a trivial solution in which the input, the generated pseudo-labels and the output are identical, achieving an optimal outcome on the CCL function, registering a value of zero. CycleGN demonstrates how MLM pre-training can be leveraged to move away from this trivial path and perform actual text translation. As a contribution to the WMT24 challenge, this study explores the efficacy of the CycleGN architectural framework in learning translation tasks across eleven language pairs under the permuted condition and four under the non-intersecting condition. Moreover, two additional language pairs from the previous WMT edition were trained and the evaluations demonstrate the robust adaptability of CycleGN in learning translation tasks.
%U https://aclanthology.org/2024.wmt-1.10
%P 165-175
Markdown (Informal)
[CycleGN: A Cycle Consistent Approach for Neural Machine Translation](https://aclanthology.org/2024.wmt-1.10) (Dreano et al., WMT 2024)
ACL