@inproceedings{ormazabal-etal-2022-principled,
title = "Principled Paraphrase Generation with Parallel Corpora",
author = "Ormazabal, Aitor and
Artetxe, Mikel and
Soroa, Aitor and
Labaka, Gorka and
Agirre, Eneko",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-long.114/",
doi = "10.18653/v1/2022.acl-long.114",
pages = "1621--1638",
abstract = "Round-trip Machine Translation (MT) is a popular choice for paraphrase generation, which leverages readily available parallel corpora for supervision. In this paper, we formalize the implicit similarity function induced by this approach, and show that it is susceptible to non-paraphrase pairs sharing a single ambiguous translation. Based on these insights, we design an alternative similarity metric that mitigates this issue by requiring the entire translation distribution to match, and implement a relaxation of it through the Information Bottleneck method. Our approach incorporates an adversarial term into MT training in order to learn representations that encode as much information about the reference translation as possible, while keeping as little information about the input as possible. Paraphrases can be generated by decoding back to the source from this representation, without having to generate pivot translations. In addition to being more principled and efficient than round-trip MT, our approach offers an adjustable parameter to control the fidelity-diversity trade-off, and obtains better results in our experiments."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ormazabal-etal-2022-principled">
<titleInfo>
<title>Principled Paraphrase Generation with Parallel Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aitor</namePart>
<namePart type="family">Ormazabal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="family">Artetxe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aitor</namePart>
<namePart type="family">Soroa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gorka</namePart>
<namePart type="family">Labaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eneko</namePart>
<namePart type="family">Agirre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Round-trip Machine Translation (MT) is a popular choice for paraphrase generation, which leverages readily available parallel corpora for supervision. In this paper, we formalize the implicit similarity function induced by this approach, and show that it is susceptible to non-paraphrase pairs sharing a single ambiguous translation. Based on these insights, we design an alternative similarity metric that mitigates this issue by requiring the entire translation distribution to match, and implement a relaxation of it through the Information Bottleneck method. Our approach incorporates an adversarial term into MT training in order to learn representations that encode as much information about the reference translation as possible, while keeping as little information about the input as possible. Paraphrases can be generated by decoding back to the source from this representation, without having to generate pivot translations. In addition to being more principled and efficient than round-trip MT, our approach offers an adjustable parameter to control the fidelity-diversity trade-off, and obtains better results in our experiments.</abstract>
<identifier type="citekey">ormazabal-etal-2022-principled</identifier>
<identifier type="doi">10.18653/v1/2022.acl-long.114</identifier>
<location>
<url>https://aclanthology.org/2022.acl-long.114/</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>1621</start>
<end>1638</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Principled Paraphrase Generation with Parallel Corpora
%A Ormazabal, Aitor
%A Artetxe, Mikel
%A Soroa, Aitor
%A Labaka, Gorka
%A Agirre, Eneko
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F ormazabal-etal-2022-principled
%X Round-trip Machine Translation (MT) is a popular choice for paraphrase generation, which leverages readily available parallel corpora for supervision. In this paper, we formalize the implicit similarity function induced by this approach, and show that it is susceptible to non-paraphrase pairs sharing a single ambiguous translation. Based on these insights, we design an alternative similarity metric that mitigates this issue by requiring the entire translation distribution to match, and implement a relaxation of it through the Information Bottleneck method. Our approach incorporates an adversarial term into MT training in order to learn representations that encode as much information about the reference translation as possible, while keeping as little information about the input as possible. Paraphrases can be generated by decoding back to the source from this representation, without having to generate pivot translations. In addition to being more principled and efficient than round-trip MT, our approach offers an adjustable parameter to control the fidelity-diversity trade-off, and obtains better results in our experiments.
%R 10.18653/v1/2022.acl-long.114
%U https://aclanthology.org/2022.acl-long.114/
%U https://doi.org/10.18653/v1/2022.acl-long.114
%P 1621-1638
Markdown (Informal)
[Principled Paraphrase Generation with Parallel Corpora](https://aclanthology.org/2022.acl-long.114/) (Ormazabal et al., ACL 2022)
ACL
- Aitor Ormazabal, Mikel Artetxe, Aitor Soroa, Gorka Labaka, and Eneko Agirre. 2022. Principled Paraphrase Generation with Parallel Corpora. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1621–1638, Dublin, Ireland. Association for Computational Linguistics.