% Conference paper in Findings of the ACL: EMNLP 2022. The coined system name
% in the title is brace-protected as a whole word so that sentence-casing
% styles keep its capitalisation without splitting the word for kerning.
@inproceedings{liu-etal-2022-paramac,
    title     = {{ParaMac}: A General Unsupervised Paraphrase Generation Framework Leveraging Semantic Constraints and Diversifying Mechanisms},
    author    = {Liu, Jinxin and
                 Shi, Jiaxin and
                 Qi, Ji and
                 Hou, Lei and
                 Li, Juanzi and
                 Tian, Qi},
    editor    = {Goldberg, Yoav and
                 Kozareva, Zornitsa and
                 Zhang, Yue},
    booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2022},
    month     = dec,
    year      = {2022},
    address   = {Abu Dhabi, United Arab Emirates},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2022.findings-emnlp.461/},
    doi       = {10.18653/v1/2022.findings-emnlp.461},
    pages     = {6193--6206},
    abstract  = {Paraphrase generation reflects the ability to understand the meaning from the language surface form and rephrase it to other expressions. Recent paraphrase generation works have paid attention to unsupervised approaches based on Pre-trained Language Models (PLMs) to avoid heavy reliance on parallel data by utilizing PLMs' generation ability. However, the generated pairs of existing unsupervised methods are usually weak either in semantic equivalence or expression diversity. In this paper, we present a novel unsupervised paraphrase generation framework called Paraphrase Machine. By employing multi-aspect equivalence constraints and multi-granularity diversifying mechanisms, Paraphrase Machine is able to achieve good semantic equivalence and expressive diversity, producing a high-quality unsupervised paraphrase dataset. Based on this dataset, we train a general paraphrase model, which can be directly applied to rewrite the input sentence of various domains without any fine-tuning, and achieves substantial gains of 9.1{\%} and 3.3{\%} absolutely in BLEU score over previous SOTA on Quora and MSCOCO. By further fine-tuning our model with domain-specific training sets, the improvement can be increased to even 18.0{\%} and 4.6{\%}. Most importantly, by applying it to language understanding and generation tasks under the low-resource setting, we demonstrate that our model can serve as a universal data augmentor to boost the few-shot performance (e.g., average 2.0{\%} gain on GLUE).},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, ID liu-etal-2022-paramac. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2022-paramac">
<!-- Title of the paper itself (the host volume's title is inside relatedItem below). -->
<titleInfo>
<title>ParaMac: A General Unsupervised Paraphrase Generation Framework Leveraging Semantic Constraints and Diversifying Mechanisms</title>
</titleInfo>
<!-- Authors: one personal <name> per author, each with given/family parts and an "author" role. -->
<name type="personal">
<namePart type="given">Jinxin</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaxin</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ji</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juanzi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given to month precision. -->
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host publication: the proceedings volume, with its editors, publisher, place and genre. -->
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Paraphrase generation reflects the ability to understand the meaning from the language surface form and rephrase it to other expressions. Recent paraphrase generation works have paid attention to unsupervised approaches based on Pre-trained Language Models (PLMs) to avoid heavy reliance on parallel data by utilizing PLMs’ generation ability. However, the generated pairs of existing unsupervised methods are usually weak either in semantic equivalence or expression diversity. In this paper, we present a novel unsupervised paraphrase generation framework called Paraphrase Machine. By employing multi-aspect equivalence constraints and multi-granularity diversifying mechanisms, Paraphrase Machine is able to achieve good semantic equivalence and expressive diversity, producing a high-quality unsupervised paraphrase dataset. Based on this dataset, we train a general paraphrase model, which can be directly applied to rewrite the input sentence of various domains without any fine-tuning, and achieves substantial gains of 9.1% and 3.3% absolutely in BLEU score over previous SOTA on Quora and MSCOCO. By further fine-tuning our model with domain-specific training sets, the improvement can be increased to even 18.0% and 4.6%. Most importantly, by applying it to language understanding and generation tasks under the low-resource setting, we demonstrate that our model can serve as a universal data augmentor to boost the few-shot performance (e.g., average 2.0% gain on GLUE).</abstract>
<!-- Identifiers: the citation key (same value as the mods ID attribute) and the DOI, followed by the landing-page URL. -->
<identifier type="citekey">liu-etal-2022-paramac</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.461</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.461/</url>
</location>
<!-- Date and first/last page of the paper. -->
<part>
<date>2022-12</date>
<extent unit="page">
<start>6193</start>
<end>6206</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ParaMac: A General Unsupervised Paraphrase Generation Framework Leveraging Semantic Constraints and Diversifying Mechanisms
%A Liu, Jinxin
%A Shi, Jiaxin
%A Qi, Ji
%A Hou, Lei
%A Li, Juanzi
%A Tian, Qi
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F liu-etal-2022-paramac
%X Paraphrase generation reflects the ability to understand the meaning from the language surface form and rephrase it to other expressions. Recent paraphrase generation works have paid attention to unsupervised approaches based on Pre-trained Language Models (PLMs) to avoid heavy reliance on parallel data by utilizing PLMs’ generation ability. However, the generated pairs of existing unsupervised methods are usually weak either in semantic equivalence or expression diversity. In this paper, we present a novel unsupervised paraphrase generation framework called Paraphrase Machine. By employing multi-aspect equivalence constraints and multi-granularity diversifying mechanisms, Paraphrase Machine is able to achieve good semantic equivalence and expressive diversity, producing a high-quality unsupervised paraphrase dataset. Based on this dataset, we train a general paraphrase model, which can be directly applied to rewrite the input sentence of various domains without any fine-tuning, and achieves substantial gains of 9.1% and 3.3% absolutely in BLEU score over previous SOTA on Quora and MSCOCO. By further fine-tuning our model with domain-specific training sets, the improvement can be increased to even 18.0% and 4.6%. Most importantly, by applying it to language understanding and generation tasks under the low-resource setting, we demonstrate that our model can serve as a universal data augmentor to boost the few-shot performance (e.g., average 2.0% gain on GLUE).
%R 10.18653/v1/2022.findings-emnlp.461
%U https://aclanthology.org/2022.findings-emnlp.461/
%U https://doi.org/10.18653/v1/2022.findings-emnlp.461
%P 6193-6206
Markdown (Informal)
[ParaMac: A General Unsupervised Paraphrase Generation Framework Leveraging Semantic Constraints and Diversifying Mechanisms](https://aclanthology.org/2022.findings-emnlp.461/) (Liu et al., Findings 2022)
ACL