% ACL Anthology record for a long paper in the ACL 2022 main proceedings.
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{liu-soh-2022-towards,
  title     = {Towards Better Characterization of Paraphrases},
  author    = {Liu, Timothy and
               Soh, De Wen},
  editor    = {Muresan, Smaranda and
               Nakov, Preslav and
               Villavicencio, Aline},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.acl-long.588},
  doi       = {10.18653/v1/2022.acl-long.588},
  pages     = {8592--8601},
  abstract  = {To effectively characterize the nature of paraphrase pairs without expert human annotation, we propose two new metrics: word position deviation (WPD) and lexical deviation (LD). WPD measures the degree of structural alteration, while LD measures the difference in vocabulary used. We apply these metrics to better understand the commonly-used MRPC dataset and study how it differs from PAWS, another paraphrase identification dataset. We also perform a detailed study on MRPC and propose improvements to the dataset, showing that it improves generalizability of models trained on the dataset. Lastly, we apply our metrics to filter the output of a paraphrase generation model and show how it can be used to generate specific forms of paraphrases for data augmentation or robustness testing of NLP models.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single bibliographic record. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- The record ID matches the "citekey" identifier further down. -->
  <mods ID="liu-soh-2022-towards">
    <titleInfo>
      <title>Towards Better Characterization of Paraphrases</title>
    </titleInfo>
    <!-- Authors of the paper, in order. -->
    <name type="personal">
      <namePart type="given">Timothy</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">De</namePart>
      <namePart type="given">Wen</namePart>
      <namePart type="family">Soh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Smaranda</namePart>
        <namePart type="family">Muresan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Aline</namePart>
        <namePart type="family">Villavicencio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>To effectively characterize the nature of paraphrase pairs without expert human annotation, we propose two new metrics: word position deviation (WPD) and lexical deviation (LD). WPD measures the degree of structural alteration, while LD measures the difference in vocabulary used. We apply these metrics to better understand the commonly-used MRPC dataset and study how it differs from PAWS, another paraphrase identification dataset. We also perform a detailed study on MRPC and propose improvements to the dataset, showing that it improves generalizability of models trained on the dataset. Lastly, we apply our metrics to filter the output of a paraphrase generation model and show how it can be used to generate specific forms of paraphrases for data augmentation or robustness testing of NLP models.</abstract>
    <identifier type="citekey">liu-soh-2022-towards</identifier>
    <identifier type="doi">10.18653/v1/2022.acl-long.588</identifier>
    <location>
      <url>https://aclanthology.org/2022.acl-long.588</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>8592</start>
        <end>8601</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Better Characterization of Paraphrases
%A Liu, Timothy
%A Soh, De Wen
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F liu-soh-2022-towards
%X To effectively characterize the nature of paraphrase pairs without expert human annotation, we propose two new metrics: word position deviation (WPD) and lexical deviation (LD). WPD measures the degree of structural alteration, while LD measures the difference in vocabulary used. We apply these metrics to better understand the commonly-used MRPC dataset and study how it differs from PAWS, another paraphrase identification dataset. We also perform a detailed study on MRPC and propose improvements to the dataset, showing that it improves generalizability of models trained on the dataset. Lastly, we apply our metrics to filter the output of a paraphrase generation model and show how it can be used to generate specific forms of paraphrases for data augmentation or robustness testing of NLP models.
%R 10.18653/v1/2022.acl-long.588
%U https://aclanthology.org/2022.acl-long.588
%U https://doi.org/10.18653/v1/2022.acl-long.588
%P 8592-8601
Markdown (Informal)
[Towards Better Characterization of Paraphrases](https://aclanthology.org/2022.acl-long.588) (Liu & Soh, ACL 2022)
ACL
- Timothy Liu and De Wen Soh. 2022. Towards Better Characterization of Paraphrases. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 8592–8601, Dublin, Ireland. Association for Computational Linguistics.