@inproceedings{dong-etal-2021-parasci,
title = "{P}ara{SCI}: A Large Scientific Paraphrase Dataset for Longer Paraphrase Generation",
author = "Dong, Qingxiu and
Wan, Xiaojun and
Cao, Yue",
editor = "Merlo, Paola and
Tiedemann, Jorg and
Tsarfaty, Reut",
booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
month = apr,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eacl-main.33",
doi = "10.18653/v1/2021.eacl-main.33",
pages = "424--434",
abstract = "We propose ParaSCI, the first large-scale paraphrase dataset in the scientific field, including 33,981 paraphrase pairs from ACL (ParaSCI-ACL) and 316,063 pairs from arXiv (ParaSCI-arXiv). Digging into characteristics and common patterns of scientific papers, we construct this dataset though intra-paper and inter-paper methods, such as collecting citations to the same paper or aggregating definitions by scientific terms. To take advantage of sentences paraphrased partially, we put up PDBERT as a general paraphrase discovering method. The major advantages of paraphrases in ParaSCI lie in the prominent length and textual diversity, which is complementary to existing paraphrase datasets. ParaSCI obtains satisfactory results on human evaluation and downstream tasks, especially long paraphrase generation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dong-etal-2021-parasci">
<titleInfo>
<title>ParaSCI: A Large Scientific Paraphrase Dataset for Longer Paraphrase Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Qingxiu</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Merlo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We propose ParaSCI, the first large-scale paraphrase dataset in the scientific field, including 33,981 paraphrase pairs from ACL (ParaSCI-ACL) and 316,063 pairs from arXiv (ParaSCI-arXiv). Digging into characteristics and common patterns of scientific papers, we construct this dataset though intra-paper and inter-paper methods, such as collecting citations to the same paper or aggregating definitions by scientific terms. To take advantage of sentences paraphrased partially, we put up PDBERT as a general paraphrase discovering method. The major advantages of paraphrases in ParaSCI lie in the prominent length and textual diversity, which is complementary to existing paraphrase datasets. ParaSCI obtains satisfactory results on human evaluation and downstream tasks, especially long paraphrase generation.</abstract>
<identifier type="citekey">dong-etal-2021-parasci</identifier>
<identifier type="doi">10.18653/v1/2021.eacl-main.33</identifier>
<location>
<url>https://aclanthology.org/2021.eacl-main.33</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>424</start>
<end>434</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ParaSCI: A Large Scientific Paraphrase Dataset for Longer Paraphrase Generation
%A Dong, Qingxiu
%A Wan, Xiaojun
%A Cao, Yue
%Y Merlo, Paola
%Y Tiedemann, Jorg
%Y Tsarfaty, Reut
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F dong-etal-2021-parasci
%X We propose ParaSCI, the first large-scale paraphrase dataset in the scientific field, including 33,981 paraphrase pairs from ACL (ParaSCI-ACL) and 316,063 pairs from arXiv (ParaSCI-arXiv). Digging into characteristics and common patterns of scientific papers, we construct this dataset though intra-paper and inter-paper methods, such as collecting citations to the same paper or aggregating definitions by scientific terms. To take advantage of sentences paraphrased partially, we put up PDBERT as a general paraphrase discovering method. The major advantages of paraphrases in ParaSCI lie in the prominent length and textual diversity, which is complementary to existing paraphrase datasets. ParaSCI obtains satisfactory results on human evaluation and downstream tasks, especially long paraphrase generation.
%R 10.18653/v1/2021.eacl-main.33
%U https://aclanthology.org/2021.eacl-main.33
%U https://doi.org/10.18653/v1/2021.eacl-main.33
%P 424-434
Markdown (Informal)
[ParaSCI: A Large Scientific Paraphrase Dataset for Longer Paraphrase Generation](https://aclanthology.org/2021.eacl-main.33) (Dong et al., EACL 2021)
ACL