@inproceedings{maclaughlin-etal-2021-recovering,
title = "Recovering Lexically and Semantically Reused Texts",
author = "MacLaughlin, Ansel and
Xu, Shaobin and
Smith, David A.",
editor = "Ku, Lun-Wei and
Nastase, Vivi and
Vuli{\'c}, Ivan",
booktitle = "Proceedings of *SEM 2021: The Tenth Joint Conference on Lexical and Computational Semantics",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.starsem-1.5",
doi = "10.18653/v1/2021.starsem-1.5",
pages = "52--66",
abstract = "Writers often repurpose material from existing texts when composing new documents. Because most documents have more than one source, we cannot trace these connections using only models of document-level similarity. Instead, this paper considers methods for local text reuse detection (LTRD), detecting localized regions of lexically or semantically similar text embedded in otherwise unrelated material. In extensive experiments, we study the relative performance of four classes of neural and bag-of-words models on three LTRD tasks {--} detecting plagiarism, modeling journalists{'} use of press releases, and identifying scientists{'} citation of earlier papers. We conduct evaluations on three existing datasets and a new, publicly-available citation localization dataset. Our findings shed light on a number of previously-unexplored questions in the study of LTRD, including the importance of incorporating document-level context for predictions, the applicability of of-the-shelf neural models pretrained on {``}general{''} semantic textual similarity tasks such as paraphrase detection, and the trade-offs between more efficient bag-of-words and feature-based neural models and slower pairwise neural models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="maclaughlin-etal-2021-recovering">
    <titleInfo>
      <title>Recovering Lexically and Semantically Reused Texts</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ansel</namePart>
      <namePart type="family">MacLaughlin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shaobin</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="given">A</namePart>
      <namePart type="family">Smith</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of *SEM 2021: The Tenth Joint Conference on Lexical and Computational Semantics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivi</namePart>
        <namePart type="family">Nastase</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ivan</namePart>
        <namePart type="family">Vulić</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Writers often repurpose material from existing texts when composing new documents. Because most documents have more than one source, we cannot trace these connections using only models of document-level similarity. Instead, this paper considers methods for local text reuse detection (LTRD), detecting localized regions of lexically or semantically similar text embedded in otherwise unrelated material. In extensive experiments, we study the relative performance of four classes of neural and bag-of-words models on three LTRD tasks – detecting plagiarism, modeling journalists’ use of press releases, and identifying scientists’ citation of earlier papers. We conduct evaluations on three existing datasets and a new, publicly-available citation localization dataset. Our findings shed light on a number of previously-unexplored questions in the study of LTRD, including the importance of incorporating document-level context for predictions, the applicability of off-the-shelf neural models pretrained on “general” semantic textual similarity tasks such as paraphrase detection, and the trade-offs between more efficient bag-of-words and feature-based neural models and slower pairwise neural models.</abstract>
<identifier type="citekey">maclaughlin-etal-2021-recovering</identifier>
<identifier type="doi">10.18653/v1/2021.starsem-1.5</identifier>
<location>
<url>https://aclanthology.org/2021.starsem-1.5</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>52</start>
<end>66</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Recovering Lexically and Semantically Reused Texts
%A MacLaughlin, Ansel
%A Xu, Shaobin
%A Smith, David A.
%Y Ku, Lun-Wei
%Y Nastase, Vivi
%Y Vulić, Ivan
%S Proceedings of *SEM 2021: The Tenth Joint Conference on Lexical and Computational Semantics
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F maclaughlin-etal-2021-recovering
%X Writers often repurpose material from existing texts when composing new documents. Because most documents have more than one source, we cannot trace these connections using only models of document-level similarity. Instead, this paper considers methods for local text reuse detection (LTRD), detecting localized regions of lexically or semantically similar text embedded in otherwise unrelated material. In extensive experiments, we study the relative performance of four classes of neural and bag-of-words models on three LTRD tasks – detecting plagiarism, modeling journalists’ use of press releases, and identifying scientists’ citation of earlier papers. We conduct evaluations on three existing datasets and a new, publicly-available citation localization dataset. Our findings shed light on a number of previously-unexplored questions in the study of LTRD, including the importance of incorporating document-level context for predictions, the applicability of off-the-shelf neural models pretrained on “general” semantic textual similarity tasks such as paraphrase detection, and the trade-offs between more efficient bag-of-words and feature-based neural models and slower pairwise neural models.
%R 10.18653/v1/2021.starsem-1.5
%U https://aclanthology.org/2021.starsem-1.5
%U https://doi.org/10.18653/v1/2021.starsem-1.5
%P 52-66
Markdown (Informal)
[Recovering Lexically and Semantically Reused Texts](https://aclanthology.org/2021.starsem-1.5) (MacLaughlin et al., *SEM 2021)

ACL
Ansel MacLaughlin, Shaobin Xu, and David A. Smith. 2021. Recovering Lexically and Semantically Reused Texts. In Proceedings of *SEM 2021: The Tenth Joint Conference on Lexical and Computational Semantics, pages 52–66, Online. Association for Computational Linguistics.