@inproceedings{nguyen-etal-2023-improving,
title = "Improving Long-Text Authorship Verification via Model Selection and Data Tuning",
author = "Nguyen, Trang and
Dagli, Charlie and
Alperin, Kenneth and
Vandam, Courtland and
Singer, Elliot",
editor = "Degaetano-Ortlieb, Stefania and
Kazantseva, Anna and
Reiter, Nils and
Szpakowicz, Stan",
booktitle = "Proceedings of the 7th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.latechclfl-1.4",
doi = "10.18653/v1/2023.latechclfl-1.4",
pages = "28--37",
abstract = "Authorship verification is used to link texts written by the same author without needing a model per author, making it useful to deanonymizing users spreading text with malicious intent. In this work, we evaluated our Cross-Encoder system with four Transformers using differently tuned variants of fanfiction data and found that our BigBird pipeline outperformed Longformer, RoBERTa, and ELECTRA and performed competitively against the official top ranked system from the PAN evaluation. We also examined the effect of authors and fandoms not seen in training on model performance. Through this, we found fandom has the greatest influence on true trials, and that a balanced training dataset in terms of class and fandom performed the most consistently.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nguyen-etal-2023-improving">
<titleInfo>
<title>Improving Long-Text Authorship Verification via Model Selection and Data Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trang</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Charlie</namePart>
<namePart type="family">Dagli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenneth</namePart>
<namePart type="family">Alperin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Courtland</namePart>
<namePart type="family">Vandam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elliot</namePart>
<namePart type="family">Singer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stefania</namePart>
<namePart type="family">Degaetano-Ortlieb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kazantseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nils</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stan</namePart>
<namePart type="family">Szpakowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Authorship verification is used to link texts written by the same author without needing a model per author, making it useful to deanonymizing users spreading text with malicious intent. In this work, we evaluated our Cross-Encoder system with four Transformers using differently tuned variants of fanfiction data and found that our BigBird pipeline outperformed Longformer, RoBERTa, and ELECTRA and performed competitively against the official top ranked system from the PAN evaluation. We also examined the effect of authors and fandoms not seen in training on model performance. Through this, we found fandom has the greatest influence on true trials, and that a balanced training dataset in terms of class and fandom performed the most consistently.</abstract>
<identifier type="citekey">nguyen-etal-2023-improving</identifier>
<identifier type="doi">10.18653/v1/2023.latechclfl-1.4</identifier>
<location>
<url>https://aclanthology.org/2023.latechclfl-1.4</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>28</start>
<end>37</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Long-Text Authorship Verification via Model Selection and Data Tuning
%A Nguyen, Trang
%A Dagli, Charlie
%A Alperin, Kenneth
%A Vandam, Courtland
%A Singer, Elliot
%Y Degaetano-Ortlieb, Stefania
%Y Kazantseva, Anna
%Y Reiter, Nils
%Y Szpakowicz, Stan
%S Proceedings of the 7th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F nguyen-etal-2023-improving
%X Authorship verification is used to link texts written by the same author without needing a model per author, making it useful to deanonymizing users spreading text with malicious intent. In this work, we evaluated our Cross-Encoder system with four Transformers using differently tuned variants of fanfiction data and found that our BigBird pipeline outperformed Longformer, RoBERTa, and ELECTRA and performed competitively against the official top ranked system from the PAN evaluation. We also examined the effect of authors and fandoms not seen in training on model performance. Through this, we found fandom has the greatest influence on true trials, and that a balanced training dataset in terms of class and fandom performed the most consistently.
%R 10.18653/v1/2023.latechclfl-1.4
%U https://aclanthology.org/2023.latechclfl-1.4
%U https://doi.org/10.18653/v1/2023.latechclfl-1.4
%P 28-37
Markdown (Informal)
[Improving Long-Text Authorship Verification via Model Selection and Data Tuning](https://aclanthology.org/2023.latechclfl-1.4) (Nguyen et al., LaTeCHCLfL 2023)
ACL