@inproceedings{altakrori-etal-2021-topic-confusion,
title = "The Topic Confusion Task: A Novel Evaluation Scenario for Authorship Attribution",
author = "Altakrori, Malik and
Cheung, Jackie Chi Kit and
Fung, Benjamin C. M.",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.findings-emnlp.359",
doi = "10.18653/v1/2021.findings-emnlp.359",
pages = "4242--4256",
abstract = "Authorship attribution is the problem of identifying the most plausible author of an anonymous text from a set of candidate authors. Researchers have investigated same-topic and cross-topic scenarios of authorship attribution, which differ according to whether new, unseen topics are used in the testing phase. However, neither scenario allows us to explain whether errors are caused by failure to capture authorship writing style or by the topic shift. Motivated by this, we propose the \textit{topic confusion} task where we switch the author-topic configuration between the training and testing sets. This setup allows us to investigate two types of errors: one caused by the topic shift and one caused by the features{'} inability to capture the writing styles. We show that stylometric features with part-of-speech tags are the least susceptible to topic variations. We further show that combining them with other features leads to significantly lower topic confusion and higher attribution accuracy. Finally, we show that pretrained language models such as BERT and RoBERTa perform poorly on this task and are surpassed by simple features such as word-level $n$-gram.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="altakrori-etal-2021-topic-confusion">
<titleInfo>
<title>The Topic Confusion Task: A Novel Evaluation Scenario for Authorship Attribution</title>
</titleInfo>
<name type="personal">
<namePart type="given">Malik</namePart>
<namePart type="family">Altakrori</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackie</namePart>
<namePart type="given">Chi</namePart>
<namePart type="given">Kit</namePart>
<namePart type="family">Cheung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="given">C</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Fung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2021</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Authorship attribution is the problem of identifying the most plausible author of an anonymous text from a set of candidate authors. Researchers have investigated same-topic and cross-topic scenarios of authorship attribution, which differ according to whether new, unseen topics are used in the testing phase. However, neither scenario allows us to explain whether errors are caused by failure to capture authorship writing style or by the topic shift. Motivated by this, we propose the topic confusion task where we switch the author-topic configuration between the training and testing sets. This setup allows us to investigate two types of errors: one caused by the topic shift and one caused by the features’ inability to capture the writing styles. We show that stylometric features with part-of-speech tags are the least susceptible to topic variations. We further show that combining them with other features leads to significantly lower topic confusion and higher attribution accuracy. Finally, we show that pretrained language models such as BERT and RoBERTa perform poorly on this task and are surpassed by simple features such as word-level n-gram.</abstract>
<identifier type="citekey">altakrori-etal-2021-topic-confusion</identifier>
<identifier type="doi">10.18653/v1/2021.findings-emnlp.359</identifier>
<location>
<url>https://aclanthology.org/2021.findings-emnlp.359</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>4242</start>
<end>4256</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Topic Confusion Task: A Novel Evaluation Scenario for Authorship Attribution
%A Altakrori, Malik
%A Cheung, Jackie Chi Kit
%A Fung, Benjamin C. M.
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Findings of the Association for Computational Linguistics: EMNLP 2021
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F altakrori-etal-2021-topic-confusion
%X Authorship attribution is the problem of identifying the most plausible author of an anonymous text from a set of candidate authors. Researchers have investigated same-topic and cross-topic scenarios of authorship attribution, which differ according to whether new, unseen topics are used in the testing phase. However, neither scenario allows us to explain whether errors are caused by failure to capture authorship writing style or by the topic shift. Motivated by this, we propose the topic confusion task where we switch the author-topic configuration between the training and testing sets. This setup allows us to investigate two types of errors: one caused by the topic shift and one caused by the features’ inability to capture the writing styles. We show that stylometric features with part-of-speech tags are the least susceptible to topic variations. We further show that combining them with other features leads to significantly lower topic confusion and higher attribution accuracy. Finally, we show that pretrained language models such as BERT and RoBERTa perform poorly on this task and are surpassed by simple features such as word-level n-gram.
%R 10.18653/v1/2021.findings-emnlp.359
%U https://aclanthology.org/2021.findings-emnlp.359
%U https://doi.org/10.18653/v1/2021.findings-emnlp.359
%P 4242-4256
Markdown (Informal)
[The Topic Confusion Task: A Novel Evaluation Scenario for Authorship Attribution](https://aclanthology.org/2021.findings-emnlp.359) (Altakrori et al., Findings 2021)
ACL