@inproceedings{thomas-etal-2024-simple,
title = "Simple models are all you need: Ensembling stylometric, part-of-speech, and information-theoretic models for the {ALTA} 2024 Shared Task",
author = "Thomas, Joel and
Hoang, Gia Bao and
Mitchell, Lewis",
editor = "Baldwin, Tim and
Rodr{\'i}guez M{\'e}ndez, Sergio Jos{\'e} and
Kuo, Nicholas",
booktitle = "Proceedings of the 22nd Annual Workshop of the Australasian Language Technology Association",
month = dec,
year = "2024",
address = "Canberra, Australia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.alta-1.19/",
pages = "207--212",
abstract = "The ALTA 2024 shared task concerned automated detection of AI-generated text. Large language models (LLM) were used to generate hybrid documents, where individual sentences were authored by either humans or a state-of-the-art LLM. Rather than rely on similarly computationally expensive tools like transformer-based methods, we decided to approach this task using only an ensemble of lightweight {\textquotedblleft}traditional{\textquotedblright} methods that could be trained on a standard desktop machine. Our approach used models based on word counts, stylometric features, readability metrics, part-of-speech tagging, and an information-theoretic entropy estimator to predict authorship. These models, combined with a simple weighting scheme, performed well on a held-out test set, achieving an accuracy of 0.855 and a kappa score of 0.695. Our results show that relatively simple, interpretable models can perform effectively at tasks like authorship prediction, even on short texts, which is important for democratisation of AI as well as future applications in edge computing."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="thomas-etal-2024-simple">
    <titleInfo>
      <title>Simple models are all you need: Ensembling stylometric, part-of-speech, and information-theoretic models for the ALTA 2024 Shared Task</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Joel</namePart>
      <namePart type="family">Thomas</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gia</namePart>
      <namePart type="given">Bao</namePart>
      <namePart type="family">Hoang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lewis</namePart>
      <namePart type="family">Mitchell</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 22nd Annual Workshop of the Australasian Language Technology Association</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Tim</namePart>
        <namePart type="family">Baldwin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sergio</namePart>
        <namePart type="given">José</namePart>
        <namePart type="family">Rodríguez Méndez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nicholas</namePart>
        <namePart type="family">Kuo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Canberra, Australia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The ALTA 2024 shared task concerned automated detection of AI-generated text. Large language models (LLMs) were used to generate hybrid documents, where individual sentences were authored by either humans or a state-of-the-art LLM. Rather than rely on similarly computationally expensive tools like transformer-based methods, we decided to approach this task using only an ensemble of lightweight “traditional” methods that could be trained on a standard desktop machine. Our approach used models based on word counts, stylometric features, readability metrics, part-of-speech tagging, and an information-theoretic entropy estimator to predict authorship. These models, combined with a simple weighting scheme, performed well on a held-out test set, achieving an accuracy of 0.855 and a kappa score of 0.695. Our results show that relatively simple, interpretable models can perform effectively at tasks like authorship prediction, even on short texts, which is important for democratisation of AI as well as future applications in edge computing.</abstract>
    <identifier type="citekey">thomas-etal-2024-simple</identifier>
    <location>
      <url>https://aclanthology.org/2024.alta-1.19/</url>
    </location>
    <part>
      <date>2024-12</date>
      <extent unit="page">
        <start>207</start>
        <end>212</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Simple models are all you need: Ensembling stylometric, part-of-speech, and information-theoretic models for the ALTA 2024 Shared Task
%A Thomas, Joel
%A Hoang, Gia Bao
%A Mitchell, Lewis
%Y Baldwin, Tim
%Y Rodríguez Méndez, Sergio José
%Y Kuo, Nicholas
%S Proceedings of the 22nd Annual Workshop of the Australasian Language Technology Association
%D 2024
%8 December
%I Association for Computational Linguistics
%C Canberra, Australia
%F thomas-etal-2024-simple
%X The ALTA 2024 shared task concerned automated detection of AI-generated text. Large language models (LLMs) were used to generate hybrid documents, where individual sentences were authored by either humans or a state-of-the-art LLM. Rather than rely on similarly computationally expensive tools like transformer-based methods, we decided to approach this task using only an ensemble of lightweight “traditional” methods that could be trained on a standard desktop machine. Our approach used models based on word counts, stylometric features, readability metrics, part-of-speech tagging, and an information-theoretic entropy estimator to predict authorship. These models, combined with a simple weighting scheme, performed well on a held-out test set, achieving an accuracy of 0.855 and a kappa score of 0.695. Our results show that relatively simple, interpretable models can perform effectively at tasks like authorship prediction, even on short texts, which is important for democratisation of AI as well as future applications in edge computing.
%U https://aclanthology.org/2024.alta-1.19/
%P 207-212
Markdown (Informal)
[Simple models are all you need: Ensembling stylometric, part-of-speech, and information-theoretic models for the ALTA 2024 Shared Task](https://aclanthology.org/2024.alta-1.19/) (Thomas et al., ALTA 2024)
ACL
Joel Thomas, Gia Bao Hoang, and Lewis Mitchell. 2024. Simple models are all you need: Ensembling stylometric, part-of-speech, and information-theoretic models for the ALTA 2024 Shared Task. In Proceedings of the 22nd Annual Workshop of the Australasian Language Technology Association, pages 207–212, Canberra, Australia. Association for Computational Linguistics.
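The abstract above describes the method only at a high level: several cheap per-sentence scorers (word counts, stylometric features, readability metrics, part-of-speech tags, an entropy estimator) combined by a simple weighting scheme. The sketch below is a minimal illustration of that general idea, not the authors' implementation: the three toy scorers, their directions, and the weights are all placeholder assumptions introduced here for clarity.

```python
# Illustrative weighted ensemble of simple per-sentence scorers for
# human-vs-AI classification. Scorers and weights are placeholders,
# NOT the models or weighting scheme from Thomas et al. (2024).
import math
import re

def word_count_score(sentence: str) -> float:
    """Toy length-based scorer; returns a pseudo-probability of 'AI'."""
    n = len(sentence.split())
    return 1.0 / (1.0 + math.exp(-(n - 20) / 5.0))  # longer -> higher score

def type_token_score(sentence: str) -> float:
    """Toy stylometric proxy: type-token ratio (lexical diversity)."""
    tokens = re.findall(r"[a-z']+", sentence.lower())
    if not tokens:
        return 0.5
    diversity = len(set(tokens)) / len(tokens)
    return 1.0 - diversity  # direction is an assumption, for illustration only

def entropy_score(sentence: str) -> float:
    """Toy information-theoretic scorer: character-level Shannon entropy."""
    counts: dict[str, int] = {}
    for ch in sentence:
        counts[ch] = counts.get(ch, 0) + 1
    total = sum(counts.values())
    if total == 0:
        return 0.5
    h = -sum((c / total) * math.log2(c / total) for c in counts.values())
    return 1.0 / (1.0 + math.exp(h - 4.0))  # squash to (0, 1); offset is arbitrary

# Hypothetical weights; the paper says only that the scheme was "simple".
ENSEMBLE = [(word_count_score, 0.4), (type_token_score, 0.3), (entropy_score, 0.3)]

def predict(sentence: str) -> int:
    """Return 1 ('AI') if the weighted sum of scores crosses 0.5, else 0."""
    p_ai = sum(weight * scorer(sentence) for scorer, weight in ENSEMBLE)
    return int(p_ai >= 0.5)

if __name__ == "__main__":
    for s in [
        "The cat sat on the mat.",
        "Furthermore, the proposed methodology demonstrates considerable "
        "efficacy across a diverse range of evaluation scenarios.",
    ]:
        print(predict(s), "-", s)
```

The appeal of this shape, and the point the paper's title makes, is that every component trains and runs on a standard desktop machine and each scorer's contribution to a decision is directly inspectable.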