@inproceedings{yadagiri-etal-2024-detecting,
title = "Detecting {AI}-Generated Text with Pre-Trained Models Using Linguistic Features",
author = "Yadagiri, Annepaka and
Shree, Lavanya and
Parween, Suraiya and
Raj, Anushka and
Maurya, Shreya and
Pakray, Partha",
editor = "Lalitha Devi, Sobha and
Arora, Karunesh",
booktitle = "Proceedings of the 21st International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2024",
address = "AU-KBC Research Centre, Chennai, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2024.icon-1.21/",
pages = "188--196",
abstract = "The advent of sophisticated large language models, such as ChatGPT and other AI-driven platforms, has led to the generation of text that closely mimics human writing, making it increasingly challenging to discern whether it is human-generated or AI-generated content. This poses significant challenges to content verification, academic integrity, and detecting misleading information. To address these issues, we developed a classification system to differentiate between human-written and AI-generated texts using a diverse HC3-English dataset. This dataset leveraged linguistic analysis and structural features, including part-of-speech tags, vocabulary size, word density, active and passive voice usage, and readability metrics such as Flesch Reading Ease, perplexity, and burstiness. We employed transformer-based and deep-learning models for the classification task, such as CNN{\_}BiLSTM, RNN, BERT, GPT-2, and RoBERTa. Among these, the RoBERTa model demonstrated superior performance, achieving an impressive accuracy of 99.73. These outcomes demonstrate how cutting-edge deep learning methods can maintain information integrity in the digital realm."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yadagiri-etal-2024-detecting">
<titleInfo>
<title>Detecting AI-Generated Text with Pre-Trained Models Using Linguistic Features</title>
</titleInfo>
<name type="personal">
<namePart type="given">Annepaka</namePart>
<namePart type="family">Yadagiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lavanya</namePart>
<namePart type="family">Shree</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suraiya</namePart>
<namePart type="family">Parween</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anushka</namePart>
<namePart type="family">Raj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shreya</namePart>
<namePart type="family">Maurya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Partha</namePart>
<namePart type="family">Pakray</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karunesh</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The advent of sophisticated large language models, such as ChatGPT and other AI-driven platforms, has led to the generation of text that closely mimics human writing, making it increasingly challenging to discern whether it is human-generated or AI-generated content. This poses significant challenges to content verification, academic integrity, and detecting misleading information. To address these issues, we developed a classification system to differentiate between human-written and AI-generated texts using a diverse HC3-English dataset. This dataset leveraged linguistic analysis and structural features, including part-of-speech tags, vocabulary size, word density, active and passive voice usage, and readability metrics such as Flesch Reading Ease, perplexity, and burstiness. We employed transformer-based and deep-learning models for the classification task, such as CNN_BiLSTM, RNN, BERT, GPT-2, and RoBERTa. Among these, the RoBERTa model demonstrated superior performance, achieving an impressive accuracy of 99.73. These outcomes demonstrate how cutting-edge deep learning methods can maintain information integrity in the digital realm.</abstract>
<identifier type="citekey">yadagiri-etal-2024-detecting</identifier>
<location>
<url>https://aclanthology.org/2024.icon-1.21/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>188</start>
<end>196</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Detecting AI-Generated Text with Pre-Trained Models Using Linguistic Features
%A Yadagiri, Annepaka
%A Shree, Lavanya
%A Parween, Suraiya
%A Raj, Anushka
%A Maurya, Shreya
%A Pakray, Partha
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F yadagiri-etal-2024-detecting
%X The advent of sophisticated large language models, such as ChatGPT and other AI-driven platforms, has led to the generation of text that closely mimics human writing, making it increasingly challenging to discern whether it is human-generated or AI-generated content. This poses significant challenges to content verification, academic integrity, and detecting misleading information. To address these issues, we developed a classification system to differentiate between human-written and AI-generated texts using a diverse HC3-English dataset. This dataset leveraged linguistic analysis and structural features, including part-of-speech tags, vocabulary size, word density, active and passive voice usage, and readability metrics such as Flesch Reading Ease, perplexity, and burstiness. We employed transformer-based and deep-learning models for the classification task, such as CNN_BiLSTM, RNN, BERT, GPT-2, and RoBERTa. Among these, the RoBERTa model demonstrated superior performance, achieving an impressive accuracy of 99.73. These outcomes demonstrate how cutting-edge deep learning methods can maintain information integrity in the digital realm.
%U https://aclanthology.org/2024.icon-1.21/
%P 188-196
Markdown (Informal)
[Detecting AI-Generated Text with Pre-Trained Models Using Linguistic Features](https://aclanthology.org/2024.icon-1.21/) (Yadagiri et al., ICON 2024)
ACL