@inproceedings{kumar-etal-2024-scalar,
title = "{SC}a{LAR} at {S}em{E}val-2024 Task 8: Unmasking the machine : Exploring the power of {R}o{BERT}a Ensemble for Detecting Machine Generated Text",
author = "Kumar, Anand and
B, Abhin and
Murali, Sidhaarth",
editor = {Ojha, Atul Kr. and
Do{\u{g}}ru{\"o}z, A. Seza and
Tayyar Madabushi, Harish and
Da San Martino, Giovanni and
Rosenthal, Sara and
Ros{\'a}, Aiala},
booktitle = "Proceedings of the 18th International Workshop on Semantic Evaluation (SemEval-2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.semeval-1.165",
doi = "10.18653/v1/2024.semeval-1.165",
pages = "1135--1139",
abstract = "SemEval SubtaskB, a shared task that is concerned with the detection of text generated by one out of the 5 different models - davinci, bloomz, chatGPT, cohere and dolly. This is an important task considering the boom of generative models in the current day scenario and their ability to draft mails, formal documents, write and qualify exams and many more which keep evolving every passing day. The purpose of classifying text as generated by which pre-trained model helps in analyzing how each of the training data has affected the ability of the model in performing a certain given task. In the proposed approach, data augmentation was done in order to handle lengthier sentences and also labelling them with the same parent label. Upon the augmented data three RoBERTa models were trained on different segments of data which were then ensembled using a voting classifier based on their R2 score to achieve a higher accuracy than the individual models itself. The proposed model achieved an overall validation accuracy of 97.05{\%} and testing accuracy of 76.25{\%}. and our standing was 18th position on the leaderboard.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kumar-etal-2024-scalar">
<titleInfo>
<title>SCaLAR at SemEval-2024 Task 8: Unmasking the machine : Exploring the power of RoBERTa Ensemble for Detecting Machine Generated Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anand</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhin</namePart>
<namePart type="family">B</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sidhaarth</namePart>
<namePart type="family">Murali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th International Workshop on Semantic Evaluation (SemEval-2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">A</namePart>
<namePart type="given">Seza</namePart>
<namePart type="family">Doğruöz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harish</namePart>
<namePart type="family">Tayyar Madabushi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giovanni</namePart>
<namePart type="family">Da San Martino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Rosenthal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aiala</namePart>
<namePart type="family">Rosá</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>SemEval SubtaskB, a shared task that is concerned with the detection of text generated by one out of the 5 different models - davinci, bloomz, chatGPT, cohere and dolly. This is an important task considering the boom of generative models in the current day scenario and their ability to draft mails, formal documents, write and qualify exams and many more which keep evolving every passing day. The purpose of classifying text as generated by which pre-trained model helps in analyzing how each of the training data has affected the ability of the model in performing a certain given task. In the proposed approach, data augmentation was done in order to handle lengthier sentences and also labelling them with the same parent label. Upon the augmented data three RoBERTa models were trained on different segments of data which were then ensembled using a voting classifier based on their R2 score to achieve a higher accuracy than the individual models itself. The proposed model achieved an overall validation accuracy of 97.05% and testing accuracy of 76.25%. and our standing was 18th position on the leaderboard.</abstract>
<identifier type="citekey">kumar-etal-2024-scalar</identifier>
<identifier type="doi">10.18653/v1/2024.semeval-1.165</identifier>
<location>
<url>https://aclanthology.org/2024.semeval-1.165</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>1135</start>
<end>1139</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SCaLAR at SemEval-2024 Task 8: Unmasking the machine : Exploring the power of RoBERTa Ensemble for Detecting Machine Generated Text
%A Kumar, Anand
%A B, Abhin
%A Murali, Sidhaarth
%Y Ojha, Atul Kr.
%Y Doğruöz, A. Seza
%Y Tayyar Madabushi, Harish
%Y Da San Martino, Giovanni
%Y Rosenthal, Sara
%Y Rosá, Aiala
%S Proceedings of the 18th International Workshop on Semantic Evaluation (SemEval-2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F kumar-etal-2024-scalar
%X SemEval SubtaskB, a shared task that is concerned with the detection of text generated by one out of the 5 different models - davinci, bloomz, chatGPT, cohere and dolly. This is an important task considering the boom of generative models in the current day scenario and their ability to draft mails, formal documents, write and qualify exams and many more which keep evolving every passing day. The purpose of classifying text as generated by which pre-trained model helps in analyzing how each of the training data has affected the ability of the model in performing a certain given task. In the proposed approach, data augmentation was done in order to handle lengthier sentences and also labelling them with the same parent label. Upon the augmented data three RoBERTa models were trained on different segments of data which were then ensembled using a voting classifier based on their R2 score to achieve a higher accuracy than the individual models itself. The proposed model achieved an overall validation accuracy of 97.05% and testing accuracy of 76.25%. and our standing was 18th position on the leaderboard.
%R 10.18653/v1/2024.semeval-1.165
%U https://aclanthology.org/2024.semeval-1.165
%U https://doi.org/10.18653/v1/2024.semeval-1.165
%P 1135-1139
Markdown (Informal)
[SCaLAR at SemEval-2024 Task 8: Unmasking the machine : Exploring the power of RoBERTa Ensemble for Detecting Machine Generated Text](https://aclanthology.org/2024.semeval-1.165) (Kumar et al., SemEval 2024)
ACL