% Conference paper record (aclanthology.org, paper 2024.icon-1.46).
% Braced words in the title keep their capitalisation when a bibliography
% style applies sentence casing; whole words are braced rather than single
% letters so kerning and hyphenation are not disturbed.
@inproceedings{kumar-etal-2024-empowering,
  title     = {Empowering {SW} Security: {CodeBERT} and Machine Learning Approaches to Vulnerability Detection},
  author    = {Kumar, Lov and
               Singh, Vikram and
               Patel, Srivalli and
               Mishra, Pratyush},
  editor    = {Lalitha Devi, Sobha and
               Arora, Karunesh},
  booktitle = {Proceedings of the 21st International Conference on Natural Language Processing (ICON)},
  month     = dec,
  year      = {2024},
  address   = {AU-KBC Research Centre, Chennai, India},
  publisher = {NLP Association of India (NLPAI)},
  url       = {https://aclanthology.org/2024.icon-1.46/},
  pages     = {399--407},
  abstract  = {Software (SW) systems experience faults after deployment, raising concerns about reliability and leading to financial losses, reputational damage, and safety risks. This paper presents a novel approach using CodeBERT, a state-of-the-art neural code representation model pre-trained in multi-programming languages and employs various code metrics to predict SW faults. The study comprehensively evaluates trained models by analyzing publicly available codebase and employing diverse machine learning models, feature selection techniques, and class balancing through SMOTE. The results show that SMOTE significantly enhances vulnerability detection performance, particularly in accuracy, AUC, sensitivity, and specificity. The EXTR classifier consistently outperforms others, with an average AUC of 0.82, and the features selected using the GA feature selection technique, despite achieving a mean AUC of 0.84. Interestingly, among employed embedding techniques, SW metrics combined with CodeBERT (SMCBERT) stand out as top performers, achieving the highest mean AUC score of 0.80, making models trained on SMCBERT the best for SW vulnerability prediction.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, ID "kumar-etal-2024-empowering". -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kumar-etal-2024-empowering">
<!-- Title of the paper itself (the containing proceedings is described under relatedItem). -->
<titleInfo>
<title>Empowering SW Security: CodeBERT and Machine Learning Approaches to Vulnerability Detection</title>
</titleInfo>
<!-- Paper authors: one personal <name> per author, each with role "author". -->
<name type="personal">
<namePart type="given">Lov</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vikram</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Srivalli</namePart>
<namePart type="family">Patel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pratyush</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year-month. -->
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume, with its editors, publisher, place and genre. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karunesh</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<!-- Abstract text as supplied with the record; kept verbatim. -->
<abstract>Software (SW) systems experience faults after deployment, raising concerns about reliability and leading to financial losses, reputational damage, and safety risks. This paper presents a novel approach using CodeBERT, a state-of-the-art neural code representation model pre-trained in multi-programming languages and employs various code metrics to predict SW faults. The study comprehensively evaluates trained models by analyzing publicly available codebase and employing diverse machine learning models, feature selection techniques, and class balancing through SMOTE. The results show that SMOTE significantly enhances vulnerability detection performance, particularly in accuracy, AUC, sensitivity, and specificity. The EXTR classifier consistently outperforms others, with an average AUC of 0.82, and the features selected using the GA feature selection technique, despite achieving a mean AUC of 0.84. Interestingly, among employed embedding techniques, SW metrics combined with CodeBERT (SMCBERT) stand out as top performers, achieving the highest mean AUC score of 0.80, making models trained on SMCBERT the best for SW vulnerability prediction.</abstract>
<!-- Citation key; same value as the ID attribute on <mods>. -->
<identifier type="citekey">kumar-etal-2024-empowering</identifier>
<location>
<url>https://aclanthology.org/2024.icon-1.46/</url>
</location>
<!-- Page range of the paper (399 to 407) and the part date. -->
<part>
<date>2024-12</date>
<extent unit="page">
<start>399</start>
<end>407</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Empowering SW Security: CodeBERT and Machine Learning Approaches to Vulnerability Detection
%A Kumar, Lov
%A Singh, Vikram
%A Patel, Srivalli
%A Mishra, Pratyush
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F kumar-etal-2024-empowering
%X Software (SW) systems experience faults after deployment, raising concerns about reliability and leading to financial losses, reputational damage, and safety risks. This paper presents a novel approach using CodeBERT, a state-of-the-art neural code representation model pre-trained in multi-programming languages and employs various code metrics to predict SW faults. The study comprehensively evaluates trained models by analyzing publicly available codebase and employing diverse machine learning models, feature selection techniques, and class balancing through SMOTE. The results show that SMOTE significantly enhances vulnerability detection performance, particularly in accuracy, AUC, sensitivity, and specificity. The EXTR classifier consistently outperforms others, with an average AUC of 0.82, and the features selected using the GA feature selection technique, despite achieving a mean AUC of 0.84. Interestingly, among employed embedding techniques, SW metrics combined with CodeBERT (SMCBERT) stand out as top performers, achieving the highest mean AUC score of 0.80, making models trained on SMCBERT the best for SW vulnerability prediction.
%U https://aclanthology.org/2024.icon-1.46/
%P 399-407
Markdown (Informal)
[Empowering SW Security: CodeBERT and Machine Learning Approaches to Vulnerability Detection](https://aclanthology.org/2024.icon-1.46/) (Kumar et al., ICON 2024)
ACL