@inproceedings{diaz-etal-2024-spamclus,
title = "{S}pam{C}lus: An Agglomerative Clustering Algorithm for Spam Email Campaigns Detection",
author = "D{\'i}az, Daniel and
Al-Nabki, Wesam and
Fern{\'a}ndez-Robles, Laura and
Alegre, Enrique and
Fidalgo, Eduardo and
Mart{\'i}nez-Mendoza, Alicia",
editor = "Mitkov, Ruslan and
Ezzini, Saad and
Ranasinghe, Tharindu and
Ezeani, Ignatius and
Khallaf, Nouran and
Acarturk, Cengiz and
Bradbury, Matthew and
El-Haj, Mo and
Rayson, Paul",
booktitle = "Proceedings of the First International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security",
month = jul,
year = "2024",
address = "Lancaster, UK",
publisher = "International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security",
url = "https://aclanthology.org/2024.nlpaics-1.8/",
pages = "64--69",
abstract = "Spam emails constitute a significant proportion of emails received by users, and can result in financial losses or in the download of malware on the victim{'}s device. Cyberattackers create spam campaigns to deliver spam messages on a large scale and benefit from the low economic investment and anonymity required to create the attacks. In addition to spam filters, raising awareness about active email scams is a relevant measure that helps mitigate the consequences of spam. Therefore, detecting campaigns becomes a relevant task in identifying and alerting the targets of spam. In this paper, we propose an unsupervised learning algorithm, SpamClus{\_}1, an iterative algorithm that groups spam email campaigns using agglomerative clustering. The measures employed to determine the clusters are the minimum number of samples and minimum percentage of similarity within a cluster. Evaluating SpamClus{\_}1 on a set of emails provided by the Spanish National Cybersecurity Institute (INCIBE), we found that the optimal values are 50 minimum samples and a minimum cosine similarity of 0.8. The clustering results show 19 spam datasets with 3048 spam samples out of 6702 emails from a range of three consecutive days and eight spam clusters with 870 spam samples out of 1469 emails from one day."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="diaz-etal-2024-spamclus">
<titleInfo>
<title>SpamClus: An Agglomerative Clustering Algorithm for Spam Email Campaigns Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Díaz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wesam</namePart>
<namePart type="family">Al-Nabki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Fernández-Robles</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrique</namePart>
<namePart type="family">Alegre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eduardo</namePart>
<namePart type="family">Fidalgo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alicia</namePart>
<namePart type="family">Martínez-Mendoza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saad</namePart>
<namePart type="family">Ezzini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ignatius</namePart>
<namePart type="family">Ezeani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nouran</namePart>
<namePart type="family">Khallaf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cengiz</namePart>
<namePart type="family">Acarturk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Bradbury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mo</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security</publisher>
<place>
<placeTerm type="text">Lancaster, UK</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Spam emails constitute a significant proportion of emails received by users, and can result in financial losses or in the download of malware on the victim’s device. Cyberattackers create spam campaigns to deliver spam messages on a large scale and benefit from the low economic investment and anonymity required to create the attacks. In addition to spam filters, raising awareness about active email scams is a relevant measure that helps mitigate the consequences of spam. Therefore, detecting campaigns becomes a relevant task in identifying and alerting the targets of spam. In this paper, we propose an unsupervised learning algorithm, SpamClus_1, an iterative algorithm that groups spam email campaigns using agglomerative clustering. The measures employed to determine the clusters are the minimum number of samples and minimum percentage of similarity within a cluster. Evaluating SpamClus_1 on a set of emails provided by the Spanish National Cybersecurity Institute (INCIBE), we found that the optimal values are 50 minimum samples and a minimum cosine similarity of 0.8. The clustering results show 19 spam datasets with 3048 spam samples out of 6702 emails from a range of three consecutive days and eight spam clusters with 870 spam samples out of 1469 emails from one day.</abstract>
<identifier type="citekey">diaz-etal-2024-spamclus</identifier>
<location>
<url>https://aclanthology.org/2024.nlpaics-1.8/</url>
</location>
<part>
<date>2024-07</date>
<extent unit="page">
<start>64</start>
<end>69</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SpamClus: An Agglomerative Clustering Algorithm for Spam Email Campaigns Detection
%A Díaz, Daniel
%A Al-Nabki, Wesam
%A Fernández-Robles, Laura
%A Alegre, Enrique
%A Fidalgo, Eduardo
%A Martínez-Mendoza, Alicia
%Y Mitkov, Ruslan
%Y Ezzini, Saad
%Y Ranasinghe, Tharindu
%Y Ezeani, Ignatius
%Y Khallaf, Nouran
%Y Acarturk, Cengiz
%Y Bradbury, Matthew
%Y El-Haj, Mo
%Y Rayson, Paul
%S Proceedings of the First International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security
%D 2024
%8 July
%I International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security
%C Lancaster, UK
%F diaz-etal-2024-spamclus
%X Spam emails constitute a significant proportion of emails received by users, and can result in financial losses or in the download of malware on the victim’s device. Cyberattackers create spam campaigns to deliver spam messages on a large scale and benefit from the low economic investment and anonymity required to create the attacks. In addition to spam filters, raising awareness about active email scams is a relevant measure that helps mitigate the consequences of spam. Therefore, detecting campaigns becomes a relevant task in identifying and alerting the targets of spam. In this paper, we propose an unsupervised learning algorithm, SpamClus_1, an iterative algorithm that groups spam email campaigns using agglomerative clustering. The measures employed to determine the clusters are the minimum number of samples and minimum percentage of similarity within a cluster. Evaluating SpamClus_1 on a set of emails provided by the Spanish National Cybersecurity Institute (INCIBE), we found that the optimal values are 50 minimum samples and a minimum cosine similarity of 0.8. The clustering results show 19 spam datasets with 3048 spam samples out of 6702 emails from a range of three consecutive days and eight spam clusters with 870 spam samples out of 1469 emails from one day.
%U https://aclanthology.org/2024.nlpaics-1.8/
%P 64-69
Markdown (Informal)
[SpamClus: An Agglomerative Clustering Algorithm for Spam Email Campaigns Detection](https://aclanthology.org/2024.nlpaics-1.8/) (Díaz et al., NLPAICS 2024)
ACL
- Daniel Díaz, Wesam Al-Nabki, Laura Fernández-Robles, Enrique Alegre, Eduardo Fidalgo, and Alicia Martínez-Mendoza. 2024. SpamClus: An Agglomerative Clustering Algorithm for Spam Email Campaigns Detection. In Proceedings of the First International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security, pages 64–69, Lancaster, UK. International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security.