@inproceedings{chen-etal-2024-vullibgen,
title = "{V}ul{L}ib{G}en: Generating Names of Vulnerability-Affected Packages via a Large Language Model",
author = "Chen, Tianyu and
Li, Lin and
ZhuLiuchuan, ZhuLiuchuan and
Li, Zongyang and
Liu, Xueqing and
Liang, Guangtai and
Wang, Qianxiang and
Xie, Tao",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.acl-long.527/",
doi = "10.18653/v1/2024.acl-long.527",
pages = "9767--9780",
abstract = "Security practitioners maintain vulnerability reports (e.g., GitHub Advisory) to help developers mitigate security risks. An important task for these databases is automatically extracting structured information mentioned in the report, e.g., the affected software packages, to accelerate the defense of the vulnerability ecosystem. However, it is challenging for existing work on affected package identification to achieve high precision. One reason is that all existing work focuses on relatively smaller models, thus they cannot harness the knowledge and semantic capabilities of large language models. To address this limitation, we propose VulLibGen, the first method to use LLM for affected package identification. In contrast to existing work, VulLibGen proposes the novel idea to directly generate the affected package. To improve the precision, VulLibGen employs supervised fine-tuning (SFT), retrieval augmented generation (RAG) and a local search algorithm. The local search algorithm is a novel post-processing algorithm we introduce for reducing the hallucination of the generated packages. Our evaluation results show that VulLibGen has an average precision of 0.806 for identifying vulnerable packages in the four most popular ecosystems in GitHub Advisory (Java, JS, Python, Go) while the best average precision in previous work is 0.721. Additionally, VulLibGen has high value to security practice: we submitted 60 {\ensuremath{<}}vulnerability, affected package{\ensuremath{>}} pairs to GitHub Advisory (covers four ecosystems) and 34 of them have been accepted and merged."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2024-vullibgen">
<titleInfo>
<title>VulLibGen: Generating Names of Vulnerability-Affected Packages via a Large Language Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianyu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">ZhuLiuchuan</namePart>
<namePart type="family">ZhuLiuchuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zongyang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xueqing</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guangtai</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qianxiang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tao</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Security practitioners maintain vulnerability reports (e.g., GitHub Advisory) to help developers mitigate security risks. An important task for these databases is automatically extracting structured information mentioned in the report, e.g., the affected software packages, to accelerate the defense of the vulnerability ecosystem. However, it is challenging for existing work on affected package identification to achieve high precision. One reason is that all existing work focuses on relatively smaller models, thus they cannot harness the knowledge and semantic capabilities of large language models. To address this limitation, we propose VulLibGen, the first method to use LLM for affected package identification. In contrast to existing work, VulLibGen proposes the novel idea to directly generate the affected package. To improve the precision, VulLibGen employs supervised fine-tuning (SFT), retrieval augmented generation (RAG) and a local search algorithm. The local search algorithm is a novel post-processing algorithm we introduce for reducing the hallucination of the generated packages. Our evaluation results show that VulLibGen has an average precision of 0.806 for identifying vulnerable packages in the four most popular ecosystems in GitHub Advisory (Java, JS, Python, Go) while the best average precision in previous work is 0.721. Additionally, VulLibGen has high value to security practice: we submitted 60 &lt;vulnerability, affected package&gt; pairs to GitHub Advisory (covers four ecosystems) and 34 of them have been accepted and merged.</abstract>
<identifier type="citekey">chen-etal-2024-vullibgen</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.527</identifier>
<location>
<url>https://aclanthology.org/2024.acl-long.527/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>9767</start>
<end>9780</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VulLibGen: Generating Names of Vulnerability-Affected Packages via a Large Language Model
%A Chen, Tianyu
%A Li, Lin
%A ZhuLiuchuan, ZhuLiuchuan
%A Li, Zongyang
%A Liu, Xueqing
%A Liang, Guangtai
%A Wang, Qianxiang
%A Xie, Tao
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F chen-etal-2024-vullibgen
%X Security practitioners maintain vulnerability reports (e.g., GitHub Advisory) to help developers mitigate security risks. An important task for these databases is automatically extracting structured information mentioned in the report, e.g., the affected software packages, to accelerate the defense of the vulnerability ecosystem. However, it is challenging for existing work on affected package identification to achieve high precision. One reason is that all existing work focuses on relatively smaller models, thus they cannot harness the knowledge and semantic capabilities of large language models. To address this limitation, we propose VulLibGen, the first method to use LLM for affected package identification. In contrast to existing work, VulLibGen proposes the novel idea to directly generate the affected package. To improve the precision, VulLibGen employs supervised fine-tuning (SFT), retrieval augmented generation (RAG) and a local search algorithm. The local search algorithm is a novel post-processing algorithm we introduce for reducing the hallucination of the generated packages. Our evaluation results show that VulLibGen has an average precision of 0.806 for identifying vulnerable packages in the four most popular ecosystems in GitHub Advisory (Java, JS, Python, Go) while the best average precision in previous work is 0.721. Additionally, VulLibGen has high value to security practice: we submitted 60 <vulnerability, affected package> pairs to GitHub Advisory (covers four ecosystems) and 34 of them have been accepted and merged.
%R 10.18653/v1/2024.acl-long.527
%U https://aclanthology.org/2024.acl-long.527/
%U https://doi.org/10.18653/v1/2024.acl-long.527
%P 9767-9780
Markdown (Informal)
[VulLibGen: Generating Names of Vulnerability-Affected Packages via a Large Language Model](https://aclanthology.org/2024.acl-long.527/) (Chen et al., ACL 2024)
ACL
- Tianyu Chen, Lin Li, ZhuLiuchuan ZhuLiuchuan, Zongyang Li, Xueqing Liu, Guangtai Liang, Qianxiang Wang, and Tao Xie. 2024. VulLibGen: Generating Names of Vulnerability-Affected Packages via a Large Language Model. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 9767–9780, Bangkok, Thailand. Association for Computational Linguistics.