@inproceedings{peng-etal-2025-pwngpt,
title = "{P}wn{GPT}: Automatic Exploit Generation Based on Large Language Models",
author = "Peng, Wanzong and
Ye, Lin and
Du, Xuetao and
Zhang, Hongli and
Zhan, Dongyang and
Zhang, Yunting and
Guo, Yicheng and
Zhang, Chen",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.562/",
doi = "10.18653/v1/2025.acl-long.562",
pages = "11481--11494",
ISBN = "979-8-89176-251-0",
abstract = "Automatic exploit generation (AEG) refers to the automatic discovery and exploitation of vulnerabilities against unknown targets. Traditional AEG often targets a single type of vulnerability and still relies on templates built from expert experience. To achieve intelligent exploit generation, we establish a comprehensive benchmark using Binary Exploitation (pwn) challenges in Capture the Flag (CTF) competitions and investigate the capabilities of Large Language Models (LLMs) in AEG based on the benchmark. To improve the performance of AEG, we propose PwnGPT, an LLM-based automatic exploit generation framework that automatically solves pwn challenges. The structural design of PwnGPT is divided into three main components: analysis, generation, and verification modules. With the help of a modular approach and structured problem inputs, PwnGPT can solve challenges that LLMs cannot directly solve. We evaluate PwnGPT on our benchmark and analyze the outputs of each module. Experimental results show that our framework is highly autonomous and capable of addressing various challenges. Compared to direct input LLMs, PwnGPT increases the completion rate of exploit on our benchmark from 26.3{\%} to 57.9{\%} with the OpenAI o1-preview model and from 21.1{\%} to 36.8{\%} with the GPT-4o model."
}