% BibTeX record for https://aclanthology.org/2025.emnlp-main.802/ (text outside an entry is ignored by BibTeX).
@inproceedings{huang-etal-2025-capabilities,
  title     = {From Capabilities to Performance: Evaluating Key Functional Properties of {LLM} Architectures in Penetration Testing},
  author    = {Huang, Lanxiao and
               Dave, Daksh and
               Cody, Tyler and
               Beling, Peter A. and
               Jin, Ming},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.802/},
  doi       = {10.18653/v1/2025.emnlp-main.802},
  pages     = {15879--15905},
  isbn      = {979-8-89176-332-6},
  abstract  = {Large Language Models (LLMs) have been explored for automating or enhancing penetration testing tasks, but their effectiveness and reliability across diverse attack phases remain open questions. This study presents a comprehensive evaluation of multiple LLM-based agents, ranging from singular to modular designs, across realistic penetration testing scenarios, analyzing their empirical performance and recurring failure patterns. We further investigate the impact of core functional capabilities on agent success, operationalized through five targeted augmentations: Global Context Memory (GCM), Inter-Agent Messaging (IAM), Context-Conditioned Invocation (CCI), Adaptive Planning (AP), and Real-Time Monitoring (RTM). These interventions respectively support the capabilities of Context Coherence {\&} Retention, Inter-Component Coordination {\&} State Management, Tool Usage Accuracy {\&} Selective Execution, Multi-Step Strategic Planning {\&} Error Detection {\&} Recovery, and Real-Time Dynamic Responsiveness. Our findings reveal that while some architectures natively exhibit select properties, targeted augmentations significantly enhance modular agent performance{---}particularly in complex, multi-step, and real-time penetration testing scenarios.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding one record, ID huang-etal-2025-capabilities. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="huang-etal-2025-capabilities">
    <titleInfo>
      <title>From Capabilities to Performance: Evaluating Key Functional Properties of LLM Architectures in Penetration Testing</title>
    </titleInfo>
    <!-- Authors, in the order given by the record. -->
    <name type="personal">
      <namePart type="given">Lanxiao</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daksh</namePart>
      <namePart type="family">Dave</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tyler</namePart>
      <namePart type="family">Cody</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Peter</namePart>
      <namePart type="given">A</namePart>
      <namePart type="family">Beling</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ming</namePart>
      <namePart type="family">Jin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-332-6</identifier>
    </relatedItem>
    <!-- Ampersands in the abstract are written as entity references so the document stays well-formed. -->
    <abstract>Large Language Models (LLMs) have been explored for automating or enhancing penetration testing tasks, but their effectiveness and reliability across diverse attack phases remain open questions. This study presents a comprehensive evaluation of multiple LLM-based agents, ranging from singular to modular designs, across realistic penetration testing scenarios, analyzing their empirical performance and recurring failure patterns. We further investigate the impact of core functional capabilities on agent success, operationalized through five targeted augmentations: Global Context Memory (GCM), Inter-Agent Messaging (IAM), Context-Conditioned Invocation (CCI), Adaptive Planning (AP), and Real-Time Monitoring (RTM). These interventions respectively support the capabilities of Context Coherence &amp; Retention, Inter-Component Coordination &amp; State Management, Tool Usage Accuracy &amp; Selective Execution, Multi-Step Strategic Planning &amp; Error Detection &amp; Recovery, and Real-Time Dynamic Responsiveness. Our findings reveal that while some architectures natively exhibit select properties, targeted augmentations significantly enhance modular agent performance—particularly in complex, multi-step, and real-time penetration testing scenarios.</abstract>
    <identifier type="citekey">huang-etal-2025-capabilities</identifier>
    <identifier type="doi">10.18653/v1/2025.emnlp-main.802</identifier>
    <location>
      <url>https://aclanthology.org/2025.emnlp-main.802/</url>
    </location>
    <!-- Issue date and page range of the paper. -->
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>15879</start>
        <end>15905</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T From Capabilities to Performance: Evaluating Key Functional Properties of LLM Architectures in Penetration Testing
%A Huang, Lanxiao
%A Dave, Daksh
%A Cody, Tyler
%A Beling, Peter A.
%A Jin, Ming
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F huang-etal-2025-capabilities
%X Large Language Models (LLMs) have been explored for automating or enhancing penetration testing tasks, but their effectiveness and reliability across diverse attack phases remain open questions. This study presents a comprehensive evaluation of multiple LLM-based agents, ranging from singular to modular designs, across realistic penetration testing scenarios, analyzing their empirical performance and recurring failure patterns. We further investigate the impact of core functional capabilities on agent success, operationalized through five targeted augmentations: Global Context Memory (GCM), Inter-Agent Messaging (IAM), Context-Conditioned Invocation (CCI), Adaptive Planning (AP), and Real-Time Monitoring (RTM). These interventions respectively support the capabilities of Context Coherence & Retention, Inter-Component Coordination & State Management, Tool Usage Accuracy & Selective Execution, Multi-Step Strategic Planning & Error Detection & Recovery, and Real-Time Dynamic Responsiveness. Our findings reveal that while some architectures natively exhibit select properties, targeted augmentations significantly enhance modular agent performance—particularly in complex, multi-step, and real-time penetration testing scenarios.
%R 10.18653/v1/2025.emnlp-main.802
%U https://aclanthology.org/2025.emnlp-main.802/
%U https://doi.org/10.18653/v1/2025.emnlp-main.802
%P 15879-15905
Markdown (Informal)
[From Capabilities to Performance: Evaluating Key Functional Properties of LLM Architectures in Penetration Testing](https://aclanthology.org/2025.emnlp-main.802/) (Huang et al., EMNLP 2025)
ACL