@inproceedings{hu-etal-2025-dynacode,
title = "{D}yna{C}ode: A Dynamic Complexity-Aware Code Benchmark for Evaluating Large Language Models in Code Generation",
author = "Hu, Wenhao and
Duan, Jinhao and
Wei, Chunchen and
Zhang, Li and
Zhang, Yue and
Xu, Kaidi",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1133/",
doi = "10.18653/v1/2025.findings-acl.1133",
pages = "21980--21997",
ISBN = "979-8-89176-256-5",
abstract = "The rapid advancement of large language models (LLMs) has significantly improved their performance in code generation tasks. However, existing code benchmarks remain static, consisting of fixed datasets with predefined problems. This makes them vulnerable to memorization during training, where LLMs recall specific test cases instead of generalizing to new problems, leading to data contamination and unreliable evaluation results. To address these issues, we introduce DynaCode, a dynamic, complexity-aware benchmark that overcomes the limitations of static datasets. DynaCode evaluates LLMs systematically using a complexity-aware metric, incorporating both code complexity and call-graph structures. DynaCode achieves large-scale diversity, generating up to 189 million unique nested code problems across 4 units of code complexity and 16 types of call graphs. Results on 12 latest LLMs show an average performance drop of 16.8 to 45.7 compared to MBPP+, with performance progressively decreasing as complexity increases. This demonstrates DynaCode{'}s ability to effectively differentiate model performance based on code complexity and how different parts of a program interact. Our benchmark and evaluation code are available at https://github.com/HWH-2000/DynaCode."
}