@inproceedings{baik-etal-2025-codecomplex,
title = "{C}ode{C}omplex: Dataset for Worst-Case Time Complexity Prediction",
author = "Baik, SeungYeop and
Hahn, Joonghyuk and
Kim, Jungin and
Aditi and
Jeon, Mingi and
Han, Yo-Sub and
Ko, Sang-Ki",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1069/",
doi = "10.18653/v1/2025.findings-emnlp.1069",
pages = "19616--19638",
ISBN = "979-8-89176-335-7",
abstract = "Reasoning ability of large language models (LLMs) is a crucial ability, especially in complex decision-making tasks. One significant task to show LLMs{'} reasoning capability is code time complexity prediction, which involves various intricate factors such as the input range of variables and conditional loops. Current benchmarks fall short of providing a rigorous assessment due to limited data, language constraints, and insufficient labeling. They do not consider time complexity based on input representation and merely evaluate whether predictions fall into the same class, lacking a measure of how close incorrect predictions are to the correct ones. To address these dependencies, we introduce CodeComplex, the first robust and extensive dataset designed to evaluate LLMs' reasoning abilities in predicting code time complexity. CodeComplex comprises 4,900 Java codes and an equivalent number of Python codes, overcoming language and labeling constraints, carefully annotated with complexity labels based on input characteristics by a panel of algorithmic experts. Additionally, we propose specialized evaluation metrics for the reasoning of complexity prediction tasks, offering a more precise and reliable assessment of LLMs' reasoning capabilities. We release our dataset and baseline models publicly to encourage the relevant (NLP, SE, and PL) communities to utilize and participate in this research. Our code and data are available at https://github.com/sybaik1/CodeComplex."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="baik-etal-2025-codecomplex">
<titleInfo>
<title>CodeComplex: Dataset for Worst-Case Time Complexity Prediction</title>
</titleInfo>
<name type="personal">
<namePart type="given">SeungYeop</namePart>
<namePart type="family">Baik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joonghyuk</namePart>
<namePart type="family">Hahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jungin</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name>
<namePart>Aditi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingi</namePart>
<namePart type="family">Jeon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yo-Sub</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sang-Ki</namePart>
<namePart type="family">Ko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Reasoning ability of large language models (LLMs) is a crucial ability, especially in complex decision-making tasks. One significant task to show LLMs’ reasoning capability is code time complexity prediction, which involves various intricate factors such as the input range of variables and conditional loops. Current benchmarks fall short of providing a rigorous assessment due to limited data, language constraints, and insufficient labeling. They do not consider time complexity based on input representation and merely evaluate whether predictions fall into the same class, lacking a measure of how close incorrect predictions are to the correct ones. To address these dependencies, we introduce CodeComplex, the first robust and extensive dataset designed to evaluate LLMs’ reasoning abilities in predicting code time complexity. CodeComplex comprises 4,900 Java codes and an equivalent number of Python codes, overcoming language and labeling constraints, carefully annotated with complexity labels based on input characteristics by a panel of algorithmic experts. Additionally, we propose specialized evaluation metrics for the reasoning of complexity prediction tasks, offering a more precise and reliable assessment of LLMs’ reasoning capabilities. We release our dataset and baseline models publicly to encourage the relevant (NLP, SE, and PL) communities to utilize and participate in this research. Our code and data are available at https://github.com/sybaik1/CodeComplex.</abstract>
<identifier type="citekey">baik-etal-2025-codecomplex</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.1069</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1069/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>19616</start>
<end>19638</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CodeComplex: Dataset for Worst-Case Time Complexity Prediction
%A Baik, SeungYeop
%A Hahn, Joonghyuk
%A Kim, Jungin
%A Jeon, Mingi
%A Han, Yo-Sub
%A Ko, Sang-Ki
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%A Aditi
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F baik-etal-2025-codecomplex
%X Reasoning ability of large language models (LLMs) is a crucial ability, especially in complex decision-making tasks. One significant task to show LLMs’ reasoning capability is code time complexity prediction, which involves various intricate factors such as the input range of variables and conditional loops. Current benchmarks fall short of providing a rigorous assessment due to limited data, language constraints, and insufficient labeling. They do not consider time complexity based on input representation and merely evaluate whether predictions fall into the same class, lacking a measure of how close incorrect predictions are to the correct ones. To address these dependencies, we introduce CodeComplex, the first robust and extensive dataset designed to evaluate LLMs’ reasoning abilities in predicting code time complexity. CodeComplex comprises 4,900 Java codes and an equivalent number of Python codes, overcoming language and labeling constraints, carefully annotated with complexity labels based on input characteristics by a panel of algorithmic experts. Additionally, we propose specialized evaluation metrics for the reasoning of complexity prediction tasks, offering a more precise and reliable assessment of LLMs’ reasoning capabilities. We release our dataset and baseline models publicly to encourage the relevant (NLP, SE, and PL) communities to utilize and participate in this research. Our code and data are available at https://github.com/sybaik1/CodeComplex.
%R 10.18653/v1/2025.findings-emnlp.1069
%U https://aclanthology.org/2025.findings-emnlp.1069/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.1069
%P 19616-19638
Markdown (Informal)
[CodeComplex: Dataset for Worst-Case Time Complexity Prediction](https://aclanthology.org/2025.findings-emnlp.1069/) (Baik et al., Findings 2025)
ACL
- SeungYeop Baik, Joonghyuk Hahn, Jungin Kim, Aditi, Mingi Jeon, Yo-Sub Han, and Sang-Ki Ko. 2025. CodeComplex: Dataset for Worst-Case Time Complexity Prediction. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 19616–19638, Suzhou, China. Association for Computational Linguistics.