@inproceedings{wang-etal-2025-mlalgo,
title = "{MLA}lgo-Bench: Can Machines Implement Machine Learning Algorithms?",
author = "Wang, Yunfei and
Zhang, Yeqin and
Wu, Yuyang and
Lu, Liang and
Nguyen, Phi Le and
Wang, Xiaoliang and
Cam-Tu, Nguyen",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.772/",
pages = "14298--14329",
ISBN = "979-8-89176-335-7",
abstract = "As machine learning (ML) application continues to expand across diverse fields, there is a rising demand for ML code generation. In this paper, we aim at a critical research question: Can machines autonomously generate ML code for sophisticated, human-designed algorithms or solutions? To answer this question, we introduce a novel benchmark, MLAlgo-Bench, which includes two challenging tasks: 1) Generating code for ML algorithms including both traditional ML and modern deep learning-based methods, and 2) Giving humans solution sketches, writing ML code for solving practical tasks in Kaggle competitions. This benchmark is unique in its focus on the challenges of interpreting intricate human instructions and producing multi-step, high-complexity code, offering a rigorous test for current Large Language Model (LLM) capabilities. We introduce an automatic evaluation framework with comprehensive metrics such as task pass rate, relative performance metric, and time overhead. Currently, the top-performing models (Claude3.5-Sonet) achieve a 48.8{\%} task completion rate on realizing machine learning algorithms, and a 21.6{\%} rate for completing Kaggle competitions. Further analysis suggests substantial room for improvement."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2025-mlalgo">
<titleInfo>
<title>MLAlgo-Bench: Can Machines Implement Machine Learning Algorithms?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunfei</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yeqin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuyang</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Phi</namePart>
<namePart type="given">Le</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoliang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nguyen</namePart>
<namePart type="family">Cam-Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>As machine learning (ML) application continues to expand across diverse fields, there is a rising demand for ML code generation. In this paper, we aim at a critical research question: Can machines autonomously generate ML code for sophisticated, human-designed algorithms or solutions? To answer this question, we introduce a novel benchmark, MLAlgo-Bench, which includes two challenging tasks: 1) Generating code for ML algorithms including both traditional ML and modern deep learning-based methods, and 2) Giving humans solution sketches, writing ML code for solving practical tasks in Kaggle competitions. This benchmark is unique in its focus on the challenges of interpreting intricate human instructions and producing multi-step, high-complexity code, offering a rigorous test for current Large Language Model (LLM) capabilities. We introduce an automatic evaluation framework with comprehensive metrics such as task pass rate, relative performance metric, and time overhead. Currently, the top-performing models (Claude3.5-Sonet) achieve a 48.8% task completion rate on realizing machine learning algorithms, and a 21.6% rate for completing Kaggle competitions. Further analysis suggests substantial room for improvement.</abstract>
<identifier type="citekey">wang-etal-2025-mlalgo</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.772/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>14298</start>
<end>14329</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MLAlgo-Bench: Can Machines Implement Machine Learning Algorithms?
%A Wang, Yunfei
%A Zhang, Yeqin
%A Wu, Yuyang
%A Lu, Liang
%A Nguyen, Phi Le
%A Wang, Xiaoliang
%A Cam-Tu, Nguyen
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F wang-etal-2025-mlalgo
%X As machine learning (ML) application continues to expand across diverse fields, there is a rising demand for ML code generation. In this paper, we aim at a critical research question: Can machines autonomously generate ML code for sophisticated, human-designed algorithms or solutions? To answer this question, we introduce a novel benchmark, MLAlgo-Bench, which includes two challenging tasks: 1) Generating code for ML algorithms including both traditional ML and modern deep learning-based methods, and 2) Giving humans solution sketches, writing ML code for solving practical tasks in Kaggle competitions. This benchmark is unique in its focus on the challenges of interpreting intricate human instructions and producing multi-step, high-complexity code, offering a rigorous test for current Large Language Model (LLM) capabilities. We introduce an automatic evaluation framework with comprehensive metrics such as task pass rate, relative performance metric, and time overhead. Currently, the top-performing models (Claude3.5-Sonet) achieve a 48.8% task completion rate on realizing machine learning algorithms, and a 21.6% rate for completing Kaggle competitions. Further analysis suggests substantial room for improvement.
%U https://aclanthology.org/2025.findings-emnlp.772/
%P 14298-14329
Markdown (Informal)
[MLAlgo-Bench: Can Machines Implement Machine Learning Algorithms?](https://aclanthology.org/2025.findings-emnlp.772/) (Wang et al., Findings 2025)
ACL