@article{ni-etal-2024-l2ceval,
title = "{L}2{CE}val: Evaluating Language-to-Code Generation Capabilities of Large Language Models",
author = "Ni, Ansong and
Yin, Pengcheng and
Zhao, Yilun and
Riddell, Martin and
Feng, Troy and
Shen, Rui and
Yin, Stephen and
Liu, Ye and
Yavuz, Semih and
Xiong, Caiming and
Joty, Shafiq and
Zhou, Yingbo and
Radev, Dragomir and
Cohan, Arman",
journal = "Transactions of the Association for Computational Linguistics",
volume = "12",
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.tacl-1.72",
doi = "10.1162/tacl_a_00705",
pages = "1311--1329",
abstract = "Recently, large language models (LLMs), especially those that are pretrained on code, have demonstrated strong capabilities in generating programs from natural language inputs. Despite promising results, there is a notable lack of a comprehensive evaluation of these models{'} language-to-code generation capabilities. Existing studies often focus on specific tasks, model architectures, or learning paradigms, leading to a fragmented understanding of the overall landscape. In this work, we present L2CEval, a systematic evaluation of the language-to-code generation capabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing, math reasoning, and Python programming, analyzing the factors that potentially affect their performance, such as model size, pretraining data, instruction tuning, and different prompting methods. In addition, we assess confidence calibration and conduct human evaluations to identify typical failures across different tasks and models. L2CEval offers a comprehensive understanding of the capabilities and limitations of LLMs in language-to-code generation. We release the evaluation framework and all model outputs, hoping to lay the groundwork for further research. All future evaluations (e.g., LLaMA-3, StarCoder2, etc.) will be updated on the project website: https://l2c-eval.github.io/.",
}
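The BibTeX record above can be consumed programmatically. A minimal sketch follows, assuming the entry is saved to a file (the name l2ceval.bib is a placeholder) and that the third-party bibtexparser package (v1 API) is installed; any BibTeX parser would work similarly.

import bibtexparser  # pip install bibtexparser (v1 API assumed)

# l2ceval.bib is a placeholder name for the saved entry above.
with open("l2ceval.bib") as f:
    db = bibtexparser.load(f)

entry = db.entries[0]  # entries are plain dicts with lowercase field keys
print(entry["ID"], entry["ENTRYTYPE"])  # ni-etal-2024-l2ceval, article

# The author field is a single "and"-separated string that may contain
# line breaks; normalize whitespace before splitting.
authors = " ".join(entry["author"].split()).split(" and ")
print(len(authors), "authors; first:", authors[0])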
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ni-etal-2024-l2ceval">
    <titleInfo>
      <title>L2CEval: Evaluating Language-to-Code Generation Capabilities of Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ansong</namePart>
      <namePart type="family">Ni</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pengcheng</namePart>
      <namePart type="family">Yin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yilun</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Martin</namePart>
      <namePart type="family">Riddell</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Troy</namePart>
      <namePart type="family">Feng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rui</namePart>
      <namePart type="family">Shen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Stephen</namePart>
      <namePart type="family">Yin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ye</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Semih</namePart>
      <namePart type="family">Yavuz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Caiming</namePart>
      <namePart type="family">Xiong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shafiq</namePart>
      <namePart type="family">Joty</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yingbo</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dragomir</namePart>
      <namePart type="family">Radev</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Arman</namePart>
      <namePart type="family">Cohan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
      <titleInfo>
        <title>Transactions of the Association for Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <issuance>continuing</issuance>
        <publisher>MIT Press</publisher>
        <place>
          <placeTerm type="text">Cambridge, MA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">periodical</genre>
      <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>Recently, large language models (LLMs), especially those that are pretrained on code, have demonstrated strong capabilities in generating programs from natural language inputs. Despite promising results, there is a notable lack of a comprehensive evaluation of these models’ language-to-code generation capabilities. Existing studies often focus on specific tasks, model architectures, or learning paradigms, leading to a fragmented understanding of the overall landscape. In this work, we present L2CEval, a systematic evaluation of the language-to-code generation capabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing, math reasoning, and Python programming, analyzing the factors that potentially affect their performance, such as model size, pretraining data, instruction tuning, and different prompting methods. In addition, we assess confidence calibration and conduct human evaluations to identify typical failures across different tasks and models. L2CEval offers a comprehensive understanding of the capabilities and limitations of LLMs in language-to-code generation. We release the evaluation framework and all model outputs, hoping to lay the groundwork for further research. All future evaluations (e.g., LLaMA-3, StarCoder2, etc.) will be updated on the project website: https://l2c-eval.github.io/.</abstract>
    <identifier type="citekey">ni-etal-2024-l2ceval</identifier>
    <identifier type="doi">10.1162/tacl_a_00705</identifier>
    <location>
      <url>https://aclanthology.org/2024.tacl-1.72</url>
    </location>
    <part>
      <date>2024</date>
      <detail type="volume"><number>12</number></detail>
      <extent unit="page">
        <start>1311</start>
        <end>1329</end>
      </extent>
    </part>
  </mods>
</modsCollection>
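The MODS record can be read with Python's standard-library xml.etree.ElementTree. Every element lives in the MODS v3 namespace, so queries need a namespace map. A minimal sketch, assuming the modsCollection above is saved as l2ceval.xml (a placeholder name):

import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}  # all MODS elements are namespaced

# l2ceval.xml is a placeholder name for the saved record above.
root = ET.parse("l2ceval.xml").getroot()
mods = root.find("m:mods", NS)

title = mods.find("m:titleInfo/m:title", NS).text
doi = mods.find("m:identifier[@type='doi']", NS).text
authors = [
    " ".join(p.text for p in name.findall("m:namePart", NS))  # given + family
    for name in mods.findall("m:name[@type='personal']", NS)
]
print(title)
print(doi, "-", len(authors), "authors")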
%0 Journal Article
%T L2CEval: Evaluating Language-to-Code Generation Capabilities of Large Language Models
%A Ni, Ansong
%A Yin, Pengcheng
%A Zhao, Yilun
%A Riddell, Martin
%A Feng, Troy
%A Shen, Rui
%A Yin, Stephen
%A Liu, Ye
%A Yavuz, Semih
%A Xiong, Caiming
%A Joty, Shafiq
%A Zhou, Yingbo
%A Radev, Dragomir
%A Cohan, Arman
%J Transactions of the Association for Computational Linguistics
%D 2024
%V 12
%I MIT Press
%C Cambridge, MA
%F ni-etal-2024-l2ceval
%X Recently, large language models (LLMs), especially those that are pretrained on code, have demonstrated strong capabilities in generating programs from natural language inputs. Despite promising results, there is a notable lack of a comprehensive evaluation of these models’ language-to-code generation capabilities. Existing studies often focus on specific tasks, model architectures, or learning paradigms, leading to a fragmented understanding of the overall landscape. In this work, we present L2CEval, a systematic evaluation of the language-to-code generation capabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing, math reasoning, and Python programming, analyzing the factors that potentially affect their performance, such as model size, pretraining data, instruction tuning, and different prompting methods. In addition, we assess confidence calibration and conduct human evaluations to identify typical failures across different tasks and models. L2CEval offers a comprehensive understanding of the capabilities and limitations of LLMs in language-to-code generation. We release the evaluation framework and all model outputs, hoping to lay the groundwork for further research. All future evaluations (e.g., LLaMA-3, StarCoder2, etc.) will be updated on the project website: https://l2c-eval.github.io/.
%R 10.1162/tacl_a_00705
%U https://aclanthology.org/2024.tacl-1.72
%U https://doi.org/10.1162/tacl_a_00705
%P 1311-1329
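The %-tagged block above is a Refer/EndNote export: one tag per line, with repeatable tags such as %A (author) and %U (URL). A minimal standard-library sketch of how such a record could be folded into a dict of lists (parse_refer is a hypothetical helper, not part of any export tool):

# Minimal sketch: parse a Refer/EndNote record into {tag: [values]}.
# Repeatable tags (%A authors, %U URLs) accumulate in order.
def parse_refer(text: str) -> dict[str, list[str]]:
    record: dict[str, list[str]] = {}
    for line in text.splitlines():
        if line.startswith("%") and len(line) >= 2:
            tag, _, value = line.partition(" ")
            record.setdefault(tag, []).append(value.strip())
    return record

# Example: record["%A"] holds the 14 authors, record["%T"][0] the title.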
Markdown (Informal)
[L2CEval: Evaluating Language-to-Code Generation Capabilities of Large Language Models](https://aclanthology.org/2024.tacl-1.72) (Ni et al., TACL 2024)
ACL
- Ansong Ni, Pengcheng Yin, Yilun Zhao, Martin Riddell, Troy Feng, Rui Shen, Stephen Yin, Ye Liu, Semih Yavuz, Caiming Xiong, Shafiq Joty, Yingbo Zhou, Dragomir Radev, and Arman Cohan. 2024. L2CEval: Evaluating Language-to-Code Generation Capabilities of Large Language Models. Transactions of the Association for Computational Linguistics, 12:1311–1329.