@inproceedings{uniyal-etal-2024-one,
title = "One-to-many testing for code generation from (just) natural language",
author = "Uniyal, Mansi and
Singh, Mukul and
Verbruggen, Gust and
Gulwani, Sumit and
Le, Vu",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.902",
doi = "10.18653/v1/2024.findings-emnlp.902",
pages = "15397--15402",
abstract = "MBPP is a popular dataset for evaluating the task of code generation from natural language. Despite its popularity, there are three problems: (1) it relies on providing test cases to generate the right signature, (2) there is poor alignment between instruction and evaluation test cases, and (3) contamination of the exact phrasing being present in training datasets. We adapt MBPP to emphasize on generating code from just natural language by (1) removing ambiguity about the semantics of the task from the descriptions, and (2) evaluating generated code on multiple sets of assertions to account for ambiguity in the syntax. We compare popular open and closed weight models on the original (MBPP) and adapted (MBUPP) datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="uniyal-etal-2024-one">
<titleInfo>
<title>One-to-many testing for code generation from (just) natural language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mansi</namePart>
<namePart type="family">Uniyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mukul</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gust</namePart>
<namePart type="family">Verbruggen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sumit</namePart>
<namePart type="family">Gulwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vu</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>MBPP is a popular dataset for evaluating the task of code generation from natural language. Despite its popularity, there are three problems: (1) it relies on providing test cases to generate the right signature, (2) there is poor alignment between instruction and evaluation test cases, and (3) contamination of the exact phrasing being present in training datasets. We adapt MBPP to emphasize on generating code from just natural language by (1) removing ambiguity about the semantics of the task from the descriptions, and (2) evaluating generated code on multiple sets of assertions to account for ambiguity in the syntax. We compare popular open and closed weight models on the original (MBPP) and adapted (MBUPP) datasets.</abstract>
<identifier type="citekey">uniyal-etal-2024-one</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.902</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.902</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>15397</start>
<end>15402</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T One-to-many testing for code generation from (just) natural language
%A Uniyal, Mansi
%A Singh, Mukul
%A Verbruggen, Gust
%A Gulwani, Sumit
%A Le, Vu
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F uniyal-etal-2024-one
%X MBPP is a popular dataset for evaluating the task of code generation from natural language. Despite its popularity, there are three problems: (1) it relies on providing test cases to generate the right signature, (2) there is poor alignment between instruction and evaluation test cases, and (3) contamination of the exact phrasing being present in training datasets. We adapt MBPP to emphasize on generating code from just natural language by (1) removing ambiguity about the semantics of the task from the descriptions, and (2) evaluating generated code on multiple sets of assertions to account for ambiguity in the syntax. We compare popular open and closed weight models on the original (MBPP) and adapted (MBUPP) datasets.
%R 10.18653/v1/2024.findings-emnlp.902
%U https://aclanthology.org/2024.findings-emnlp.902
%U https://doi.org/10.18653/v1/2024.findings-emnlp.902
%P 15397-15402
Markdown (Informal)
[One-to-many testing for code generation from (just) natural language](https://aclanthology.org/2024.findings-emnlp.902) (Uniyal et al., Findings 2024)
ACL