@inproceedings{pan-etal-2025-benchmarks,
title = "When Benchmarks Talk: Re-Evaluating Code {LLM}s with Interactive Feedback",
author = "Pan, Jane and
Shar, Ryan and
Pfau, Jacob and
Talwalkar, Ameet and
He, He and
Chen, Valerie",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1267/",
doi = "10.18653/v1/2025.findings-acl.1267",
pages = "24672--24700",
ISBN = "979-8-89176-256-5",
abstract = "Programming with a coding assistant is a fundamentally interactive process, yet existing static benchmarks fail to capture key features of model-user collaboration. We introduce an interactive evaluation pipeline to examine how LLMs incorporate different types of feedback in a collaborative setting, in which we obfuscate the input of static coding benchmarks so that the code model must interact with a simulated user. Across 10 models and 3 datasets, the relative rankings of models often permute greatly between static and interactive settings, despite models being fairly robust to feedback that contains errors. We also observe that similarly effective feedback types differ in terms of how models respond to higher- vs. lower-quality feedback. Moreover, feedback type impacts the degree to which the models make aesthetic or behavioral edits to their output. Our work aims to ``re-evaluate'' model coding capabilities through an interactive lens toward bridging the gap between existing evaluations and real-world usage."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pan-etal-2025-benchmarks">
<titleInfo>
<title>When Benchmarks Talk: Re-Evaluating Code LLMs with Interactive Feedback</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jane</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Shar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacob</namePart>
<namePart type="family">Pfau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ameet</namePart>
<namePart type="family">Talwalkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">He</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valerie</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Programming with a coding assistant is a fundamentally interactive process, yet existing static benchmarks fail to capture key features of model-user collaboration. We introduce an interactive evaluation pipeline to examine how LLMs incorporate different types of feedback in a collaborative setting, in which we obfuscate the input of static coding benchmarks so that the code model must interact with a simulated user. Across 10 models and 3 datasets, the relative rankings of models often permute greatly between static and interactive settings, despite models being fairly robust to feedback that contains errors. We also observe that similarly effective feedback types differ in terms of how models respond to higher- vs. lower-quality feedback. Moreover, feedback type impacts the degree to which the models make aesthetic or behavioral edits to their output. Our work aims to “re-evaluate” model coding capabilities through an interactive lens toward bridging the gap between existing evaluations and real-world usage.</abstract>
<identifier type="citekey">pan-etal-2025-benchmarks</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1267</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1267/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>24672</start>
<end>24700</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Benchmarks Talk: Re-Evaluating Code LLMs with Interactive Feedback
%A Pan, Jane
%A Shar, Ryan
%A Pfau, Jacob
%A Talwalkar, Ameet
%A He, He
%A Chen, Valerie
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F pan-etal-2025-benchmarks
%X Programming with a coding assistant is a fundamentally interactive process, yet existing static benchmarks fail to capture key features of model-user collaboration. We introduce an interactive evaluation pipeline to examine how LLMs incorporate different types of feedback in a collaborative setting, in which we obfuscate the input of static coding benchmarks so that the code model must interact with a simulated user. Across 10 models and 3 datasets, the relative rankings of models often permute greatly between static and interactive settings, despite models being fairly robust to feedback that contains errors. We also observe that similarly effective feedback types differ in terms of how models respond to higher- vs. lower-quality feedback. Moreover, feedback type impacts the degree to which the models make aesthetic or behavioral edits to their output. Our work aims to “re-evaluate” model coding capabilities through an interactive lens toward bridging the gap between existing evaluations and real-world usage.
%R 10.18653/v1/2025.findings-acl.1267
%U https://aclanthology.org/2025.findings-acl.1267/
%U https://doi.org/10.18653/v1/2025.findings-acl.1267
%P 24672-24700
Markdown (Informal)
[When Benchmarks Talk: Re-Evaluating Code LLMs with Interactive Feedback](https://aclanthology.org/2025.findings-acl.1267/) (Pan et al., Findings 2025)
ACL