% Paper in the Findings of EACL 2026 volume; the abstract is kept verbatim.
@inproceedings{pietruszka-etal-2026-models,
  author    = {Pietruszka, Micha{\l}
               and Borchmann, {\L}ukasz
               and J{\k{e}}drosz, Aleksander
               and Morawiecki, Pawe{\l}},
  title     = {Can Models Help Us Create Better Models? Evaluating {LLM}s as Data Scientists},
  editor    = {Demberg, Vera
               and Inui, Kentaro
               and Marquez, Llu{\'i}s},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026},
  year      = {2026},
  month     = mar,
  pages     = {5862--5886},
  publisher = {Association for Computational Linguistics},
  address   = {Rabat, Morocco},
  isbn      = {979-8-89176-386-9},
  url       = {https://aclanthology.org/2026.findings-eacl.308/},
  abstract  = {We present FeatEng, a novel benchmark designed to evaluate the ability of large language models (LLMs) to perform feature engineering, a critical and knowledge-intensive task in data science. FeatEng assesses LLMs by their capacity to generate Python code that transforms raw tabular data into features that improve the performance of a downstream machine learning model. Our analysis of LLM outputs reveals that success on FeatEng often requires the application of significant world and domain knowledge, along with complex reasoning, to construct novel data representations. While focused on feature engineering, the benchmark probes a confluence of abilities indicative of an LLM{'}s broader potential for practical, data-centric problem-solving. We demonstrate that FeatEng offers a targeted and efficient approach to assess a specific but crucial aspect of LLM capabilities relevant to real-world data science applications.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID is the same string as the citekey identifier further down. -->
  <mods ID="pietruszka-etal-2026-models">
    <titleInfo>
      <title>Can Models Help Us Create Better Models? Evaluating LLMs as Data Scientists</title>
    </titleInfo>

    <!-- Authors: one personal name element per author, each with given and family parts. -->
    <name type="personal">
      <namePart type="given">Michał</namePart>
      <namePart type="family">Pietruszka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Łukasz</namePart>
      <namePart type="family">Borchmann</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aleksander</namePart>
      <namePart type="family">Jędrosz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Paweł</namePart>
      <namePart type="family">Morawiecki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Vera</namePart>
        <namePart type="family">Demberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kentaro</namePart>
        <namePart type="family">Inui</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lluís</namePart>
        <namePart type="family">Marquez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-386-9</identifier>
    </relatedItem>

    <abstract>We present FeatEng, a novel benchmark designed to evaluate the ability of large language models (LLMs) to perform feature engineering, a critical and knowledge-intensive task in data science. FeatEng assesses LLMs by their capacity to generate Python code that transforms raw tabular data into features that improve the performance of a downstream machine learning model. Our analysis of LLM outputs reveals that success on FeatEng often requires the application of significant world and domain knowledge, along with complex reasoning, to construct novel data representations. While focused on feature engineering, the benchmark probes a confluence of abilities indicative of an LLM’s broader potential for practical, data-centric problem-solving. We demonstrate that FeatEng offers a targeted and efficient approach to assess a specific but crucial aspect of LLM capabilities relevant to real-world data science applications.</abstract>
    <identifier type="citekey">pietruszka-etal-2026-models</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-eacl.308/</url>
    </location>

    <!-- Part details: date plus the start and end pages of the paper. -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>5862</start>
        <end>5886</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Can Models Help Us Create Better Models? Evaluating LLMs as Data Scientists
%A Pietruszka, Michał
%A Borchmann, Łukasz
%A Jędrosz, Aleksander
%A Morawiecki, Paweł
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F pietruszka-etal-2026-models
%X We present FeatEng, a novel benchmark designed to evaluate the ability of large language models (LLMs) to perform feature engineering, a critical and knowledge-intensive task in data science. FeatEng assesses LLMs by their capacity to generate Python code that transforms raw tabular data into features that improve the performance of a downstream machine learning model. Our analysis of LLM outputs reveals that success on FeatEng often requires the application of significant world and domain knowledge, along with complex reasoning, to construct novel data representations. While focused on feature engineering, the benchmark probes a confluence of abilities indicative of an LLM’s broader potential for practical, data-centric problem-solving. We demonstrate that FeatEng offers a targeted and efficient approach to assess a specific but crucial aspect of LLM capabilities relevant to real-world data science applications.
%U https://aclanthology.org/2026.findings-eacl.308/
%P 5862-5886
Markdown (Informal)
[Can Models Help Us Create Better Models? Evaluating LLMs as Data Scientists](https://aclanthology.org/2026.findings-eacl.308/) (Pietruszka et al., Findings 2026)
ACL