@inproceedings{akella-etal-2025-codegenwrangler,
title = "{C}ode{G}en{W}rangler: Data Wrangling task automation using Code-Generating Models",
author = "Akella, Ashlesha and
Manatkar, Abhijit and
Narayanam, Krishnasuri and
Mehta, Sameep",
editor = "Chen, Weizhu and
Yang, Yi and
Kachuee, Mohammad and
Fu, Xue-Yong",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-industry.70/",
doi = "10.18653/v1/2025.naacl-industry.70",
pages = "949--960",
ISBN = "979-8-89176-194-0",
abstract = "Assuring the data quality of tabular datasets is essential for the efficiency of the diverse tabular downstream tasks (like summarization and fact-checking). Data-wrangling tasks effectively address the challenges associated with structured data processing to improve the quality of tabular data. Traditional statistical methods handle numeric data efficiently but often fail to understand the semantic context of the textual data in tables. Deep learning approaches are resource-intensive, requiring task and dataset-specific training. Addressing these shortcomings, we present an automated system that leverages LLMs to generate executable code for data-wrangling tasks like missing value imputation, error detection, and error correction. Our system aims to identify inherent patterns in the data while leveraging external knowledge, effectively addressing both memory-independent and memory-dependent tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="akella-etal-2025-codegenwrangler">
<titleInfo>
<title>CodeGenWrangler: Data Wrangling task automation using Code-Generating Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ashlesha</namePart>
<namePart type="family">Akella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhijit</namePart>
<namePart type="family">Manatkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Krishnasuri</namePart>
<namePart type="family">Narayanam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sameep</namePart>
<namePart type="family">Mehta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Weizhu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="family">Kachuee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xue-Yong</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-194-0</identifier>
</relatedItem>
<abstract>Assuring the data quality of tabular datasets is essential for the efficiency of the diverse tabular downstream tasks (like summarization and fact-checking). Data-wrangling tasks effectively address the challenges associated with structured data processing to improve the quality of tabular data. Traditional statistical methods handle numeric data efficiently but often fail to understand the semantic context of the textual data in tables. Deep learning approaches are resource-intensive, requiring task and dataset-specific training. Addressing these shortcomings, we present an automated system that leverages LLMs to generate executable code for data-wrangling tasks like missing value imputation, error detection, and error correction. Our system aims to identify inherent patterns in the data while leveraging external knowledge, effectively addressing both memory-independent and memory-dependent tasks.</abstract>
<identifier type="citekey">akella-etal-2025-codegenwrangler</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-industry.70</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-industry.70/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>949</start>
<end>960</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CodeGenWrangler: Data Wrangling task automation using Code-Generating Models
%A Akella, Ashlesha
%A Manatkar, Abhijit
%A Narayanam, Krishnasuri
%A Mehta, Sameep
%Y Chen, Weizhu
%Y Yang, Yi
%Y Kachuee, Mohammad
%Y Fu, Xue-Yong
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-194-0
%F akella-etal-2025-codegenwrangler
%X Assuring the data quality of tabular datasets is essential for the efficiency of the diverse tabular downstream tasks (like summarization and fact-checking). Data-wrangling tasks effectively address the challenges associated with structured data processing to improve the quality of tabular data. Traditional statistical methods handle numeric data efficiently but often fail to understand the semantic context of the textual data in tables. Deep learning approaches are resource-intensive, requiring task and dataset-specific training. Addressing these shortcomings, we present an automated system that leverages LLMs to generate executable code for data-wrangling tasks like missing value imputation, error detection, and error correction. Our system aims to identify inherent patterns in the data while leveraging external knowledge, effectively addressing both memory-independent and memory-dependent tasks.
%R 10.18653/v1/2025.naacl-industry.70
%U https://aclanthology.org/2025.naacl-industry.70/
%U https://doi.org/10.18653/v1/2025.naacl-industry.70
%P 949-960
Markdown (Informal)
[CodeGenWrangler: Data Wrangling task automation using Code-Generating Models](https://aclanthology.org/2025.naacl-industry.70/) (Akella et al., NAACL 2025)
ACL
- Ashlesha Akella, Abhijit Manatkar, Krishnasuri Narayanam, and Sameep Mehta. 2025. CodeGenWrangler: Data Wrangling task automation using Code-Generating Models. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track), pages 949–960, Albuquerque, New Mexico. Association for Computational Linguistics.