@inproceedings{gschwind-etal-2025-classifier,
title = "Classifier-Augmented Generation for Structured Workflow Prediction",
author = "Gschwind, Thomas and
Chakraborty, Shramona and
Gupta, Nitin and
Mehta, Sameep",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.85/",
pages = "1227--1238",
ISBN = "979-8-89176-333-3",
abstract = "ETL (Extract, Transform, Load) tools such as IBM DataStage allow users to visually assemble complex data workflows, but configuring stages and their properties remains time consuming and requires deep tool knowledge. We propose a system that translates natural language descriptions into executable workflows, automatically predicting both the structure and detailed configuration of the flow. At its core lies a Classifier-Augmented Generation (CAG) approach that combines utterance decomposition with a classifier and stage-specific few-shot prompting to produce accurate stage predictions. These stages are then connected into non-linear workflows using edge prediction, and stage properties are inferred from sub-utterance context. We compare CAG against strong single-prompt and agentic baselines, showing improved accuracy and efficiency, while substantially reducing token usage. Our architecture is modular, interpretable, and capable of end-to-end workflow generation, including robust validation steps. To our knowledge, this is the first system with a detailed evaluation across stage prediction, edge layout, and property generation for natural-language-driven ETL authoring."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gschwind-etal-2025-classifier">
<titleInfo>
<title>Classifier-Augmented Generation for Structured Workflow Prediction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Gschwind</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shramona</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nitin</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sameep</namePart>
<namePart type="family">Mehta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saloni</namePart>
<namePart type="family">Potdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Rojas-Barahona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastien</namePart>
<namePart type="family">Montella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou (China)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-333-3</identifier>
</relatedItem>
<abstract>ETL (Extract, Transform, Load) tools such as IBM DataStage allow users to visually assemble complex data workflows, but configuring stages and their properties remains time consuming and requires deep tool knowledge. We propose a system that translates natural language descriptions into executable workflows, automatically predicting both the structure and detailed configuration of the flow. At its core lies a Classifier-Augmented Generation (CAG) approach that combines utterance decomposition with a classifier and stage-specific few-shot prompting to produce accurate stage predictions. These stages are then connected into non-linear workflows using edge prediction, and stage properties are inferred from sub-utterance context. We compare CAG against strong single-prompt and agentic baselines, showing improved accuracy and efficiency, while substantially reducing token usage. Our architecture is modular, interpretable, and capable of end-to-end workflow generation, including robust validation steps. To our knowledge, this is the first system with a detailed evaluation across stage prediction, edge layout, and property generation for natural-language-driven ETL authoring.</abstract>
<identifier type="citekey">gschwind-etal-2025-classifier</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-industry.85/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1227</start>
<end>1238</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Classifier-Augmented Generation for Structured Workflow Prediction
%A Gschwind, Thomas
%A Chakraborty, Shramona
%A Gupta, Nitin
%A Mehta, Sameep
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F gschwind-etal-2025-classifier
%X ETL (Extract, Transform, Load) tools such as IBM DataStage allow users to visually assemble complex data workflows, but configuring stages and their properties remains time consuming and requires deep tool knowledge. We propose a system that translates natural language descriptions into executable workflows, automatically predicting both the structure and detailed configuration of the flow. At its core lies a Classifier-Augmented Generation (CAG) approach that combines utterance decomposition with a classifier and stage-specific few-shot prompting to produce accurate stage predictions. These stages are then connected into non-linear workflows using edge prediction, and stage properties are inferred from sub-utterance context. We compare CAG against strong single-prompt and agentic baselines, showing improved accuracy and efficiency, while substantially reducing token usage. Our architecture is modular, interpretable, and capable of end-to-end workflow generation, including robust validation steps. To our knowledge, this is the first system with a detailed evaluation across stage prediction, edge layout, and property generation for natural-language-driven ETL authoring.
%U https://aclanthology.org/2025.emnlp-industry.85/
%P 1227-1238
Markdown (Informal)
[Classifier-Augmented Generation for Structured Workflow Prediction](https://aclanthology.org/2025.emnlp-industry.85/) (Gschwind et al., EMNLP 2025)
ACL