@inproceedings{sultan-etal-2024-visual,
title = "Visual Editing with {LLM}-based Tool Chaining: An Efficient Distillation Approach for Real-Time Applications",
author = "Sultan, Oren and
Khasin, Alexander and
Shiran, Guy and
Greenstein-Messica, Asnat and
Shahaf, Dafna",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.96",
pages = "1286--1304",
abstract = "We present a practical distillation approach to fine-tune LLMs for invoking tools in real-time applications. We focus on visual editing tasks; specifically, we modify images and videos by interpreting user stylistic requests, specified in natural language ({``}golden hour{''}), using an LLM to select the appropriate tools and their parameters to achieve the desired visual effect.We found that proprietary LLMs such as GPT-3.5-Turbo show potential in this task, but their high cost and latency make them unsuitable for real-time applications.In our approach, we fine-tune a (smaller) student LLM with guidance from a (larger) teacher LLM and behavioral signals.We introduce offline metrics to evaluate student LLMs. Both online and offline experiments show that our student models manage to match the performance of our teacher model (GPT-3.5-Turbo), significantly reducing costs and latency.Lastly, we show that fine-tuning was improved by 25{\%} in low-data regimes using augmentation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sultan-etal-2024-visual">
<titleInfo>
<title>Visual Editing with LLM-based Tool Chaining: An Efficient Distillation Approach for Real-Time Applications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oren</namePart>
<namePart type="family">Sultan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Khasin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guy</namePart>
<namePart type="family">Shiran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asnat</namePart>
<namePart type="family">Greenstein-Messica</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dafna</namePart>
<namePart type="family">Shahaf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a practical distillation approach to fine-tune LLMs for invoking tools in real-time applications. We focus on visual editing tasks; specifically, we modify images and videos by interpreting user stylistic requests, specified in natural language (“golden hour”), using an LLM to select the appropriate tools and their parameters to achieve the desired visual effect.We found that proprietary LLMs such as GPT-3.5-Turbo show potential in this task, but their high cost and latency make them unsuitable for real-time applications.In our approach, we fine-tune a (smaller) student LLM with guidance from a (larger) teacher LLM and behavioral signals.We introduce offline metrics to evaluate student LLMs. Both online and offline experiments show that our student models manage to match the performance of our teacher model (GPT-3.5-Turbo), significantly reducing costs and latency.Lastly, we show that fine-tuning was improved by 25% in low-data regimes using augmentation.</abstract>
<identifier type="citekey">sultan-etal-2024-visual</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.96</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1286</start>
<end>1304</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Visual Editing with LLM-based Tool Chaining: An Efficient Distillation Approach for Real-Time Applications
%A Sultan, Oren
%A Khasin, Alexander
%A Shiran, Guy
%A Greenstein-Messica, Asnat
%A Shahaf, Dafna
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F sultan-etal-2024-visual
%X We present a practical distillation approach to fine-tune LLMs for invoking tools in real-time applications. We focus on visual editing tasks; specifically, we modify images and videos by interpreting user stylistic requests, specified in natural language (“golden hour”), using an LLM to select the appropriate tools and their parameters to achieve the desired visual effect.We found that proprietary LLMs such as GPT-3.5-Turbo show potential in this task, but their high cost and latency make them unsuitable for real-time applications.In our approach, we fine-tune a (smaller) student LLM with guidance from a (larger) teacher LLM and behavioral signals.We introduce offline metrics to evaluate student LLMs. Both online and offline experiments show that our student models manage to match the performance of our teacher model (GPT-3.5-Turbo), significantly reducing costs and latency.Lastly, we show that fine-tuning was improved by 25% in low-data regimes using augmentation.
%U https://aclanthology.org/2024.emnlp-industry.96
%P 1286-1304
Markdown (Informal)
[Visual Editing with LLM-based Tool Chaining: An Efficient Distillation Approach for Real-Time Applications](https://aclanthology.org/2024.emnlp-industry.96) (Sultan et al., EMNLP 2024)
ACL