@inproceedings{daiber-etal-2025-dispatchqa,
title = "{D}ispatch{QA}: A Benchmark for Small Function Calling Language Models in {E}-Commerce Applications",
author = "Daiber, Joachim and
Maricato, Victor and
Sinha, Ayan and
Rabinovich, Andrew",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.154/",
pages = "2221--2233",
ISBN = "979-8-89176-333-3",
abstract = "We introduce DispatchQA, a benchmark to evaluate how well small language models (SLMs) translate open{-}ended search queries into executable API calls via explicit function calling. Our benchmark focuses on the latency-sensitive e-commerce setting and measures SLMs' impact on both search relevance and search latency. We provide strong, replicable baselines based on Llama 3.1 8B Instruct fine-tuned on synthetically generated data and find that fine-tuned SLMs produce search quality comparable or better than large language models such as GPT-4o while achieving up to 3{\texttimes} faster inference. All data, code, and training checkpoints are publicly released to spur further research on resource{-}efficient query understanding."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="daiber-etal-2025-dispatchqa">
<titleInfo>
<title>DispatchQA: A Benchmark for Small Function Calling Language Models in E-Commerce Applications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joachim</namePart>
<namePart type="family">Daiber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victor</namePart>
<namePart type="family">Maricato</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayan</namePart>
<namePart type="family">Sinha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Rabinovich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saloni</namePart>
<namePart type="family">Potdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Rojas-Barahona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastien</namePart>
<namePart type="family">Montella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou (China)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-333-3</identifier>
</relatedItem>
<abstract>We introduce DispatchQA, a benchmark to evaluate how well small language models (SLMs) translate open-ended search queries into executable API calls via explicit function calling. Our benchmark focuses on the latency-sensitive e-commerce setting and measures SLMs’ impact on both search relevance and search latency. We provide strong, replicable baselines based on Llama 3.1 8B Instruct fine-tuned on synthetically generated data and find that fine-tuned SLMs produce search quality comparable or better than large language models such as GPT-4o while achieving up to 3× faster inference. All data, code, and training checkpoints are publicly released to spur further research on resource-efficient query understanding.</abstract>
<identifier type="citekey">daiber-etal-2025-dispatchqa</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-industry.154/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>2221</start>
<end>2233</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DispatchQA: A Benchmark for Small Function Calling Language Models in E-Commerce Applications
%A Daiber, Joachim
%A Maricato, Victor
%A Sinha, Ayan
%A Rabinovich, Andrew
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F daiber-etal-2025-dispatchqa
%X We introduce DispatchQA, a benchmark to evaluate how well small language models (SLMs) translate open-ended search queries into executable API calls via explicit function calling. Our benchmark focuses on the latency-sensitive e-commerce setting and measures SLMs’ impact on both search relevance and search latency. We provide strong, replicable baselines based on Llama 3.1 8B Instruct fine-tuned on synthetically generated data and find that fine-tuned SLMs produce search quality comparable or better than large language models such as GPT-4o while achieving up to 3× faster inference. All data, code, and training checkpoints are publicly released to spur further research on resource-efficient query understanding.
%U https://aclanthology.org/2025.emnlp-industry.154/
%P 2221-2233
Markdown (Informal)
[DispatchQA: A Benchmark for Small Function Calling Language Models in E-Commerce Applications](https://aclanthology.org/2025.emnlp-industry.154/) (Daiber et al., EMNLP 2025)
ACL