% Conference paper record. Case-sensitive words in title/booktitle are brace-protected as whole words.
@inproceedings{anjum-etal-2026-promcp,
  title     = "{ProMCP}: Profiling Token Flows and Latency Costs in {Model} {Context} {Protocol}--Based {LLM} Agents",
  author    = "Anjum, Sumera and
               Zheng, Weijian and
               Kettimuthu, Rajkumar and
               Fan, Heng and
               Feng, Yunhe",
  editor    = "Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David",
  booktitle = "Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026",
  month     = jul,
  year      = "2026",
  address   = "San Diego, California, United States",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2026.findings-acl.1967/",
  doi       = "10.18653/v1/2026.findings-acl.1967",
  pages     = "39476--39487",
  isbn      = "979-8-89176-395-1",
  abstract  = "The Model Context Protocol (MCP) aims to standardize the integration of Large Language Models (LLMs) with external tools, yet existing research primarily evaluates functional capabilities while treating the underlying protocol as an opaque black box. This oversight obscures critical inefficiencies in token flows and latency distributed across MCP{'}s decoupled Host-Client-Server architecture. In this paper, we introduce ProMCP, an end-to-end profiling and instrumentation framework that decomposes the MCP workflow into a six-stage communication pipeline, enabling granular attribution of computational costs. We evaluate widely varying deployment topologies{---}from air-gapped local models to commercial off-the-shelf (OTS) clients{---}across 20 servers and 169 tools from MCP-Bench and MCP-Universe. Our analysis reveals a distinct inversion in performance bottlenecks: topologies with customized clients devote 56{--}72{\%} of total tokens and 60{--}67{\%} of latency to planning and schema injection, whereas OTS clients concentrate over 85{\%} of latency in final answer synthesis. Crucially, actual tool execution constitutes a negligible fraction of the total cost across all configurations. These findings establish a quantitative baseline for protocol overhead and demonstrate that future optimization must target schema orchestration and transport efficiency rather than tool execution speed. The code is available at: https://github.com/ResponsibleAILab/ProMCP."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS v3 record for the paper; the host proceedings volume is described in relatedItem. -->
  <mods ID="anjum-etal-2026-promcp">
    <titleInfo>
      <title>ProMCP: Profiling Token Flows and Latency Costs in Model Context Protocol–Based LLM Agents</title>
    </titleInfo>
    <!-- Authors, in listed order -->
    <name type="personal">
      <namePart type="given">Sumera</namePart>
      <namePart type="family">Anjum</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Weijian</namePart>
      <namePart type="family">Zheng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rajkumar</namePart>
      <namePart type="family">Kettimuthu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Heng</namePart>
      <namePart type="family">Fan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yunhe</namePart>
      <namePart type="family">Feng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>The Model Context Protocol (MCP) aims to standardize the integration of Large Language Models (LLMs) with external tools, yet existing research primarily evaluates functional capabilities while treating the underlying protocol as an opaque black box. This oversight obscures critical inefficiencies in token flows and latency distributed across MCP’s decoupled Host-Client-Server architecture. In this paper, we introduce ProMCP, an end-to-end profiling and instrumentation framework that decomposes the MCP workflow into a six-stage communication pipeline, enabling granular attribution of computational costs. We evaluate widely varying deployment topologies—from air-gapped local models to commercial off-the-shelf (OTS) clients—across 20 servers and 169 tools from MCP-Bench and MCP-Universe. Our analysis reveals a distinct inversion in performance bottlenecks: topologies with customized clients devote 56–72% of total tokens and 60–67% of latency to planning and schema injection, whereas OTS clients concentrate over 85% of latency in final answer synthesis. Crucially, actual tool execution constitutes a negligible fraction of the total cost across all configurations. These findings establish a quantitative baseline for protocol overhead and demonstrate that future optimization must target schema orchestration and transport efficiency rather than tool execution speed. The code is available at: https://github.com/ResponsibleAILab/ProMCP.</abstract>
    <!-- Identifiers and landing page for the paper itself -->
    <identifier type="citekey">anjum-etal-2026-promcp</identifier>
    <identifier type="doi">10.18653/v1/2026.findings-acl.1967</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1967/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>39476</start>
        <end>39487</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ProMCP: Profiling Token Flows and Latency Costs in Model Context Protocol–Based LLM Agents
%A Anjum, Sumera
%A Zheng, Weijian
%A Kettimuthu, Rajkumar
%A Fan, Heng
%A Feng, Yunhe
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F anjum-etal-2026-promcp
%X The Model Context Protocol (MCP) aims to standardize the integration of Large Language Models (LLMs) with external tools, yet existing research primarily evaluates functional capabilities while treating the underlying protocol as an opaque black box. This oversight obscures critical inefficiencies in token flows and latency distributed across MCP’s decoupled Host-Client-Server architecture. In this paper, we introduce ProMCP, an end-to-end profiling and instrumentation framework that decomposes the MCP workflow into a six-stage communication pipeline, enabling granular attribution of computational costs. We evaluate widely varying deployment topologies—from air-gapped local models to commercial off-the-shelf (OTS) clients—across 20 servers and 169 tools from MCP-Bench and MCP-Universe. Our analysis reveals a distinct inversion in performance bottlenecks: topologies with customized clients devote 56–72% of total tokens and 60–67% of latency to planning and schema injection, whereas OTS clients concentrate over 85% of latency in final answer synthesis. Crucially, actual tool execution constitutes a negligible fraction of the total cost across all configurations. These findings establish a quantitative baseline for protocol overhead and demonstrate that future optimization must target schema orchestration and transport efficiency rather than tool execution speed. The code is available at: https://github.com/ResponsibleAILab/ProMCP.
%R 10.18653/v1/2026.findings-acl.1967
%U https://aclanthology.org/2026.findings-acl.1967/
%U https://doi.org/10.18653/v1/2026.findings-acl.1967
%P 39476-39487
Markdown (Informal)
[ProMCP: Profiling Token Flows and Latency Costs in Model Context Protocol–Based LLM Agents](https://aclanthology.org/2026.findings-acl.1967/) (Anjum et al., Findings 2026)
ACL