@inproceedings{chen-etal-2026-condenseflow,
title = "{C}ondense{F}low: Scalable Latent Space Collaboration via Semantic Compression for Multi-Agent Systems",
author = "Chen, Xiaoyu and
Wu, Fengge and
Junsuo, Zhao and
Fan, Yun",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.669/",
pages = "13694--13712",
ISBN = "979-8-89176-395-1",
abstract = "Full-state latent communication in LLM-based multi-agent systems offers richer semantics than text but suffers from memory overhead scaling linearly with collaboration rounds. We propose \textbf{CondenseFlow}, which introduces the \textbf{Latent Thought Condenser (LTC)}{---}a lightweight module using learnable semantic probes to compress KV caches into fixed-size representations, achieving $\mathcal{O}(1)$ communication complexity regardless of context length. We theoretically prove that compression error is bounded by attention concentration and accumulates controllably across rounds. On seven benchmarks spanning six models, CondenseFlow reduces KV cache memory by over 99{\%} and inference latency by approximately 20{\%} compared to dense transfer with negligible accuracy degradation, while outperforming text-based methods by 1.7 percentage points on average across all configurations. Code is available at https://github.com/xxy33/condenseflow."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2026-condenseflow">
<titleInfo>
<title>CondenseFlow: Scalable Latent Space Collaboration via Semantic Compression for Multi-Agent Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiaoyu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fengge</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhao</namePart>
<namePart type="family">Junsuo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Full-state latent communication in LLM-based multi-agent systems offers richer semantics than text but suffers from memory overhead scaling linearly with collaboration rounds. We propose CondenseFlow, which introduces the Latent Thought Condenser (LTC)—a lightweight module using learnable semantic probes to compress KV caches into fixed-size representations, achieving $\mathcal{O}(1)$ communication complexity regardless of context length. We theoretically prove that compression error is bounded by attention concentration and accumulates controllably across rounds. On seven benchmarks spanning six models, CondenseFlow reduces KV cache memory by over 99% and inference latency by approximately 20% compared to dense transfer with negligible accuracy degradation, while outperforming text-based methods by 1.7 percentage points on average across all configurations. Code is available at https://github.com/xxy33/condenseflow.</abstract>
<identifier type="citekey">chen-etal-2026-condenseflow</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.669/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>13694</start>
<end>13712</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CondenseFlow: Scalable Latent Space Collaboration via Semantic Compression for Multi-Agent Systems
%A Chen, Xiaoyu
%A Wu, Fengge
%A Junsuo, Zhao
%A Fan, Yun
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F chen-etal-2026-condenseflow
%X Full-state latent communication in LLM-based multi-agent systems offers richer semantics than text but suffers from memory overhead scaling linearly with collaboration rounds. We propose CondenseFlow, which introduces the Latent Thought Condenser (LTC)—a lightweight module using learnable semantic probes to compress KV caches into fixed-size representations, achieving $\mathcal{O}(1)$ communication complexity regardless of context length. We theoretically prove that compression error is bounded by attention concentration and accumulates controllably across rounds. On seven benchmarks spanning six models, CondenseFlow reduces KV cache memory by over 99% and inference latency by approximately 20% compared to dense transfer with negligible accuracy degradation, while outperforming text-based methods by 1.7 percentage points on average across all configurations. Code is available at https://github.com/xxy33/condenseflow.
%U https://aclanthology.org/2026.findings-acl.669/
%P 13694-13712
Markdown (Informal)
[CondenseFlow: Scalable Latent Space Collaboration via Semantic Compression for Multi-Agent Systems](https://aclanthology.org/2026.findings-acl.669/) (Chen et al., Findings 2026)
ACL