% Conference paper record for SADA (Gao et al.), ACL 2026, Volume 1: Long Papers.
% Emphasis in the abstract is written with LaTeX macros rather than Markdown
% asterisks, and the code link is given once as a plain URL, so the field
% typesets cleanly when a style prints abstracts.
@inproceedings{gao-etal-2026-sada,
  title     = {{SADA}: Bridging In-Context Learning and Fine-Tuning via State-Aligned Distillation Adapters},
  author    = {Gao, Wenhao and
               Wang, Tianlong and
               Jia, Wei and
               Zhang, Linhao and
               Liu, Aiwei and
               Fan, Miao and
               Xiao, Zhou},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.1046/},
  pages     = {22847--22862},
  isbn      = {979-8-89176-390-6},
  abstract  = {Prompt-based in-context learning (ICL) and parameter fine-tuning are two dominant paradigms for incorporating external information into large language models (LLMs), but they incur high inference costs or require expensive retraining. To bridge this gap, context-to-parameter mapping converts prompts into temporary adapter weights. However, we identify a critical failure mode in existing methods: \emph{hidden-state collapse}, where the adapter-augmented model{'}s internal states diverge sharply from the full-context oracle in deeper layers. We trace this failure to two coupled gaps: suboptimal \textbf{Input-Selection} and inadequate \textbf{Supervision-Signal}. To address these issues, we propose SADA (\textbf{S}tate-\textbf{A}ligned \textbf{D}istillation \textbf{A}dapters). We establish the \emph{attention-block output} as a principled feature interface to improve input selection and introduce \emph{state-alignment distillation} to enforce consistency between the adapter-augmented model and the full-context oracle. Experiments on long-context language modeling (PG19) and downstream NLU and summarization benchmarks show that SADA consistently outperforms strong baselines like \emph{StreamAdapter} and \emph{GenerativeAdapter}, achieving performance comparable to ICL while significantly reducing memory footprint and latency. We further analyze when parameterized context compression is effective and when explicit context retention remains preferable. Our code is available at https://github.com/Taylor-Gavel/SADA.git.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey gao-etal-2026-sada -->
  <mods ID="gao-etal-2026-sada">
    <titleInfo>
      <title>SADA: Bridging In-Context Learning and Fine-Tuning via State-Aligned Distillation Adapters</title>
    </titleInfo>
    <!-- Paper authors, one <name> element each -->
    <name type="personal">
      <namePart type="given">Wenhao</namePart>
      <namePart type="family">Gao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tianlong</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wei</namePart>
      <namePart type="family">Jia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Linhao</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aiwei</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Miao</namePart>
      <namePart type="family">Fan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhou</namePart>
      <namePart type="family">Xiao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>
    <abstract>Prompt-based in-context learning (ICL) and parameter fine-tuning are two dominant paradigms for incorporating external information into large language models (LLMs), but they incur high inference costs or require expensive retraining. To bridge this gap, context-to-parameter mapping converts prompts into temporary adapter weights. However, we identify a critical failure mode in existing methods: *hidden-state collapse*, where the adapter-augmented model’s internal states diverge sharply from the full-context oracle in deeper layers. We trace this failure to two coupled gaps: suboptimal **Input-Selection** and inadequate **Supervision-Signal**. To address these issues, we propose SADA (**S**tate-**A**ligned **D**istillation **A**dapters). We establish the *attention-block output* as a principled feature interface to improve input selection and introduce *state-alignment distillation* to enforce consistency between the adapter-augmented model and the full-context oracle. Experiments on long-context language modeling (PG19) and downstream NLU and summarization benchmarks show that SADA consistently outperforms strong baselines like *StreamAdapter* and *GenerativeAdapter*, achieving performance comparable to ICL while significantly reducing memory footprint and latency. We further analyze when parameterized context compression is effective and when explicit context retention remains preferable. Our code is available at [https://github.com/Taylor-Gavel/SADA.git](https://github.com/Taylor-Gavel/SADA.git).</abstract>
    <identifier type="citekey">gao-etal-2026-sada</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.1046/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>22847</start>
        <end>22862</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T SADA: Bridging In-Context Learning and Fine-Tuning via State-Aligned Distillation Adapters
%A Gao, Wenhao
%A Wang, Tianlong
%A Jia, Wei
%A Zhang, Linhao
%A Liu, Aiwei
%A Fan, Miao
%A Xiao, Zhou
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F gao-etal-2026-sada
%X Prompt-based in-context learning (ICL) and parameter fine-tuning are two dominant paradigms for incorporating external information into large language models (LLMs), but they incur high inference costs or require expensive retraining. To bridge this gap, context-to-parameter mapping converts prompts into temporary adapter weights. However, we identify a critical failure mode in existing methods: *hidden-state collapse*, where the adapter-augmented model’s internal states diverge sharply from the full-context oracle in deeper layers. We trace this failure to two coupled gaps: suboptimal **Input-Selection** and inadequate **Supervision-Signal**. To address these issues, we propose SADA (**S**tate-**A**ligned **D**istillation **A**dapters). We establish the *attention-block output* as a principled feature interface to improve input selection and introduce *state-alignment distillation* to enforce consistency between the adapter-augmented model and the full-context oracle. Experiments on long-context language modeling (PG19) and downstream NLU and summarization benchmarks show that SADA consistently outperforms strong baselines like *StreamAdapter* and *GenerativeAdapter*, achieving performance comparable to ICL while significantly reducing memory footprint and latency. We further analyze when parameterized context compression is effective and when explicit context retention remains preferable. Our code is available at [https://github.com/Taylor-Gavel/SADA.git](https://github.com/Taylor-Gavel/SADA.git).
%U https://aclanthology.org/2026.acl-long.1046/
%P 22847-22862
Markdown (Informal)
[SADA: Bridging In-Context Learning and Fine-Tuning via State-Aligned Distillation Adapters](https://aclanthology.org/2026.acl-long.1046/) (Gao et al., ACL 2026)
ACL
- Wenhao Gao, Tianlong Wang, Wei Jia, Linhao Zhang, Aiwei Liu, Miao Fan, and Zhou Xiao. 2026. SADA: Bridging In-Context Learning and Fine-Tuning via State-Aligned Distillation Adapters. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 22847–22862, San Diego, California, United States. Association for Computational Linguistics.