@inproceedings{oh-kim-2026-seam,
title = "{SEAM}: Bridging the Temporal-Semantic Granularity Gap for {LLM}-based Speech Recognition",
author = "Oh, Junseok and
Kim, Ji-Hwan",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.112/",
pages = "2135--2144",
ISBN = "979-8-89176-386-9",
abstract = "Speech-LLM integration faces a temporal-semantic granularity gap: speech representations scale with temporal duration while text tokens scale with semantic content. Existing duration-based methods generate embeddings at fixed rates, creating distributional mismatch with LLM pre-training. We propose SEAM (Speech Encoder-Decoder Alignment Module), an encoder-decoder architecture employing variable-rate generation through cross-attention between speech features and text embeddings. SEAM produces embeddings at adaptive rates that closely match natural text distributions while preserving pre-trained knowledge by freezing both speech encoder and LLM. We introduce a multi-stage training strategy and First Token Guidance to improve initial token prediction. SEAM achieves competitive performance on LibriSpeech (2.6{\%}/5.2{\%} WER). More significantly, trained only on LibriSpeech (960h), SEAM achieves 4.7{\%} WER on cross-domain TED-LIUM-v2, demonstrating that integrating LLM{'}s linguistic knowledge enables effective generalization beyond limited speech training data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="oh-kim-2026-seam">
<titleInfo>
<title>SEAM: Bridging the Temporal-Semantic Granularity Gap for LLM-based Speech Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junseok</namePart>
<namePart type="family">Oh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ji-Hwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Speech-LLM integration faces a temporal-semantic granularity gap: speech representations scale with temporal duration while text tokens scale with semantic content. Existing duration-based methods generate embeddings at fixed rates, creating distributional mismatch with LLM pre-training. We propose SEAM (Speech Encoder-Decoder Alignment Module), an encoder-decoder architecture employing variable-rate generation through cross-attention between speech features and text embeddings. SEAM produces embeddings at adaptive rates that closely match natural text distributions while preserving pre-trained knowledge by freezing both speech encoder and LLM. We introduce a multi-stage training strategy and First Token Guidance to improve initial token prediction. SEAM achieves competitive performance on LibriSpeech (2.6%/5.2% WER). More significantly, trained only on LibriSpeech (960h), SEAM achieves 4.7% WER on cross-domain TED-LIUM-v2, demonstrating that integrating LLM’s linguistic knowledge enables effective generalization beyond limited speech training data.</abstract>
<identifier type="citekey">oh-kim-2026-seam</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.112/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>2135</start>
<end>2144</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SEAM: Bridging the Temporal-Semantic Granularity Gap for LLM-based Speech Recognition
%A Oh, Junseok
%A Kim, Ji-Hwan
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F oh-kim-2026-seam
%X Speech-LLM integration faces a temporal-semantic granularity gap: speech representations scale with temporal duration while text tokens scale with semantic content. Existing duration-based methods generate embeddings at fixed rates, creating distributional mismatch with LLM pre-training. We propose SEAM (Speech Encoder-Decoder Alignment Module), an encoder-decoder architecture employing variable-rate generation through cross-attention between speech features and text embeddings. SEAM produces embeddings at adaptive rates that closely match natural text distributions while preserving pre-trained knowledge by freezing both speech encoder and LLM. We introduce a multi-stage training strategy and First Token Guidance to improve initial token prediction. SEAM achieves competitive performance on LibriSpeech (2.6%/5.2% WER). More significantly, trained only on LibriSpeech (960h), SEAM achieves 4.7% WER on cross-domain TED-LIUM-v2, demonstrating that integrating LLM’s linguistic knowledge enables effective generalization beyond limited speech training data.
%U https://aclanthology.org/2026.findings-eacl.112/
%P 2135-2144
Markdown (Informal)
[SEAM: Bridging the Temporal-Semantic Granularity Gap for LLM-based Speech Recognition](https://aclanthology.org/2026.findings-eacl.112/) (Oh & Kim, Findings 2026)
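The abstract above describes SEAM's core mechanism: producing a text-rate number of embeddings by cross-attending over frame-rate speech features while both the speech encoder and the LLM remain frozen. The PyTorch sketch below is only a minimal illustration of that idea, not the authors' implementation; the class name `SpeechTextAligner`, the hidden sizes, and the choice of text embeddings as cross-attention queries are assumptions made for the example.

```python
import torch
import torch.nn as nn

class SpeechTextAligner(nn.Module):
    """Toy alignment module: cross-attends a small, variable-length set of
    query embeddings over frame-level speech features, so the number of
    output embeddings tracks semantic content rather than audio duration."""

    def __init__(self, speech_dim=1024, llm_dim=2048, n_heads=8):
        super().__init__()
        self.proj = nn.Linear(speech_dim, llm_dim)        # map encoder frames to LLM width
        self.cross_attn = nn.MultiheadAttention(llm_dim, n_heads, batch_first=True)
        self.out = nn.Linear(llm_dim, llm_dim)

    def forward(self, speech_feats, queries):
        # speech_feats: (B, T_frames, speech_dim) from a frozen speech encoder
        # queries:      (B, N_tokens, llm_dim), e.g. text embeddings; N_tokens << T_frames,
        #               so the output rate resembles natural text token rates
        keys = self.proj(speech_feats)
        attended, _ = self.cross_attn(queries, keys, keys)
        return self.out(attended)                         # (B, N_tokens, llm_dim) fed to a frozen LLM

if __name__ == "__main__":
    aligner = SpeechTextAligner()
    frames = torch.randn(2, 500, 1024)     # ~10 s of audio at 50 frames/s (illustrative)
    queries = torch.randn(2, 30, 2048)     # ~30 text-rate positions
    print(aligner(frames, queries).shape)  # torch.Size([2, 30, 2048])
```

Only the alignment module carries trainable parameters in this sketch, mirroring the paper's setup in which the speech encoder and LLM are kept frozen to preserve their pre-trained knowledge.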