% BibTeX record for the IWSLT 2025 system paper by Koneru et al.
% The MODS XML and EndNote records that follow describe the same work.
@inproceedings{koneru-etal-2025-kits,
  title     = {{KIT}'s Offline Speech Translation and Instruction Following Submission for {IWSLT} 2025},
  author    = {Koneru, Sai and
               Z{\"u}fle, Maike and
               Binh Nguyen, Thai and
               Akti, Seymanur and
               Niehues, Jan and
               Waibel, Alexander},
  editor    = {Salesky, Elizabeth and
               Federico, Marcello and
               Anastasopoulos, Antonis},
  booktitle = {Proceedings of the 22nd International Conference on Spoken Language Translation ({IWSLT} 2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria (in-person and online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.iwslt-1.22/},
  doi       = {10.18653/v1/2025.iwslt-1.22},
  pages     = {232--244},
  isbn      = {979-8-89176-272-5},
  abstract  = {In this paper, we present the submissions for the Offline ST and Instruction Following (IF) tracks, where we leverage LLMs to enhance performance across all tasks. For the Offline ST track, we propose a pipeline that employs multiple automatic speech recognition systems, whose outputs are fused using an LLM with document-level context. This is followed by a two-step translation process, incorporating additional contextual refinement step to improve translation quality. For the IF track, we develop an end-to-end model that integrates a speech encoder with an LLM to perform a wide range of instruction-following tasks. We complement it with a final document-level refinement stage to further enhance output quality by using contextual information.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; its ID matches the citekey identifier further down. -->
  <mods ID="koneru-etal-2025-kits">
    <titleInfo>
      <title>KIT’s Offline Speech Translation and Instruction Following Submission for IWSLT 2025</title>
    </titleInfo>

    <!-- Paper authors: one personal name element per author. -->
    <name type="personal">
      <namePart type="given">Sai</namePart>
      <namePart type="family">Koneru</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Maike</namePart>
      <namePart type="family">Züfle</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Thai</namePart>
      <namePart type="family">Binh Nguyen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Seymanur</namePart>
      <namePart type="family">Akti</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jan</namePart>
      <namePart type="family">Niehues</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alexander</namePart>
      <namePart type="family">Waibel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place, genre and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Elizabeth</namePart>
        <namePart type="family">Salesky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marcello</namePart>
        <namePart type="family">Federico</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Antonis</namePart>
        <namePart type="family">Anastasopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria (in-person and online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-272-5</identifier>
    </relatedItem>

    <abstract>In this paper, we present the submissions for the Offline ST and Instruction Following (IF) tracks, where we leverage LLMs to enhance performance across all tasks. For the Offline ST track, we propose a pipeline that employs multiple automatic speech recognition systems, whose outputs are fused using an LLM with document-level context. This is followed by a two-step translation process, incorporating additional contextual refinement step to improve translation quality. For the IF track, we develop an end-to-end model that integrates a speech encoder with an LLM to perform a wide range of instruction-following tasks. We complement it with a final document-level refinement stage to further enhance output quality by using contextual information.</abstract>

    <!-- Identifiers and landing-page location for the paper itself. -->
    <identifier type="citekey">koneru-etal-2025-kits</identifier>
    <identifier type="doi">10.18653/v1/2025.iwslt-1.22</identifier>
    <location>
      <url>https://aclanthology.org/2025.iwslt-1.22/</url>
    </location>

    <!-- Position of the paper within the host volume (issue date and page extent). -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>232</start>
        <end>244</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T KIT’s Offline Speech Translation and Instruction Following Submission for IWSLT 2025
%A Koneru, Sai
%A Züfle, Maike
%A Binh Nguyen, Thai
%A Akti, Seymanur
%A Niehues, Jan
%A Waibel, Alexander
%Y Salesky, Elizabeth
%Y Federico, Marcello
%Y Anastasopoulos, Antonis
%S Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria (in-person and online)
%@ 979-8-89176-272-5
%F koneru-etal-2025-kits
%X In this paper, we present the submissions for the Offline ST and Instruction Following (IF) tracks, where we leverage LLMs to enhance performance across all tasks. For the Offline ST track, we propose a pipeline that employs multiple automatic speech recognition systems, whose outputs are fused using an LLM with document-level context. This is followed by a two-step translation process, incorporating additional contextual refinement step to improve translation quality. For the IF track, we develop an end-to-end model that integrates a speech encoder with an LLM to perform a wide range of instruction-following tasks. We complement it with a final document-level refinement stage to further enhance output quality by using contextual information.
%R 10.18653/v1/2025.iwslt-1.22
%U https://aclanthology.org/2025.iwslt-1.22/
%U https://doi.org/10.18653/v1/2025.iwslt-1.22
%P 232-244
Markdown (Informal)
[KIT’s Offline Speech Translation and Instruction Following Submission for IWSLT 2025](https://aclanthology.org/2025.iwslt-1.22/) (Koneru et al., IWSLT 2025)
ACL