@inproceedings{moslem-2025-efficient,
    title = "Efficient Speech Translation through Model Compression and Knowledge Distillation",
    author = "Moslem, Yasmin",
    editor = "Salesky, Elizabeth and
      Federico, Marcello and
      Anastasopoulos, Antonis",
    booktitle = "Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)",
    month = jul,
    year = "2025",
    address = "Vienna, Austria (in-person and online)",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.iwslt-1.40/",
    doi = "10.18653/v1/2025.iwslt-1.40",
    pages = "379--388",
    ISBN = "979-8-89176-272-5",
    abstract = "Efficient deployment of large audio-language models for speech translation remains challenging due to their significant computational requirements. In this paper, we address this challenge through our system submissions to the `Model Compression' track at the International Conference on Spoken Language Translation (IWSLT 2025). We experiment with a combination of approaches including iterative layer pruning based on layer importance evaluation, low-rank adaptation with 4-bit quantization (QLoRA), and knowledge distillation. In our experiments, we use Qwen2-Audio-7B-Instruct for speech translation into German and Chinese. Our pruned (student) models achieve up to a 50{\%} reduction in both model parameters and storage footprint, while retaining 97-100{\%} of the translation quality of the in-domain (teacher) models."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="moslem-2025-efficient">
    <titleInfo>
      <title>Efficient Speech Translation through Model Compression and Knowledge Distillation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yasmin</namePart>
      <namePart type="family">Moslem</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Elizabeth</namePart>
        <namePart type="family">Salesky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marcello</namePart>
        <namePart type="family">Federico</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Antonis</namePart>
        <namePart type="family">Anastasopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria (in-person and online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-272-5</identifier>
    </relatedItem>
    <abstract>Efficient deployment of large audio-language models for speech translation remains challenging due to their significant computational requirements. In this paper, we address this challenge through our system submissions to the ‘Model Compression’ track at the International Conference on Spoken Language Translation (IWSLT 2025). We experiment with a combination of approaches including iterative layer pruning based on layer importance evaluation, low-rank adaptation with 4-bit quantization (QLoRA), and knowledge distillation. In our experiments, we use Qwen2-Audio-7B-Instruct for speech translation into German and Chinese. Our pruned (student) models achieve up to a 50% reduction in both model parameters and storage footprint, while retaining 97-100% of the translation quality of the in-domain (teacher) models.</abstract>
    <identifier type="citekey">moslem-2025-efficient</identifier>
    <identifier type="doi">10.18653/v1/2025.iwslt-1.40</identifier>
    <location>
      <url>https://aclanthology.org/2025.iwslt-1.40/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>379</start>
        <end>388</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Efficient Speech Translation through Model Compression and Knowledge Distillation
%A Moslem, Yasmin
%Y Salesky, Elizabeth
%Y Federico, Marcello
%Y Anastasopoulos, Antonis
%S Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria (in-person and online)
%@ 979-8-89176-272-5
%F moslem-2025-efficient
%X Efficient deployment of large audio-language models for speech translation remains challenging due to their significant computational requirements. In this paper, we address this challenge through our system submissions to the ‘Model Compression’ track at the International Conference on Spoken Language Translation (IWSLT 2025). We experiment with a combination of approaches including iterative layer pruning based on layer importance evaluation, low-rank adaptation with 4-bit quantization (QLoRA), and knowledge distillation. In our experiments, we use Qwen2-Audio-7B-Instruct for speech translation into German and Chinese. Our pruned (student) models achieve up to a 50% reduction in both model parameters and storage footprint, while retaining 97-100% of the translation quality of the in-domain (teacher) models.
%R 10.18653/v1/2025.iwslt-1.40
%U https://aclanthology.org/2025.iwslt-1.40/
%U https://doi.org/10.18653/v1/2025.iwslt-1.40
%P 379-388

Markdown (Informal)
[Efficient Speech Translation through Model Compression and Knowledge Distillation](https://aclanthology.org/2025.iwslt-1.40/) (Moslem, IWSLT 2025)

ACL
Yasmin Moslem. 2025. Efficient Speech Translation through Model Compression and Knowledge Distillation. In Proceedings of the 22nd International Conference on Spoken Language Translation (IWSLT 2025), pages 379–388, Vienna, Austria (in-person and online). Association for Computational Linguistics.