@inproceedings{lin-etal-2025-align,
title = "Align-{SLM}: Textless Spoken Language Models with Reinforcement Learning from {AI} Feedback",
author = "Lin, Guan-Ting and
Shivakumar, Prashanth Gurunath and
Gourav, Aditya and
Gu, Yile and
Gandhe, Ankur and
Lee, Hung-yi and
Bulyko, Ivan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.997/",
doi = "10.18653/v1/2025.acl-long.997",
pages = "20395--20411",
ISBN = "979-8-89176-251-0",
abstract = "While textless Spoken Language Models (SLMs) have shown potential in end-to-end speech-to-speech modeling, they still lag behind text-based Large Language Models (LLMs) in terms of semantic coherence and relevance. This work introduces the \textbf{Align-SLM} framework, which leverages preference optimization inspired by Reinforcement Learning with Human Feedback (RLHF) to enhance the semantic understanding of SLMs. Our approach generates multiple speech continuations from a given prompt and uses LLM-based semantic metrics to create preference data for Direct Preference Optimization (DPO). We evaluate the framework using ZeroSpeech 2021 benchmarks for lexical and syntactic modeling, the spoken version of the StoryCloze dataset for semantic coherence, and other speech generation metrics, including the GPT4-o score and human evaluation. Experimental results show that our method achieves the state-of-the-art performance of SLMs for most benchmarks, highlighting the importance of preference optimization to improve the semantics of SLMs."
}
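
For context on the method the abstract summarizes, below is a minimal, hypothetical Python sketch of the recipe: sample several continuations per prompt, score them with a semantic metric, keep the best and worst as a preference pair, and train with the DPO loss. generate_continuations and semantic_score are illustrative placeholders, not the paper's actual interfaces; only the DPO loss itself follows its standard published form (Rafailov et al., 2023).

    # Illustrative sketch only; not the authors' implementation.
    import torch
    import torch.nn.functional as F

    def build_preference_pair(prompt, generate_continuations, semantic_score, n=4):
        """Sample n continuations and keep the best/worst as (chosen, rejected).

        generate_continuations and semantic_score are hypothetical stand-ins
        for the SLM sampler and the paper's LLM-based semantic metric.
        """
        candidates = generate_continuations(prompt, n)    # list of continuations
        ranked = sorted(candidates, key=semantic_score)   # low -> high score
        return ranked[-1], ranked[0]                      # (chosen, rejected)

    def dpo_loss(policy_chosen_logp, policy_rejected_logp,
                 ref_chosen_logp, ref_rejected_logp, beta=0.1):
        """Direct Preference Optimization loss for a batch of pairs.

        Each argument is a tensor of summed token log-probabilities of the
        chosen or rejected continuation under the trainable policy or the
        frozen reference model.
        """
        chosen_margin = policy_chosen_logp - ref_chosen_logp
        rejected_margin = policy_rejected_logp - ref_rejected_logp
        # Push the policy to prefer the semantically better continuation.
        return -F.logsigmoid(beta * (chosen_margin - rejected_margin)).mean()

    if __name__ == "__main__":
        # Toy tensors in place of real model log-probs, just to exercise the loss.
        args = [torch.randn(8) for _ in range(4)]
        print(dpo_loss(*args).item())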