@inproceedings{levandovsky-2025-deep,
title = "Deep Reinforcement Learning of {LLM}s using {RLHF}",
author = "Levandovsky, Enoch",
editor = "Whetten, Ryan and
Sucal, Virgile and
Ngo, Anh and
Chalamalasetti, Kranti and
Inoue, Koji and
Cimino, Gaetano and
Yang, Zachary and
Zenimoto, Yuki and
Rodriguez, Ricardo",
booktitle = "Proceedings of the 21st Workshop of Young Researchers' Roundtable on Spoken Dialogue Systems",
month = aug,
year = "2025",
address = "Avignon, France",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.yrrsds-1.2/",
pages = "4--5",
abstract = "My main research interests lies in the application of Reinforcement Learning (RL) alignment of LLMs in human robot dialogue. More specifically, my latest research aims to use RL alignment as an efficient training regime to train a newly initialized tiny LM to behave like a toddler. Previous research expresses the difficulty of building a robust tiny LM with an educated adult level understanding. Our hypothesis is that the cognitive barrier to train a tiny LM to at-least behave as a child is achievable with a very small number of parameters especially if training efficiently using RL LLM training regime. My interests also extend to apply RL to LLM training for dialogue management and planning."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="levandovsky-2025-deep">
<titleInfo>
<title>Deep Reinforcement Learning of LLMs using RLHF</title>
</titleInfo>
<name type="personal">
<namePart type="given">Enoch</namePart>
<namePart type="family">Levandovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Whetten</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Virgile</namePart>
<namePart type="family">Sucal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anh</namePart>
<namePart type="family">Ngo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kranti</namePart>
<namePart type="family">Chalamalasetti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koji</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gaetano</namePart>
<namePart type="family">Cimino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zachary</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuki</namePart>
<namePart type="family">Zenimoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ricardo</namePart>
<namePart type="family">Rodriguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Avignon, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>My main research interest lies in the application of Reinforcement Learning (RL) alignment of LLMs to human-robot dialogue. More specifically, my latest research aims to use RL alignment as an efficient training regime to train a newly initialized tiny LM to behave like a toddler. Previous research highlights the difficulty of building a robust tiny LM with an educated adult-level understanding. Our hypothesis is that this cognitive barrier can be overcome: training a tiny LM to at least behave like a child is achievable with a very small number of parameters, especially if it is trained efficiently using an RL-based LLM training regime. My interests also extend to applying RL to LLM training for dialogue management and planning.</abstract>
<identifier type="citekey">levandovsky-2025-deep</identifier>
<location>
<url>https://aclanthology.org/2025.yrrsds-1.2/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>4</start>
<end>5</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Deep Reinforcement Learning of LLMs using RLHF
%A Levandovsky, Enoch
%Y Whetten, Ryan
%Y Sucal, Virgile
%Y Ngo, Anh
%Y Chalamalasetti, Kranti
%Y Inoue, Koji
%Y Cimino, Gaetano
%Y Yang, Zachary
%Y Zenimoto, Yuki
%Y Rodriguez, Ricardo
%S Proceedings of the 21st Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F levandovsky-2025-deep
%X My main research interest lies in the application of Reinforcement Learning (RL) alignment of LLMs to human-robot dialogue. More specifically, my latest research aims to use RL alignment as an efficient training regime to train a newly initialized tiny LM to behave like a toddler. Previous research highlights the difficulty of building a robust tiny LM with an educated adult-level understanding. Our hypothesis is that this cognitive barrier can be overcome: training a tiny LM to at least behave like a child is achievable with a very small number of parameters, especially if it is trained efficiently using an RL-based LLM training regime. My interests also extend to applying RL to LLM training for dialogue management and planning.
%U https://aclanthology.org/2025.yrrsds-1.2/
%P 4-5
Markdown (Informal)
[Deep Reinforcement Learning of LLMs using RLHF](https://aclanthology.org/2025.yrrsds-1.2/) (Levandovsky, YRRSDS 2025)
ACL
Enoch Levandovsky. 2025. Deep Reinforcement Learning of LLMs using RLHF. In Proceedings of the 21st Workshop of Young Researchers' Roundtable on Spoken Dialogue Systems, pages 4–5, Avignon, France. Association for Computational Linguistics.