@inproceedings{liu-etal-2025-haf,
title = "{HAF}-{RM}: A Hybrid Alignment Framework for Reward Model Training",
author = "Liu, Shujun and
Shen, Xiaoyu and
Lai, Yuhang and
Wang, Siyuan and
Yue, Shengbin and
Huang, Zengfeng and
Huang, Xuanjing and
Wei, Zhongyu",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.924/",
doi = "10.18653/v1/2025.acl-long.924",
pages = "18874--18893",
ISBN = "979-8-89176-251-0",
abstract = "The reward model has become increasingly important in alignment, assessment, and data construction for large language models (LLMs). Most existing researchers focus on enhancing reward models through data improvements, following the conventional training framework for reward models that directly optimizes the predicted rewards.In this paper, we propose a hybrid alignment framework **HAF-RM** for reward model training by introducing an additional constraint on token-level policy probabilities in addition to the reward score. It can simultaneously supervise the internal preference model at the token level and optimize the mapping layer of the reward model at the sequence level.Experiment results on five datasets sufficiently show the validity and effectiveness of our proposed hybrid framework for training a high-quality reward model.By decoupling the reward modeling procedure and incorporating hybrid supervision, our **HAF-RM** framework offers a principled and effective approach to enhancing the performance and alignment of reward models, a critical component in the responsible development of powerful language models. We release our code at [https://haf-rm.github.io](https://haf-rm.github.io)."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2025-haf">
<titleInfo>
<title>HAF-RM: A Hybrid Alignment Framework for Reward Model Training</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shujun</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoyu</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuhang</namePart>
<namePart type="family">Lai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siyuan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shengbin</namePart>
<namePart type="family">Yue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zengfeng</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongyu</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>The reward model has become increasingly important in alignment, assessment, and data construction for large language models (LLMs). Most existing research focuses on enhancing reward models through data improvements, following the conventional training framework that directly optimizes the predicted rewards. In this paper, we propose a hybrid alignment framework, HAF-RM, for reward model training that introduces an additional constraint on token-level policy probabilities alongside the reward score. It simultaneously supervises the internal preference model at the token level and optimizes the mapping layer of the reward model at the sequence level. Experimental results on five datasets demonstrate the validity and effectiveness of our proposed hybrid framework for training a high-quality reward model. By decoupling the reward modeling procedure and incorporating hybrid supervision, our HAF-RM framework offers a principled and effective approach to enhancing the performance and alignment of reward models, a critical component in the responsible development of powerful language models. We release our code at https://haf-rm.github.io.</abstract>
<identifier type="citekey">liu-etal-2025-haf</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.924</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.924/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>18874</start>
<end>18893</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HAF-RM: A Hybrid Alignment Framework for Reward Model Training
%A Liu, Shujun
%A Shen, Xiaoyu
%A Lai, Yuhang
%A Wang, Siyuan
%A Yue, Shengbin
%A Huang, Zengfeng
%A Huang, Xuanjing
%A Wei, Zhongyu
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F liu-etal-2025-haf
%X The reward model has become increasingly important in alignment, assessment, and data construction for large language models (LLMs). Most existing research focuses on enhancing reward models through data improvements, following the conventional training framework that directly optimizes the predicted rewards. In this paper, we propose a hybrid alignment framework, HAF-RM, for reward model training that introduces an additional constraint on token-level policy probabilities alongside the reward score. It simultaneously supervises the internal preference model at the token level and optimizes the mapping layer of the reward model at the sequence level. Experimental results on five datasets demonstrate the validity and effectiveness of our proposed hybrid framework for training a high-quality reward model. By decoupling the reward modeling procedure and incorporating hybrid supervision, our HAF-RM framework offers a principled and effective approach to enhancing the performance and alignment of reward models, a critical component in the responsible development of powerful language models. We release our code at https://haf-rm.github.io.
%R 10.18653/v1/2025.acl-long.924
%U https://aclanthology.org/2025.acl-long.924/
%U https://doi.org/10.18653/v1/2025.acl-long.924
%P 18874-18893
Markdown (Informal)
[HAF-RM: A Hybrid Alignment Framework for Reward Model Training](https://aclanthology.org/2025.acl-long.924/) (Liu et al., ACL 2025)
ACL
- Shujun Liu, Xiaoyu Shen, Yuhang Lai, Siyuan Wang, Shengbin Yue, Zengfeng Huang, Xuanjing Huang, and Zhongyu Wei. 2025. HAF-RM: A Hybrid Alignment Framework for Reward Model Training. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 18874–18893, Vienna, Austria. Association for Computational Linguistics.
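For readers who want a concrete picture of the hybrid supervision described in the abstract, the sketch below combines a sequence-level pairwise reward loss with a token-level policy-probability term. It is a minimal illustration under assumptions — the DPO-style token-level term, the `alpha`/`beta` weights, and all variable names are ours — and not the implementation released at https://haf-rm.github.io.

```python
# Hypothetical sketch of a hybrid reward-model objective in the spirit of the
# abstract: sequence-level reward supervision plus a token-level policy constraint.
# Not the authors' code; the weighting scheme and DPO-style term are assumptions.
import torch
import torch.nn.functional as F


def hybrid_loss(
    reward_chosen: torch.Tensor,      # (batch,) scalar rewards for preferred responses
    reward_rejected: torch.Tensor,    # (batch,) scalar rewards for dispreferred responses
    logp_chosen: torch.Tensor,        # (batch,) summed token log-probs of preferred responses
    logp_rejected: torch.Tensor,      # (batch,) summed token log-probs of dispreferred responses
    ref_logp_chosen: torch.Tensor,    # (batch,) same, under a frozen reference policy
    ref_logp_rejected: torch.Tensor,
    beta: float = 0.1,                # temperature of the token-level term (assumed)
    alpha: float = 0.5,               # mixing weight between the two terms (assumed)
) -> torch.Tensor:
    # Sequence-level term: standard Bradley-Terry ranking loss on the reward head.
    reward_loss = -F.logsigmoid(reward_chosen - reward_rejected).mean()

    # Token-level term: constrain the policy probabilities so the backbone's
    # implicit preference also favors the chosen response (DPO-style).
    chosen_ratio = logp_chosen - ref_logp_chosen
    rejected_ratio = logp_rejected - ref_logp_rejected
    policy_loss = -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()

    # Hybrid objective: jointly supervise the reward head (sequence level)
    # and the policy probabilities (token level).
    return alpha * reward_loss + (1.0 - alpha) * policy_loss
```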