@inproceedings{ye-etal-2025-improving,
    title = "Improving Reward Models with Synthetic Critiques",
    author = "Ye, Zihuiwen and
      Greenlee, Fraser David and
      Bartolo, Max and
      Blunsom, Phil and
      Campos, Jon Ander and
      Gall{\'e}, Matthias",
    editor = "Chiruzzo, Luis and
      Ritter, Alan and
      Wang, Lu",
    booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
    month = apr,
    year = "2025",
    address = "Albuquerque, New Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-naacl.254/",
    doi = "10.18653/v1/2025.findings-naacl.254",
    pages = "4506--4520",
    ISBN = "979-8-89176-195-7",
    abstract = "Reward models (RMs) play a critical role in aligning language models through the process of reinforcement learning from human feedback. RMs are trained to predict a score reflecting human preference, which requires significant time and cost for human annotation. Additionally, RMs tend to quickly overfit on superficial features in the training set, hindering their generalization performance on unseen distributions. We propose a novel approach using synthetic natural language critiques generated by large language models to provide additional feedback, evaluating aspects such as instruction following, correctness, and style. This offers richer signals and more robust features for RMs to assess and score on. We demonstrate that high-quality critiques improve the performance and data efficiency of RMs initialized from different pretrained models, reducing the reliance on costly human annotations. Furthermore, incorporating critiques improves both the interpretability and robustness of RM training."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ye-etal-2025-improving">
    <titleInfo>
      <title>Improving Reward Models with Synthetic Critiques</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Zihuiwen</namePart>
      <namePart type="family">Ye</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fraser</namePart>
      <namePart type="given">David</namePart>
      <namePart type="family">Greenlee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Max</namePart>
      <namePart type="family">Bartolo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Phil</namePart>
      <namePart type="family">Blunsom</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jon</namePart>
      <namePart type="given">Ander</namePart>
      <namePart type="family">Campos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Matthias</namePart>
      <namePart type="family">Gallé</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-195-7</identifier>
    </relatedItem>
    <abstract>Reward models (RMs) play a critical role in aligning language models through the process of reinforcement learning from human feedback. RMs are trained to predict a score reflecting human preference, which requires significant time and cost for human annotation. Additionally, RMs tend to quickly overfit on superficial features in the training set, hindering their generalization performance on unseen distributions. We propose a novel approach using synthetic natural language critiques generated by large language models to provide additional feedback, evaluating aspects such as instruction following, correctness, and style. This offers richer signals and more robust features for RMs to assess and score on. We demonstrate that high-quality critiques improve the performance and data efficiency of RMs initialized from different pretrained models, reducing the reliance on costly human annotations. Furthermore, incorporating critiques improves both the interpretability and robustness of RM training.</abstract>
    <identifier type="citekey">ye-etal-2025-improving</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-naacl.254</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-naacl.254/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>4506</start>
        <end>4520</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Reward Models with Synthetic Critiques
%A Ye, Zihuiwen
%A Greenlee, Fraser David
%A Bartolo, Max
%A Blunsom, Phil
%A Campos, Jon Ander
%A Gallé, Matthias
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F ye-etal-2025-improving
%X Reward models (RMs) play a critical role in aligning language models through the process of reinforcement learning from human feedback. RMs are trained to predict a score reflecting human preference, which requires significant time and cost for human annotation. Additionally, RMs tend to quickly overfit on superficial features in the training set, hindering their generalization performance on unseen distributions. We propose a novel approach using synthetic natural language critiques generated by large language models to provide additional feedback, evaluating aspects such as instruction following, correctness, and style. This offers richer signals and more robust features for RMs to assess and score on. We demonstrate that high-quality critiques improve the performance and data efficiency of RMs initialized from different pretrained models, reducing the reliance on costly human annotations. Furthermore, incorporating critiques improves both the interpretability and robustness of RM training.
%R 10.18653/v1/2025.findings-naacl.254
%U https://aclanthology.org/2025.findings-naacl.254/
%U https://doi.org/10.18653/v1/2025.findings-naacl.254
%P 4506-4520
Markdown (Informal)
[Improving Reward Models with Synthetic Critiques](https://aclanthology.org/2025.findings-naacl.254/) (Ye et al., Findings 2025)
ACL
Zihuiwen Ye, Fraser David Greenlee, Max Bartolo, Phil Blunsom, Jon Ander Campos, and Matthias Gallé. 2025. Improving Reward Models with Synthetic Critiques. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 4506–4520, Albuquerque, New Mexico. Association for Computational Linguistics.
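
As a rough illustration of the approach the abstract describes, here is a minimal sketch of a reward model that scores a response conditioned on a synthetic critique, trained with a pairwise preference objective. The backbone name (`gpt2`), the `format_input` template, and the Bradley-Terry-style loss are illustrative assumptions for the sketch, not the authors' released implementation.

```python
# Minimal sketch: a critique-conditioned reward model with a pairwise preference loss.
# All model names, templates, and training choices here are illustrative assumptions,
# not the paper's code.
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer


class CritiqueRewardModel(torch.nn.Module):
    """Scalar reward head on top of a pretrained transformer backbone."""

    def __init__(self, backbone_name: str = "gpt2"):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(backbone_name)
        self.score_head = torch.nn.Linear(self.backbone.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        hidden = self.backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Pool the hidden state of the last non-padding token and map it to a scalar score.
        last_idx = attention_mask.sum(dim=1) - 1
        pooled = hidden[torch.arange(hidden.size(0)), last_idx]
        return self.score_head(pooled).squeeze(-1)


def format_input(prompt: str, response: str, critique: str) -> str:
    # Hypothetical template: the critique (e.g. generated by a separate LLM and covering
    # instruction following, correctness, and style) is appended so the RM conditions on it.
    return f"Prompt: {prompt}\nResponse: {response}\nCritique: {critique}\nScore:"


def pairwise_loss(rm, tokenizer, prompt, chosen, rejected, chosen_critique, rejected_critique):
    texts = [
        format_input(prompt, chosen, chosen_critique),
        format_input(prompt, rejected, rejected_critique),
    ]
    batch = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    scores = rm(batch["input_ids"], batch["attention_mask"])
    # Bradley-Terry-style objective: the preferred response should receive the higher score.
    return -F.logsigmoid(scores[0] - scores[1])


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no padding token by default
    rm = CritiqueRewardModel("gpt2")
    loss = pairwise_loss(
        rm, tokenizer,
        prompt="Summarise the article in one sentence.",
        chosen="A faithful one-sentence summary of the article.",
        rejected="A rambling reply that ignores the length constraint.",
        chosen_critique="Follows the instruction, factually consistent, concise style.",
        rejected_critique="Ignores the one-sentence constraint and drifts off topic.",
    )
    loss.backward()
    print(f"pairwise loss: {loss.item():.4f}")
```

Conditioning the scoring head on an explicit critique gives it features tied to named evaluation criteria rather than surface cues alone, which is the intuition the abstract offers for the reported gains in robustness and data efficiency.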