@article{yu-etal-2025-diverse,
title = "Diverse {AI} Feedback For Large Language Model Alignment",
author = "Yu, Tianshu and
Lin, Ting-En and
Wu, Yuchuan and
Yang, Min and
Huang, Fei and
Li, Yongbin",
journal = "Transactions of the Association for Computational Linguistics",
volume = "13",
year = "2025",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2025.tacl-1.19/",
doi = "10.1162/tacl_a_00746",
pages = "392--407",
abstract = "Recent advances in large language models (LLMs) focus on aligning models with human values to minimize harmful content. However, existing methods often rely on a single type of feedback, such as preferences, annotated labels, or critiques, which can lead to overfitting and suboptimal performance. In this paper, we propose Diverse AIFeedback (DAIF), a novel approach that integrates three types of feedback{---}critique, refinement, and preference{---}tailored to tasks of varying uncertainty levels. Through an analysis of information gain, we show that critique feedback is most effective for low-uncertainty tasks, refinement feedback for medium-uncertainty tasks, and preference feedback for high-uncertainty tasks. Training with this diversified feedback reduces overfitting and improves alignment. Experimental results across three tasks{---}question answering, dialog generation, and text summarization{--}demonstrate that DAIF outperforms traditional methods relying on a single feedback type.1"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2025-diverse">
<titleInfo>
<title>Diverse AI Feedback For Large Language Model Alignment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianshu</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ting-En</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuchuan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongbin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Recent advances in large language models (LLMs) focus on aligning models with human values to minimize harmful content. However, existing methods often rely on a single type of feedback, such as preferences, annotated labels, or critiques, which can lead to overfitting and suboptimal performance. In this paper, we propose Diverse AI Feedback (DAIF), a novel approach that integrates three types of feedback—critique, refinement, and preference—tailored to tasks of varying uncertainty levels. Through an analysis of information gain, we show that critique feedback is most effective for low-uncertainty tasks, refinement feedback for medium-uncertainty tasks, and preference feedback for high-uncertainty tasks. Training with this diversified feedback reduces overfitting and improves alignment. Experimental results across three tasks—question answering, dialog generation, and text summarization—demonstrate that DAIF outperforms traditional methods relying on a single feedback type.</abstract>
<identifier type="citekey">yu-etal-2025-diverse</identifier>
<identifier type="doi">10.1162/tacl_a_00746</identifier>
<location>
<url>https://aclanthology.org/2025.tacl-1.19/</url>
</location>
<part>
<date>2025</date>
<detail type="volume"><number>13</number></detail>
<extent unit="page">
<start>392</start>
<end>407</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Diverse AI Feedback For Large Language Model Alignment
%A Yu, Tianshu
%A Lin, Ting-En
%A Wu, Yuchuan
%A Yang, Min
%A Huang, Fei
%A Li, Yongbin
%J Transactions of the Association for Computational Linguistics
%D 2025
%V 13
%I MIT Press
%C Cambridge, MA
%F yu-etal-2025-diverse
%X Recent advances in large language models (LLMs) focus on aligning models with human values to minimize harmful content. However, existing methods often rely on a single type of feedback, such as preferences, annotated labels, or critiques, which can lead to overfitting and suboptimal performance. In this paper, we propose Diverse AI Feedback (DAIF), a novel approach that integrates three types of feedback—critique, refinement, and preference—tailored to tasks of varying uncertainty levels. Through an analysis of information gain, we show that critique feedback is most effective for low-uncertainty tasks, refinement feedback for medium-uncertainty tasks, and preference feedback for high-uncertainty tasks. Training with this diversified feedback reduces overfitting and improves alignment. Experimental results across three tasks—question answering, dialog generation, and text summarization—demonstrate that DAIF outperforms traditional methods relying on a single feedback type.
%R 10.1162/tacl_a_00746
%U https://aclanthology.org/2025.tacl-1.19/
%U https://doi.org/10.1162/tacl_a_00746
%P 392-407
Markdown (Informal)
[Diverse AI Feedback For Large Language Model Alignment](https://aclanthology.org/2025.tacl-1.19/) (Yu et al., TACL 2025)
ACL
Tianshu Yu, Ting-En Lin, Yuchuan Wu, Min Yang, Fei Huang, and Yongbin Li. 2025. Diverse AI Feedback For Large Language Model Alignment. Transactions of the Association for Computational Linguistics, 13:392–407.