@inproceedings{nguyen-etal-2024-curatron,
title = "{CURATRON}: Complete and Robust Preference Data for Rigorous Alignment of Large Language Models",
author = "Nguyen, Son The and
Naresh, Niranjan Uma and
Tulabandhula, Theja",
editor = "Dragut, Eduard and
Li, Yunyao and
Popa, Lucian and
Vucetic, Slobodan and
Srivastava, Shashank",
booktitle = "Proceedings of the Fifth Workshop on Data Science with Human-in-the-Loop (DaSH 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.dash-1.5",
doi = "10.18653/v1/2024.dash-1.5",
pages = "31--39",
abstract = "This paper addresses the challenges of aligning large language models (LLMs) with human values via preference learning (PL), focusing on incomplete and corrupted data in preference datasets. We propose a novel method for robustly and completely recalibrating values within these datasets to enhance LLMs{'} resilience against the issues. In particular, we devise a guaranteed polynomial time ranking algorithm that robustifies several existing models, such as the classic Bradley{--}Terry{--}Luce (BTL) model and certain generalizations of it. To the best of our knowledge, our present work is the first to propose an algorithm that provably recovers an $\epsilon$-optimal ranking with high probability while allowing as large as $O(n)$ perturbed pairwise comparison results per model response. Furthermore, we show robust recovery results in the partially observed setting. Our experiments confirm that our algorithms handle adversarial noise and unobserved comparisons well in LLM preference dataset settings. This work contributes to the development and scaling of more reliable and ethically aligned AI models by equipping the dataset curation pipeline with the ability to handle missing and maliciously manipulated inputs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nguyen-etal-2024-curatron">
<titleInfo>
<title>CURATRON: Complete and Robust Preference Data for Rigorous Alignment of Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Son</namePart>
<namePart type="given">The</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niranjan</namePart>
<namePart type="given">Uma</namePart>
<namePart type="family">Naresh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Theja</namePart>
<namePart type="family">Tulabandhula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Data Science with Human-in-the-Loop (DaSH 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Dragut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucian</namePart>
<namePart type="family">Popa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slobodan</namePart>
<namePart type="family">Vucetic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shashank</namePart>
<namePart type="family">Srivastava</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>This paper addresses the challenges of aligning large language models (LLMs) with human values via preference learning (PL), focusing on incomplete and corrupted data in preference datasets. We propose a novel method for robustly and completely recalibrating values within these datasets to enhance LLMs’ resilience against these issues. In particular, we devise a guaranteed polynomial-time ranking algorithm that robustifies several existing models, such as the classic Bradley–Terry–Luce (BTL) model and certain generalizations of it. To the best of our knowledge, our present work is the first to propose an algorithm that provably recovers an ε-optimal ranking with high probability while allowing up to O(n) perturbed pairwise comparison results per model response. Furthermore, we show robust recovery results in the partially observed setting. Our experiments confirm that our algorithms handle adversarial noise and unobserved comparisons well in LLM preference dataset settings. This work contributes to the development and scaling of more reliable and ethically aligned AI models by equipping the dataset curation pipeline with the ability to handle missing and maliciously manipulated inputs.</abstract>
<identifier type="citekey">nguyen-etal-2024-curatron</identifier>
<identifier type="doi">10.18653/v1/2024.dash-1.5</identifier>
<location>
<url>https://aclanthology.org/2024.dash-1.5</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>31</start>
<end>39</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CURATRON: Complete and Robust Preference Data for Rigorous Alignment of Large Language Models
%A Nguyen, Son The
%A Naresh, Niranjan Uma
%A Tulabandhula, Theja
%Y Dragut, Eduard
%Y Li, Yunyao
%Y Popa, Lucian
%Y Vucetic, Slobodan
%Y Srivastava, Shashank
%S Proceedings of the Fifth Workshop on Data Science with Human-in-the-Loop (DaSH 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F nguyen-etal-2024-curatron
%X This paper addresses the challenges of aligning large language models (LLMs) with human values via preference learning (PL), focusing on incomplete and corrupted data in preference datasets. We propose a novel method for robustly and completely recalibrating values within these datasets to enhance LLMs’ resilience against these issues. In particular, we devise a guaranteed polynomial-time ranking algorithm that robustifies several existing models, such as the classic Bradley–Terry–Luce (BTL) model and certain generalizations of it. To the best of our knowledge, our present work is the first to propose an algorithm that provably recovers an ε-optimal ranking with high probability while allowing up to O(n) perturbed pairwise comparison results per model response. Furthermore, we show robust recovery results in the partially observed setting. Our experiments confirm that our algorithms handle adversarial noise and unobserved comparisons well in LLM preference dataset settings. This work contributes to the development and scaling of more reliable and ethically aligned AI models by equipping the dataset curation pipeline with the ability to handle missing and maliciously manipulated inputs.
%R 10.18653/v1/2024.dash-1.5
%U https://aclanthology.org/2024.dash-1.5
%U https://doi.org/10.18653/v1/2024.dash-1.5
%P 31-39
Markdown (Informal)
[CURATRON: Complete and Robust Preference Data for Rigorous Alignment of Large Language Models](https://aclanthology.org/2024.dash-1.5) (Nguyen et al., DaSH-WS 2024)
ACL
Son The Nguyen, Niranjan Uma Naresh, and Theja Tulabandhula. 2024. [CURATRON: Complete and Robust Preference Data for Rigorous Alignment of Large Language Models](https://aclanthology.org/2024.dash-1.5). In *Proceedings of the Fifth Workshop on Data Science with Human-in-the-Loop (DaSH 2024)*, pages 31–39, Mexico City, Mexico. Association for Computational Linguistics.
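
For readers skimming the abstract, the corruption scenario it targets is easy to picture in code. The sketch below simulates Bradley–Terry–Luce (BTL) pairwise comparisons, adversarially flips every comparison involving one response (the O(n)-perturbations-per-response regime the abstract mentions), and shows how a naive win-count ranking breaks. It is a minimal illustration of the problem setting only; all parameters are hypothetical, and it does not implement CURATRON's recovery algorithm.

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical setup: n model responses with latent BTL scores w_i > 0,
# where P(i beats j) = w_i / (w_i + w_j). Parameters are illustrative.
n, k = 8, 200                                  # responses, trials per pair
w = np.sort(rng.uniform(0.5, 2.0, n))[::-1]    # true scores, descending

P = w[:, None] / (w[:, None] + w[None, :])     # BTL win probabilities
U = np.triu(rng.binomial(k, P), 1)             # sample each pair once
wins = U + np.tril(k - U.T, -1)                # enforce wins[i,j] + wins[j,i] = k

# Adversarially flip all comparisons touching the weakest response:
# O(n) perturbed entries, as the abstract's corruption model allows.
j = n - 1
wins[j, :] = k - wins[j, :]
wins[:, j] = k - wins[:, j]
np.fill_diagonal(wins, 0)

# A naive Borda-style ranking by total wins is fooled by the corruption;
# guaranteeing recovery despite it is what the paper proves.
naive_rank = np.argsort(-wins.sum(axis=1))
print("true order    :", list(range(n)))
print("naive estimate:", naive_rank.tolist())
```

In the complete, uncorrupted regime, ranking by win totals recovers the BTL order; the paper's stated contribution is an ε-optimal ranking with high probability even when up to O(n) comparisons per response are perturbed and some comparisons are unobserved.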