@inproceedings{dsouza-kovatchev-2025-sources,
title = "Sources of Disagreement in Data for {LLM} Instruction Tuning",
author = "Dsouza, Russel and
Kovatchev, Venelin",
editor = "Roth, Michael and
Schlechtweg, Dominik",
booktitle = "Proceedings of Context and Meaning: Navigating Disagreements in NLP Annotation",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2025.comedi-1.3/",
pages = "20--32",
abstract = "In this paper we study the patterns of label disagreement in data used for instruction tuning Large Language models (LLMs). Specifically, we focus on data used for Reinforcement Learning from Human Feedback (RLHF). Our objective is to determine what is the primary source of disagreement: the individual data points, the choice of annotators, or the task formulation. We annotate the same dataset multiple times under different conditions and compare the overall agreement and the patterns of disagreement. For task formulation, we compare {\textquotedblleft}single{\textquotedblright} format where annotators rate LLM responses individually with {\textquotedblleft}preference{\textquotedblright} format where annotators select one of two possible responses. For annotators, we compare data from human labelers with automatic data labeling using LLMs. Our results indicate that: (1) there are very few {\textquotedblleft}universally ambiguous{\textquotedblright} instances. The label disagreement depends largely on the task formulation and the choice of annotators; (2) the overall agreement remains consistent across experiments. We find no evidence that {\textquotedblleft}preference{\textquotedblright} data is of higher quality than {\textquotedblleft}single{\textquotedblright} data; and (3) the change of task formulation and annotators impacts the resulting instance-level labels. The labels obtained in different experiments are correlated, but not identical."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="dsouza-kovatchev-2025-sources">
    <titleInfo>
      <title>Sources of Disagreement in Data for LLM Instruction Tuning</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Russel</namePart>
      <namePart type="family">Dsouza</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Venelin</namePart>
      <namePart type="family">Kovatchev</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of Context and Meaning: Navigating Disagreements in NLP Annotation</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Michael</namePart>
        <namePart type="family">Roth</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Dominik</namePart>
        <namePart type="family">Schlechtweg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>International Committee on Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper, we study the patterns of label disagreement in data used for instruction tuning Large Language Models (LLMs). Specifically, we focus on data used for Reinforcement Learning from Human Feedback (RLHF). Our objective is to determine the primary source of disagreement: the individual data points, the choice of annotators, or the task formulation. We annotate the same dataset multiple times under different conditions and compare the overall agreement and the patterns of disagreement. For task formulation, we compare a “single” format, where annotators rate LLM responses individually, with a “preference” format, where annotators select one of two possible responses. For annotators, we compare data from human labelers with automatic data labeling using LLMs. Our results indicate that: (1) there are very few “universally ambiguous” instances. The label disagreement depends largely on the task formulation and the choice of annotators; (2) the overall agreement remains consistent across experiments. We find no evidence that “preference” data is of higher quality than “single” data; and (3) changes in task formulation and annotators impact the resulting instance-level labels. The labels obtained in different experiments are correlated, but not identical.</abstract>
    <identifier type="citekey">dsouza-kovatchev-2025-sources</identifier>
    <location>
      <url>https://aclanthology.org/2025.comedi-1.3/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>20</start>
        <end>32</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Sources of Disagreement in Data for LLM Instruction Tuning
%A Dsouza, Russel
%A Kovatchev, Venelin
%Y Roth, Michael
%Y Schlechtweg, Dominik
%S Proceedings of Context and Meaning: Navigating Disagreements in NLP Annotation
%D 2025
%8 January
%I International Committee on Computational Linguistics
%C Abu Dhabi, UAE
%F dsouza-kovatchev-2025-sources
%X In this paper, we study the patterns of label disagreement in data used for instruction tuning Large Language Models (LLMs). Specifically, we focus on data used for Reinforcement Learning from Human Feedback (RLHF). Our objective is to determine the primary source of disagreement: the individual data points, the choice of annotators, or the task formulation. We annotate the same dataset multiple times under different conditions and compare the overall agreement and the patterns of disagreement. For task formulation, we compare a “single” format, where annotators rate LLM responses individually, with a “preference” format, where annotators select one of two possible responses. For annotators, we compare data from human labelers with automatic data labeling using LLMs. Our results indicate that: (1) there are very few “universally ambiguous” instances. The label disagreement depends largely on the task formulation and the choice of annotators; (2) the overall agreement remains consistent across experiments. We find no evidence that “preference” data is of higher quality than “single” data; and (3) changes in task formulation and annotators impact the resulting instance-level labels. The labels obtained in different experiments are correlated, but not identical.
%U https://aclanthology.org/2025.comedi-1.3/
%P 20-32
Markdown (Informal)
[Sources of Disagreement in Data for LLM Instruction Tuning](https://aclanthology.org/2025.comedi-1.3/) (Dsouza & Kovatchev, CoMeDi 2025)
ACL
Russel Dsouza and Venelin Kovatchev. 2025. Sources of Disagreement in Data for LLM Instruction Tuning. In Proceedings of Context and Meaning: Navigating Disagreements in NLP Annotation, pages 20–32, Abu Dhabi, UAE. International Committee on Computational Linguistics.