% Conference paper: Kulmizev et al., ACL 2026 (Volume 1: Long Papers), pages 16526-16543.
% Acronyms and proper nouns are brace-protected as whole words so that
% sentence-casing styles keep their capitalisation.
% NOTE(review): the address field presumably holds the conference venue rather
% than the publisher's city; confirm before relying on styles that print it.
@inproceedings{kulmizev-etal-2026-label,
  title     = {Label and Explanation Variation in {LLM}-Based Annotation: a Case Study in Natural Language Inference},
  author    = {Kulmizev, Artur and
               Lombart, Erika and
               Watrin, Patrick and
               de Marneffe, Marie-Catherine},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.752/},
  pages     = {16526--16543},
  isbn      = {979-8-89176-390-6},
  abstract  = {Large language models (LLMs) have shown considerable promise for annotation purposes, yet questions remain about their ability to capture human label variation (HLV) {---} genuine disagreement between annotators often observed across NLP tasks. Here, we investigate how label and explanation variation manifests within and across LLMs with respect to the Natural Language Inference (NLI) task. Using zero-shot prompting with exact human annotation instructions, we treat individual model generations as participants and examine three response sampling strategies: varying generation parameters, leveraging within-family model size differences, and pooling responses from distinct LLMs. We show that, while model ensembles can generate label distributions similar to humans, they likewise exhibit distinct, idiosyncratic judgments and disagreement patterns. We further analyze explanation variation, observing that, although models generate longer explanations than humans, they demonstrate substantially less stylistic diversity. Our findings suggest that, while LLMs may serve as useful tools for generating diverse annotations, they should not be viewed as drop-in replacements for human annotators {---} particularly in applications requiring authentic representation of diversity in human judgments, such as NLI.}
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single bibliographic record. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kulmizev-etal-2026-label">
<!-- Title of the paper. -->
<titleInfo>
<title>Label and Explanation Variation in LLM-Based Annotation: a Case Study in Natural Language Inference</title>
</titleInfo>
<!-- Authors of the paper: one personal name element each, with given and family parts. -->
<name type="personal">
<namePart type="given">Artur</namePart>
<namePart type="family">Kulmizev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erika</namePart>
<namePart type="family">Lombart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Watrin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Catherine</namePart>
<namePart type="family">de Marneffe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper (year and month). -->
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host publication: the proceedings volume, with its editors, publisher, place and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<!-- Editors of the host volume. -->
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<!-- The middle initial is recorded as a second given-name part. -->
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Large language models (LLMs) have shown considerable promise for annotation purposes, yet questions remain about their ability to capture human label variation (HLV) — genuine disagreement between annotators often observed across NLP tasks. Here, we investigate how label and explanation variation manifests within and across LLMs with respect to the Natural Language Inference (NLI) task. Using zero-shot prompting with exact human annotation instructions, we treat individual model generations as participants and examine three response sampling strategies: varying generation parameters, leveraging within-family model size differences, and pooling responses from distinct LLMs. We show that, while model ensembles can generate label distributions similar to humans, they likewise exhibit distinct, idiosyncratic judgments and disagreement patterns. We further analyze explanation variation, observing that, although models generate longer explanations than humans, they demonstrate substantially less stylistic diversity. Our findings suggest that, while LLMs may serve as useful tools for generating diverse annotations, they should not be viewed as drop-in replacements for human annotators — particularly in applications requiring authentic representation of diversity in human judgments, such as NLI.</abstract>
<!-- Citation key; same value as the ID attribute on the mods element. -->
<identifier type="citekey">kulmizev-etal-2026-label</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.752/</url>
</location>
<!-- Position within the host volume: date and page extent. -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>16526</start>
<end>16543</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Label and Explanation Variation in LLM-Based Annotation: a Case Study in Natural Language Inference
%A Kulmizev, Artur
%A Lombart, Erika
%A Watrin, Patrick
%A de Marneffe, Marie-Catherine
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F kulmizev-etal-2026-label
%X Large language models (LLMs) have shown considerable promise for annotation purposes, yet questions remain about their ability to capture human label variation (HLV) — genuine disagreement between annotators often observed across NLP tasks. Here, we investigate how label and explanation variation manifests within and across LLMs with respect to the Natural Language Inference (NLI) task. Using zero-shot prompting with exact human annotation instructions, we treat individual model generations as participants and examine three response sampling strategies: varying generation parameters, leveraging within-family model size differences, and pooling responses from distinct LLMs. We show that, while model ensembles can generate label distributions similar to humans, they likewise exhibit distinct, idiosyncratic judgments and disagreement patterns. We further analyze explanation variation, observing that, although models generate longer explanations than humans, they demonstrate substantially less stylistic diversity. Our findings suggest that, while LLMs may serve as useful tools for generating diverse annotations, they should not be viewed as drop-in replacements for human annotators — particularly in applications requiring authentic representation of diversity in human judgments, such as NLI.
%U https://aclanthology.org/2026.acl-long.752/
%P 16526-16543
Markdown (Informal)
[Label and Explanation Variation in LLM-Based Annotation: a Case Study in Natural Language Inference](https://aclanthology.org/2026.acl-long.752/) (Kulmizev et al., ACL 2026)
ACL