@inproceedings{levine-zeldes-2026-cracks,
title = "Cracks in the Bridge{---}or A Bridge Too Far? Comparing Human and {LLM} Errors in the Annotation of Bridging Anaphora",
author = "Levine, Lauren and
Zeldes, Amir",
editor = "Liu, Yang Janet and
Gessler, Luke",
booktitle = "Proceedings of the 20th Linguistic Annotation Workshop ({LAW} {XX})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.law-main.16/",
pages = "219--228",
ISBN = "979-8-89176-404-0",
abstract = "In this paper, we perform an error analysis on human and LLM annotation data from the recent GUMBridge corpus for varieties of bridging anaphora. We explore the distribution of precision and recall errors made by annotators and how that distribution correlates with bridging subtypes. We find that while LLMs perform substantially worse than human annotators, they are more balanced in their precision and recall scores than humans, whose performance strongly favors precision. With regard to subtypes, we find that comparison and meronomy relations are easier to reliably annotate than the more broadly construed entity relations for both human and LLM annotators, but that LLM errors are more distributed across subtypes than human errors. Analyzing these results, we provide insights for future annotation projects on bridging anaphora."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="levine-zeldes-2026-cracks">
<titleInfo>
<title>Cracks in the Bridge—or A Bridge Too Far? Comparing Human and LLM Errors in the Annotation of Bridging Anaphora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lauren</namePart>
<namePart type="family">Levine</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zeldes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Linguistic Annotation Workshop (LAW XX)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="given">Janet</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Gessler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-404-0</identifier>
</relatedItem>
<abstract>In this paper, we perform an error analysis on human and LLM annotation data from the recent GUMBridge corpus for varieties of bridging anaphora. We explore the distribution of precision and recall errors made by annotators and how that distribution correlates with bridging subtypes. We find that while LLMs perform substantially worse than human annotators, they are more balanced in their precision and recall scores than humans, whose performance strongly favors precision. With regard to subtypes, we find that comparison and meronomy relations are easier to reliably annotate than the more broadly construed entity relations for both human and LLM annotators, but that LLM errors are more distributed across subtypes than human errors. Analyzing these results, we provide insights for future annotation projects on bridging anaphora.</abstract>
<identifier type="citekey">levine-zeldes-2026-cracks</identifier>
<location>
<url>https://aclanthology.org/2026.law-main.16/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>219</start>
<end>228</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Cracks in the Bridge—or A Bridge Too Far? Comparing Human and LLM Errors in the Annotation of Bridging Anaphora
%A Levine, Lauren
%A Zeldes, Amir
%Y Liu, Yang Janet
%Y Gessler, Luke
%S Proceedings of the 20th Linguistic Annotation Workshop (LAW XX)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-404-0
%F levine-zeldes-2026-cracks
%X In this paper, we perform an error analysis on human and LLM annotation data from the recent GUMBridge corpus for varieties of bridging anaphora. We explore the distribution of precision and recall errors made by annotators and how that distribution correlates with bridging subtypes. We find that while LLMs perform substantially worse than human annotators, they are more balanced in their precision and recall scores than humans, whose performance strongly favors precision. With regard to subtypes, we find that comparison and meronomy relations are easier to reliably annotate than the more broadly construed entity relations for both human and LLM annotators, but that LLM errors are more distributed across subtypes than human errors. Analyzing these results, we provide insights for future annotation projects on bridging anaphora.
%U https://aclanthology.org/2026.law-main.16/
%P 219-228
Markdown (Informal)
[Cracks in the Bridge—or A Bridge Too Far? Comparing Human and LLM Errors in the Annotation of Bridging Anaphora](https://aclanthology.org/2026.law-main.16/) (Levine & Zeldes, LAW 2026)
ACL