@inproceedings{gershuni-shmidman-2026-human,
title = "Human-{AI} Annotation Error Auditing for {H}ebrew Diacritization with Frontier {LLM}s",
author = "Gershuni, Hillel and
Shmidman, Avi",
editor = "Liu, Yang Janet and
Gessler, Luke",
booktitle = "Proceedings of the 20th Linguistic Annotation Workshop ({LAW} {XX})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.law-main.4/",
pages = "33--46",
ISBN = "979-8-89176-404-0",
abstract = "Large annotated datasets inevitably contain errors that are costly to identify via manual review. We study a human-AI annotation error auditing workflow using frontier Large Language Models (LLMs), focusing on Hebrew \textit{nikud} (diacritization). We take the EACL 2023 Hebrew Homograph Challenge Set as our test case. In a focused evaluation on 12 of the homograph sets with 271 confirmed errors (verified through exhaustive manual review of all 7,241 sentences), Gemini 3 Pro achieves 83.6{\%} recall (95{\%} confidence interval: [79.3{\%}, 88.2{\%}]) and 99.1{\%} precision - substantially higher than other frontier LLMs. Two independent human experts achieved 62.4{\%} and 42.8{\%} recall respectively, a 20-percentage-point spread that reflects the difficulty of sparse-target error search. Even the union of both experts' findings (73.4{\%} recall) falls short of a single LLM run (83.6{\%}), while LLM-aided auditing reduces review effort by over 95{\%}. We analyze the trade-offs between batch size and recall, and release both a human-verified Gold Standard with per-error difficulty annotations and a globally corrected version of the Challenge Set."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gershuni-shmidman-2026-human">
<titleInfo>
<title>Human-AI Annotation Error Auditing for Hebrew Diacritization with Frontier LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hillel</namePart>
<namePart type="family">Gershuni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avi</namePart>
<namePart type="family">Shmidman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Linguistic Annotation Workshop (LAW XX)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="given">Janet</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Gessler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-404-0</identifier>
</relatedItem>
<abstract>Large annotated datasets inevitably contain errors that are costly to identify via manual review. We study a human-AI annotation error auditing workflow using frontier Large Language Models (LLMs), focusing on Hebrew nikud (diacritization). We take the EACL 2023 Hebrew Homograph Challenge Set as our test case. In a focused evaluation on 12 of the homograph sets with 271 confirmed errors (verified through exhaustive manual review of all 7,241 sentences), Gemini 3 Pro achieves 83.6% recall (95% confidence interval: [79.3%, 88.2%]) and 99.1% precision - substantially higher than other frontier LLMs. Two independent human experts achieved 62.4% and 42.8% recall respectively, a 20-percentage-point spread that reflects the difficulty of sparse-target error search. Even the union of both experts’ findings (73.4% recall) falls short of a single LLM run (83.6%), while LLM-aided auditing reduces review effort by over 95%. We analyze the trade-offs between batch size and recall, and release both a human-verified Gold Standard with per-error difficulty annotations and a globally corrected version of the Challenge Set.</abstract>
<identifier type="citekey">gershuni-shmidman-2026-human</identifier>
<location>
<url>https://aclanthology.org/2026.law-main.4/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>33</start>
<end>46</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Human-AI Annotation Error Auditing for Hebrew Diacritization with Frontier LLMs
%A Gershuni, Hillel
%A Shmidman, Avi
%Y Liu, Yang Janet
%Y Gessler, Luke
%S Proceedings of the 20th Linguistic Annotation Workshop (LAW XX)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-404-0
%F gershuni-shmidman-2026-human
%X Large annotated datasets inevitably contain errors that are costly to identify via manual review. We study a human-AI annotation error auditing workflow using frontier Large Language Models (LLMs), focusing on Hebrew nikud (diacritization). We take the EACL 2023 Hebrew Homograph Challenge Set as our test case. In a focused evaluation on 12 of the homograph sets with 271 confirmed errors (verified through exhaustive manual review of all 7,241 sentences), Gemini 3 Pro achieves 83.6% recall (95% confidence interval: [79.3%, 88.2%]) and 99.1% precision - substantially higher than other frontier LLMs. Two independent human experts achieved 62.4% and 42.8% recall respectively, a 20-percentage-point spread that reflects the difficulty of sparse-target error search. Even the union of both experts’ findings (73.4% recall) falls short of a single LLM run (83.6%), while LLM-aided auditing reduces review effort by over 95%. We analyze the trade-offs between batch size and recall, and release both a human-verified Gold Standard with per-error difficulty annotations and a globally corrected version of the Challenge Set.
%U https://aclanthology.org/2026.law-main.4/
%P 33-46
Markdown (Informal)
[Human-AI Annotation Error Auditing for Hebrew Diacritization with Frontier LLMs](https://aclanthology.org/2026.law-main.4/) (Gershuni & Shmidman, LAW 2026)
ACL