@inproceedings{erez-etal-2026-scanners,
title = "When Scanners Lie: Evaluator Instability in {LLM} Red-Teaming",
author = "Erez, Lidor and
Hofman, Omer and
Nizri, Tamir and
Vainshtein, Roman",
editor = "Akhtar, Mubashara and
Batzner, Jan and
Choshen, Leshem and
Ghosh, Avijit and
Gohar, Usman and
Mickel, Jennifer and
Pant, Ichhya and
Talat, Zeerak and
Lin, Michelle",
booktitle = "Proceedings of the Workshop on Evaluating Evaluations ({E}val{E}val)",
month = jul,
year = "2026",
address = "San Diego, CA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.evaleval-1.11/",
pages = "56--69",
ISBN = "979-8-89176-429-3",
abstract = "Automated LLM vulnerability scanners are increasingly used to assess security risks by measuring different attack type success rates (ASR). Yet the validity of these measurements hinges on an often-overlooked component: the evaluator who determines whether an attack has succeeded. In this study, we demonstrate that commonly used open-source scanners exhibit measurement instability that depends on the evaluator component. Consequently, changing the evaluator while keeping the attacks and model outputs constant can significantly alter the reported ASR. To tackle this problem, we present a two-phase, reliability-aware evaluation framework. In the first phase, we quantify evaluator disagreement to identify attack categories where ASR reliability cannot be assumed. In the second phase, we propose a verification-based evaluation method where evaluators are validated by an independent verifier, enabling reliability assessment without relying on extensive human annotation. Applied to the widely used Garak scanner, we observe that 22 of 25 attack categories exhibit evaluator instability, reflected in high disagreement among evaluators. Our approach raises evaluator accuracy from 72{\%} to 89{\%} while enabling selective deployment to control cost and computational overhead. We further quantify evaluator uncertainty in ASR estimates, showing that reported vulnerability scores can vary by up to {\ensuremath{\pm}}33{\%} depending on the evaluator. Our results indicate that the outputs of vulnerability scanners are highly sensitive to the choice of evaluators. Our framework offers a practical approach to quantify unreliable evaluations and enhance the reliability of measurements in automated LLM security assessments."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="erez-etal-2026-scanners">
<titleInfo>
<title>When Scanners Lie: Evaluator Instability in LLM Red-Teaming</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lidor</namePart>
<namePart type="family">Erez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omer</namePart>
<namePart type="family">Hofman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tamir</namePart>
<namePart type="family">Nizri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Vainshtein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Evaluating Evaluations (EvalEval)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mubashara</namePart>
<namePart type="family">Akhtar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Batzner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leshem</namePart>
<namePart type="family">Choshen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avijit</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Usman</namePart>
<namePart type="family">Gohar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jennifer</namePart>
<namePart type="family">Mickel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ichhya</namePart>
<namePart type="family">Pant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeerak</namePart>
<namePart type="family">Talat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michelle</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, CA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-429-3</identifier>
</relatedItem>
<abstract>Automated LLM vulnerability scanners are increasingly used to assess security risks by measuring different attack type success rates (ASR). Yet the validity of these measurements hinges on an often-overlooked component: the evaluator who determines whether an attack has succeeded. In this study, we demonstrate that commonly used open-source scanners exhibit measurement instability that depends on the evaluator component. Consequently, changing the evaluator while keeping the attacks and model outputs constant can significantly alter the reported ASR. To tackle this problem, we present a two-phase, reliability-aware evaluation framework. In the first phase, we quantify evaluator disagreement to identify attack categories where ASR reliability cannot be assumed. In the second phase, we propose a verification-based evaluation method where evaluators are validated by an independent verifier, enabling reliability assessment without relying on extensive human annotation. Applied to the widely used Garak scanner, we observe that 22 of 25 attack categories exhibit evaluator instability, reflected in high disagreement among evaluators. Our approach raises evaluator accuracy from 72% to 89% while enabling selective deployment to control cost and computational overhead. We further quantify evaluator uncertainty in ASR estimates, showing that reported vulnerability scores can vary by up to \ensuremath\pm33% depending on the evaluator. Our results indicate that the outputs of vulnerability scanners are highly sensitive to the choice of evaluators. Our framework offers a practical approach to quantify unreliable evaluations and enhance the reliability of measurements in automated LLM security assessments.</abstract>
<identifier type="citekey">erez-etal-2026-scanners</identifier>
<location>
<url>https://aclanthology.org/2026.evaleval-1.11/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>56</start>
<end>69</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Scanners Lie: Evaluator Instability in LLM Red-Teaming
%A Erez, Lidor
%A Hofman, Omer
%A Nizri, Tamir
%A Vainshtein, Roman
%Y Akhtar, Mubashara
%Y Batzner, Jan
%Y Choshen, Leshem
%Y Ghosh, Avijit
%Y Gohar, Usman
%Y Mickel, Jennifer
%Y Pant, Ichhya
%Y Talat, Zeerak
%Y Lin, Michelle
%S Proceedings of the Workshop on Evaluating Evaluations (EvalEval)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, CA
%@ 979-8-89176-429-3
%F erez-etal-2026-scanners
%X Automated LLM vulnerability scanners are increasingly used to assess security risks by measuring different attack type success rates (ASR). Yet the validity of these measurements hinges on an often-overlooked component: the evaluator who determines whether an attack has succeeded. In this study, we demonstrate that commonly used open-source scanners exhibit measurement instability that depends on the evaluator component. Consequently, changing the evaluator while keeping the attacks and model outputs constant can significantly alter the reported ASR. To tackle this problem, we present a two-phase, reliability-aware evaluation framework. In the first phase, we quantify evaluator disagreement to identify attack categories where ASR reliability cannot be assumed. In the second phase, we propose a verification-based evaluation method where evaluators are validated by an independent verifier, enabling reliability assessment without relying on extensive human annotation. Applied to the widely used Garak scanner, we observe that 22 of 25 attack categories exhibit evaluator instability, reflected in high disagreement among evaluators. Our approach raises evaluator accuracy from 72% to 89% while enabling selective deployment to control cost and computational overhead. We further quantify evaluator uncertainty in ASR estimates, showing that reported vulnerability scores can vary by up to \ensuremath\pm33% depending on the evaluator. Our results indicate that the outputs of vulnerability scanners are highly sensitive to the choice of evaluators. Our framework offers a practical approach to quantify unreliable evaluations and enhance the reliability of measurements in automated LLM security assessments.
%U https://aclanthology.org/2026.evaleval-1.11/
%P 56-69
Markdown (Informal)
[When Scanners Lie: Evaluator Instability in LLM Red-Teaming](https://aclanthology.org/2026.evaleval-1.11/) (Erez et al., EvalEval 2026)
ACL