@inproceedings{madhavan-etal-2025-flare,
    title = "{FLARE}: An Error Analysis Framework for Diagnosing {LLM} Classification Failures",
    author = "Madhavan, Keerthana and
      Antonie, Luiza and
      Scott, Stacey",
    editor = "Przyby{\l}a, Piotr and
      Shardlow, Matthew and
      Colombatto, Clara and
      Inie, Nanna",
    booktitle = "Proceedings of Interdisciplinary Workshop on Observations of Misunderstood, Misguided and Malicious Use of Language Models",
    month = sep,
    year = "2025",
    address = "Varna, Bulgaria",
    publisher = "INCOMA Ltd., Shoumen, Bulgaria",
    url = "https://aclanthology.org/2025.ommm-1.4/",
    pages = "40--44",
    abstract = "When Large Language Models return ``Inconclusive'' in classification tasks, practitioners are left without insight into what went wrong. This diagnostic gap can delay medical decisions, undermine content moderation, and mislead downstream systems. We present FLARE (Failure Location and Reasoning Evaluation), a framework that transforms opaque failures into seven actionable categories. Applied to 5,400 election-misinformation classifications, FLARE reveals a surprising result: Few-Shot prompting{---}widely considered a best practice{---}produced 38{\texttimes} more failures than Zero-Shot, with 70.8{\%} due to simple parsing issues. By exposing hidden failure modes, FLARE addresses critical misunderstandings in LLM deployment with implications across domains."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="madhavan-etal-2025-flare">
    <titleInfo>
      <title>FLARE: An Error Analysis Framework for Diagnosing LLM Classification Failures</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Keerthana</namePart>
      <namePart type="family">Madhavan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Luiza</namePart>
      <namePart type="family">Antonie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Stacey</namePart>
      <namePart type="family">Scott</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of Interdisciplinary Workshop on Observations of Misunderstood, Misguided and Malicious Use of Language Models</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Piotr</namePart>
        <namePart type="family">Przybyła</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Matthew</namePart>
        <namePart type="family">Shardlow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Clara</namePart>
        <namePart type="family">Colombatto</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nanna</namePart>
        <namePart type="family">Inie</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
        <place>
          <placeTerm type="text">Varna, Bulgaria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>When Large Language Models return “Inconclusive” in classification tasks, practitioners are left without insight into what went wrong. This diagnostic gap can delay medical decisions, undermine content moderation, and mislead downstream systems. We present FLARE (Failure Location and Reasoning Evaluation), a framework that transforms opaque failures into seven actionable categories. Applied to 5,400 election-misinformation classifications, FLARE reveals a surprising result: Few-Shot prompting—widely considered a best practice—produced 38× more failures than Zero-Shot, with 70.8% due to simple parsing issues. By exposing hidden failure modes, FLARE addresses critical misunderstandings in LLM deployment with implications across domains.</abstract>
    <identifier type="citekey">madhavan-etal-2025-flare</identifier>
    <location>
      <url>https://aclanthology.org/2025.ommm-1.4/</url>
    </location>
    <part>
      <date>2025-09</date>
      <extent unit="page">
        <start>40</start>
        <end>44</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T FLARE: An Error Analysis Framework for Diagnosing LLM Classification Failures
%A Madhavan, Keerthana
%A Antonie, Luiza
%A Scott, Stacey
%Y Przybyła, Piotr
%Y Shardlow, Matthew
%Y Colombatto, Clara
%Y Inie, Nanna
%S Proceedings of Interdisciplinary Workshop on Observations of Misunderstood, Misguided and Malicious Use of Language Models
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F madhavan-etal-2025-flare
%X When Large Language Models return “Inconclusive” in classification tasks, practitioners are left without insight into what went wrong. This diagnostic gap can delay medical decisions, undermine content moderation, and mislead downstream systems. We present FLARE (Failure Location and Reasoning Evaluation), a framework that transforms opaque failures into seven actionable categories. Applied to 5,400 election-misinformation classifications, FLARE reveals a surprising result: Few-Shot prompting—widely considered a best practice—produced 38× more failures than Zero-Shot, with 70.8% due to simple parsing issues. By exposing hidden failure modes, FLARE addresses critical misunderstandings in LLM deployment with implications across domains.
%U https://aclanthology.org/2025.ommm-1.4/
%P 40-44
Markdown (Informal)
[FLARE: An Error Analysis Framework for Diagnosing LLM Classification Failures](https://aclanthology.org/2025.ommm-1.4/) (Madhavan et al., OMMM 2025)
ACL
Keerthana Madhavan, Luiza Antonie, and Stacey Scott. 2025. [FLARE: An Error Analysis Framework for Diagnosing LLM Classification Failures](https://aclanthology.org/2025.ommm-1.4/). In Proceedings of Interdisciplinary Workshop on Observations of Misunderstood, Misguided and Malicious Use of Language Models, pages 40–44, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.