% ACL Anthology BibTeX record for the Eval4NLP 2025 workshop paper by Wagh and Srivastava.
% Acronyms are brace-protected so that sentence-casing styles keep their capitals.
@inproceedings{wagh-srivastava-2025-dentist,
  title     = {``The dentist is an involved parent, the bartender is not'': Revealing Implicit Biases in {QA} with Implicit {BBQ}},
  author    = {Wagh, Aarushi and
               Srivastava, Saniya},
  editor    = {Akter, Mousumi and
               Chowdhury, Tahiya and
               Eger, Steffen and
               Leiter, Christoph and
               Opitz, Juri and
               {\c{C}}ano, Erion},
  booktitle = {Proceedings of the 5th Workshop on Evaluation and Comparison of {NLP} Systems},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.eval4nlp-1.7/},
  pages     = {85--90},
  isbn      = {979-8-89176-305-0},
  abstract  = {Existing benchmarks evaluating biases in large language models (LLMs) primarily rely on explicit cues, declaring protected attributes like religion, race, gender by name. However, real-world interactions often contain implicit biases, inferred subtly through names, cultural cues, or traits. This critical oversight creates a significant blind spot in fairness evaluation. We introduce ImplicitBBQ, a benchmark extending the Bias Benchmark for QA (BBQ) with implicitly cued protected attributes across 6 categories. Our evaluation of GPT-4o on ImplicitBBQ illustrates troubling performance disparity from explicit BBQ prompts, with accuracy declining up to 7{\%} in the ``sexual orientation'' subcategory and consistent decline located across most other categories. This indicates that current LLMs contain implicit biases undetected by explicit benchmarks. ImplicitBBQ offers a crucial tool for nuanced fairness evaluation in NLP.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, the paper identified by citekey wagh-srivastava-2025-dentist. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wagh-srivastava-2025-dentist">

    <!-- Paper title -->
    <titleInfo>
      <title>“The dentist is an involved parent, the bartender is not”: Revealing Implicit Biases in QA with Implicit BBQ</title>
    </titleInfo>

    <!-- Authors, in listed order -->
    <name type="personal">
      <namePart type="given">Aarushi</namePart>
      <namePart type="family">Wagh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saniya</namePart>
      <namePart type="family">Srivastava</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume that contains the paper, with its editors and publisher -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 5th Workshop on Evaluation and Comparison of NLP Systems</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mousumi</namePart>
        <namePart type="family">Akter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tahiya</namePart>
        <namePart type="family">Chowdhury</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steffen</namePart>
        <namePart type="family">Eger</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christoph</namePart>
        <namePart type="family">Leiter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juri</namePart>
        <namePart type="family">Opitz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Erion</namePart>
        <namePart type="family">Çano</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Mumbai, India</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-305-0</identifier>
    </relatedItem>

    <abstract>Existing benchmarks evaluating biases in large language models (LLMs) primarily rely on explicit cues, declaring protected attributes like religion, race, gender by name. However, real-world interactions often contain implicit biases, inferred subtly through names, cultural cues, or traits. This critical oversight creates a significant blind spot in fairness evaluation. We introduce ImplicitBBQ, a benchmark extending the Bias Benchmark for QA (BBQ) with implicitly cued protected attributes across 6 categories. Our evaluation of GPT-4o on ImplicitBBQ illustrates troubling performance disparity from explicit BBQ prompts, with accuracy declining up to 7% in the “sexual orientation” subcategory and consistent decline located across most other categories. This indicates that current LLMs contain implicit biases undetected by explicit benchmarks. ImplicitBBQ offers a crucial tool for nuanced fairness evaluation in NLP.</abstract>

    <!-- Identifiers and location of the record itself -->
    <identifier type="citekey">wagh-srivastava-2025-dentist</identifier>
    <location>
      <url>https://aclanthology.org/2025.eval4nlp-1.7/</url>
    </location>

    <!-- Page extent within the host volume -->
    <part>
      <date>2025-12</date>
      <extent unit="page">
        <start>85</start>
        <end>90</end>
      </extent>
    </part>

  </mods>
</modsCollection>
%0 Conference Proceedings
%T “The dentist is an involved parent, the bartender is not”: Revealing Implicit Biases in QA with Implicit BBQ
%A Wagh, Aarushi
%A Srivastava, Saniya
%Y Akter, Mousumi
%Y Chowdhury, Tahiya
%Y Eger, Steffen
%Y Leiter, Christoph
%Y Opitz, Juri
%Y Çano, Erion
%S Proceedings of the 5th Workshop on Evaluation and Comparison of NLP Systems
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-305-0
%F wagh-srivastava-2025-dentist
%X Existing benchmarks evaluating biases in large language models (LLMs) primarily rely on explicit cues, declaring protected attributes like religion, race, gender by name. However, real-world interactions often contain implicit biases, inferred subtly through names, cultural cues, or traits. This critical oversight creates a significant blind spot in fairness evaluation. We introduce ImplicitBBQ, a benchmark extending the Bias Benchmark for QA (BBQ) with implicitly cued protected attributes across 6 categories. Our evaluation of GPT-4o on ImplicitBBQ illustrates troubling performance disparity from explicit BBQ prompts, with accuracy declining up to 7% in the “sexual orientation” subcategory and consistent decline located across most other categories. This indicates that current LLMs contain implicit biases undetected by explicit benchmarks. ImplicitBBQ offers a crucial tool for nuanced fairness evaluation in NLP.
%U https://aclanthology.org/2025.eval4nlp-1.7/
%P 85-90
Markdown (Informal)
[“The dentist is an involved parent, the bartender is not”: Revealing Implicit Biases in QA with Implicit BBQ](https://aclanthology.org/2025.eval4nlp-1.7/) (Wagh & Srivastava, Eval4NLP 2025)
ACL