% BibTeX record for the CrowS-Pairs-NL workshop paper.
% Braces in title/booktitle protect whole proper names and acronyms so that
% sentence-casing bibliography styles cannot lowercase any part of them.
@inproceedings{van-der-weide-etal-2026-crows,
    title = "{CrowS-Pairs-NL}: A Benchmark to Evaluate {Dutch} Stereotype Bias in {LLMs}",
    author = "van der Weide, Jens and
      Nguyen, Dong and
      Schaaphok, Marianne and
      Bakker, Roos M.",
    editor = "Ma, Weicheng and
      Vosoughi, Soroush and
      Gillani, Nabeel and
      Coto-Solano, Rolando",
    booktitle = "Proceedings of the 1st Workshop on Stereotypes Across Cultures in Language Technologies ({StereACuLT} 2026)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.stereacult-1.1/",
    pages = "1--12",
    isbn = "979-8-89176-408-8",
    abstract = "Bias benchmarks for LLMs largely focus on English, overlooking language- and culture-specific stereotypes. We introduce CrowS-Pairs-NL, a Dutch stereotype benchmark built by filtering, translating, and adapting the English CrowS-Pairs dataset to address known conceptual pitfalls, and extending it with newly crowdsourced Dutch sentence pairs. We evaluate six multilingual and Dutch-trained models using both a pseudo-log-likelihood metric adapted for autoregressive models and a prompt-based metric with three template variants. Models explicitly trained on Dutch data consistently exhibit higher stereotyping scores, suggesting that language-specific fine-tuning introduces language-specific bias. The two metrics broadly agree on model rankings but differ in sensitivity, with the prompt metric showing a narrower range of scores. Our benchmark and findings underscore the need for culturally grounded bias evaluation beyond English."
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey van-der-weide-etal-2026-crows: the paper itself,
     followed by its host proceedings volume as a relatedItem. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="van-der-weide-etal-2026-crows">
    <titleInfo>
      <title>CrowS-Pairs-NL: A Benchmark to Evaluate Dutch Stereotype Bias in LLMs</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Jens</namePart>
      <namePart type="family">van der Weide</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dong</namePart>
      <namePart type="family">Nguyen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marianne</namePart>
      <namePart type="family">Schaaphok</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Roos</namePart>
      <namePart type="given">M</namePart>
      <namePart type="family">Bakker</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Stereotypes Across Cultures in Language Technologies (StereACuLT 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Weicheng</namePart>
        <namePart type="family">Ma</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Soroush</namePart>
        <namePart type="family">Vosoughi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nabeel</namePart>
        <namePart type="family">Gillani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rolando</namePart>
        <namePart type="family">Coto-Solano</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-408-8</identifier>
    </relatedItem>
    <abstract>Bias benchmarks for LLMs largely focus on English, overlooking language- and culture-specific stereotypes. We introduce CrowS-Pairs-NL, a Dutch stereotype benchmark built by filtering, translating, and adapting the English CrowS-Pairs dataset to address known conceptual pitfalls, and extending it with newly crowdsourced Dutch sentence pairs. We evaluate six multilingual and Dutch-trained models using both a pseudo-log-likelihood metric adapted for autoregressive models and a prompt-based metric with three template variants. Models explicitly trained on Dutch data consistently exhibit higher stereotyping scores, suggesting that language-specific fine-tuning introduces language-specific bias. The two metrics broadly agree on model rankings but differ in sensitivity, with the prompt metric showing a narrower range of scores. Our benchmark and findings underscore the need for culturally grounded bias evaluation beyond English.</abstract>
    <identifier type="citekey">van-der-weide-etal-2026-crows</identifier>
    <location>
      <url>https://aclanthology.org/2026.stereacult-1.1/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>1</start>
        <end>12</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T CrowS-Pairs-NL: A Benchmark to Evaluate Dutch Stereotype Bias in LLMs
%A van der Weide, Jens
%A Nguyen, Dong
%A Schaaphok, Marianne
%A Bakker, Roos M.
%Y Ma, Weicheng
%Y Vosoughi, Soroush
%Y Gillani, Nabeel
%Y Coto-Solano, Rolando
%S Proceedings of the 1st Workshop on Stereotypes Across Cultures in Language Technologies (StereACuLT 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-408-8
%F van-der-weide-etal-2026-crows
%X Bias benchmarks for LLMs largely focus on English, overlooking language- and culture-specific stereotypes. We introduce CrowS-Pairs-NL, a Dutch stereotype benchmark built by filtering, translating, and adapting the English CrowS-Pairs dataset to address known conceptual pitfalls, and extending it with newly crowdsourced Dutch sentence pairs. We evaluate six multilingual and Dutch-trained models using both a pseudo-log-likelihood metric adapted for autoregressive models and a prompt-based metric with three template variants. Models explicitly trained on Dutch data consistently exhibit higher stereotyping scores, suggesting that language-specific fine-tuning introduces language-specific bias. The two metrics broadly agree on model rankings but differ in sensitivity, with the prompt metric showing a narrower range of scores. Our benchmark and findings underscore the need for culturally grounded bias evaluation beyond English.
%U https://aclanthology.org/2026.stereacult-1.1/
%P 1-12
Markdown (Informal)
[CrowS-Pairs-NL: A Benchmark to Evaluate Dutch Stereotype Bias in LLMs](https://aclanthology.org/2026.stereacult-1.1/) (van der Weide et al., StereACuLT 2026)
ACL
- Jens van der Weide, Dong Nguyen, Marianne Schaaphok, and Roos M. Bakker. 2026. CrowS-Pairs-NL: A Benchmark to Evaluate Dutch Stereotype Bias in LLMs. In Proceedings of the 1st Workshop on Stereotypes Across Cultures in Language Technologies (StereACuLT 2026), pages 1–12, San Diego, California, United States. Association for Computational Linguistics.