@inproceedings{tan-etal-2025-towards,
title = "Towards Massive Multilingual Holistic Bias",
author = "Tan, Xiaoqing and
Hansanti, Prangthip and
Turkatenko, Arina and
Chuang, Joe and
Wood, Carleigh and
Yu, Bokai and
Ropers, Christophe and
Costa-juss{\`a}, Marta R.",
editor = "Fale{\'n}ska, Agnieszka and
Basta, Christine and
Costa-juss{\`a}, Marta and
Sta{\'n}czak, Karolina and
Nozza, Debora",
booktitle = "Proceedings of the 6th Workshop on Gender Bias in Natural Language Processing (GeBNLP)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.gebnlp-1.35/",
doi = "10.18653/v1/2025.gebnlp-1.35",
pages = "403--426",
ISBN = "979-8-89176-277-0",
abstract = "In the current landscape of automatic language generation, there is a need to understand, evaluate, and mitigate demographic biases, as existing models are becoming increasingly multilingual. To address this, we present the initial eight languages from the Massive Multilingual Holistic Bias (MMHB) dataset and benchmark consisting of approximately 6 million sentences. The sentences are designed to induce biases towards different groups of people which can yield significant results when using them as a benchmark to test different text generation models. To further scale up in terms of both language coverage and size and to leverage limited human translation, we use systematic approach to independently translate sentence parts. This technique carefully designs a structure to dynamically generate multiple sentence variations and significantly reduces the human translation workload. The translation process has been meticulously conducted to avoid an English-centric perspective and include all necessary morphological variations for languages that require them, improving from the original English HOLISTICBIAS. Finally, we utilize MMHB to report results on gender bias and added toxicity in MT tasks."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tan-etal-2025-towards">
<titleInfo>
<title>Towards Massive Multilingual Holistic Bias</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiaoqing</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prangthip</namePart>
<namePart type="family">Hansanti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arina</namePart>
<namePart type="family">Turkatenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joe</namePart>
<namePart type="family">Chuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carleigh</namePart>
<namePart type="family">Wood</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bokai</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christophe</namePart>
<namePart type="family">Ropers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Workshop on Gender Bias in Natural Language Processing (GeBNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Agnieszka</namePart>
<namePart type="family">Faleńska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christine</namePart>
<namePart type="family">Basta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karolina</namePart>
<namePart type="family">Stańczak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debora</namePart>
<namePart type="family">Nozza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-277-0</identifier>
</relatedItem>
<abstract>In the current landscape of automatic language generation, there is a need to understand, evaluate, and mitigate demographic biases, as existing models are becoming increasingly multilingual. To address this, we present the initial eight languages from the Massive Multilingual Holistic Bias (MMHB) dataset and benchmark consisting of approximately 6 million sentences. The sentences are designed to induce biases towards different groups of people which can yield significant results when using them as a benchmark to test different text generation models. To further scale up in terms of both language coverage and size and to leverage limited human translation, we use systematic approach to independently translate sentence parts. This technique carefully designs a structure to dynamically generate multiple sentence variations and significantly reduces the human translation workload. The translation process has been meticulously conducted to avoid an English-centric perspective and include all necessary morphological variations for languages that require them, improving from the original English HOLISTICBIAS. Finally, we utilize MMHB to report results on gender bias and added toxicity in MT tasks.</abstract>
<identifier type="citekey">tan-etal-2025-towards</identifier>
<identifier type="doi">10.18653/v1/2025.gebnlp-1.35</identifier>
<location>
<url>https://aclanthology.org/2025.gebnlp-1.35/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>403</start>
<end>426</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Massive Multilingual Holistic Bias
%A Tan, Xiaoqing
%A Hansanti, Prangthip
%A Turkatenko, Arina
%A Chuang, Joe
%A Wood, Carleigh
%A Yu, Bokai
%A Ropers, Christophe
%A Costa-jussà, Marta R.
%Y Faleńska, Agnieszka
%Y Basta, Christine
%Y Costa-jussà, Marta
%Y Stańczak, Karolina
%Y Nozza, Debora
%S Proceedings of the 6th Workshop on Gender Bias in Natural Language Processing (GeBNLP)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-277-0
%F tan-etal-2025-towards
%X In the current landscape of automatic language generation, there is a need to understand, evaluate, and mitigate demographic biases, as existing models are becoming increasingly multilingual. To address this, we present the initial eight languages from the Massive Multilingual Holistic Bias (MMHB) dataset and benchmark consisting of approximately 6 million sentences. The sentences are designed to induce biases towards different groups of people which can yield significant results when using them as a benchmark to test different text generation models. To further scale up in terms of both language coverage and size and to leverage limited human translation, we use systematic approach to independently translate sentence parts. This technique carefully designs a structure to dynamically generate multiple sentence variations and significantly reduces the human translation workload. The translation process has been meticulously conducted to avoid an English-centric perspective and include all necessary morphological variations for languages that require them, improving from the original English HOLISTICBIAS. Finally, we utilize MMHB to report results on gender bias and added toxicity in MT tasks.
%R 10.18653/v1/2025.gebnlp-1.35
%U https://aclanthology.org/2025.gebnlp-1.35/
%U https://doi.org/10.18653/v1/2025.gebnlp-1.35
%P 403-426
Markdown (Informal)
[Towards Massive Multilingual Holistic Bias](https://aclanthology.org/2025.gebnlp-1.35/) (Tan et al., GeBNLP 2025)
ACL
- Xiaoqing Tan, Prangthip Hansanti, Arina Turkatenko, Joe Chuang, Carleigh Wood, Bokai Yu, Christophe Ropers, and Marta R. Costa-jussà. 2025. Towards Massive Multilingual Holistic Bias. In Proceedings of the 6th Workshop on Gender Bias in Natural Language Processing (GeBNLP), pages 403–426, Vienna, Austria. Association for Computational Linguistics.