@inproceedings{zhang-etal-2025-hire,
title = "Hire Me or Not? Examining Language Model`s Behavior with Occupation Attributes",
author = "Zhang, Damin and
Zhang, Yi and
Bihani, Geetanjali and
Rayz, Julia",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.529/",
pages = "7891--7911",
abstract = "With the impressive performance in various downstream tasks, large language models (LLMs) have been widely integrated into production pipelines, such as recruitment and recommendation systems. A known issue of models trained on natural language data is the presence of human biases, which can impact the fairness of the system. This paper investigates LLMs' behavior with respect to gender stereotypes in the context of occupation decision making. Our framework is designed to investigate and quantify the presence of gender stereotypes in LLMs' behavior via multi-round question answering. Inspired by prior work, we constructed a dataset using a standard occupation classification knowledge base released by authoritative agencies. We tested it on three families of LMs (RoBERTa, GPT, and Llama) and found that all models exhibit gender stereotypes analogous to human biases, but with different preferences. The distinct preferences of GPT-3.5-turbo and Llama2-70b-chat, along with additional analysis indicating GPT-4o-mini favors female subjects, may imply that the current alignment methods are insufficient for debiasing and could introduce new biases contradicting the traditional gender stereotypes. Our contribution includes a 73,500 prompts dataset constructed with a taxonomy of real-world occupations and a multi-step verification framework to evaluate model`s behavior regarding gender stereotype."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-hire">
<titleInfo>
<title>Hire Me or Not? Examining Language Model’s Behavior with Occupation Attributes</title>
</titleInfo>
<name type="personal">
<namePart type="given">Damin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Geetanjali</namePart>
<namePart type="family">Bihani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Rayz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the impressive performance in various downstream tasks, large language models (LLMs) have been widely integrated into production pipelines, such as recruitment and recommendation systems. A known issue of models trained on natural language data is the presence of human biases, which can impact the fairness of the system. This paper investigates LLMs’ behavior with respect to gender stereotypes in the context of occupation decision making. Our framework is designed to investigate and quantify the presence of gender stereotypes in LLMs’ behavior via multi-round question answering. Inspired by prior work, we constructed a dataset using a standard occupation classification knowledge base released by authoritative agencies. We tested it on three families of LMs (RoBERTa, GPT, and Llama) and found that all models exhibit gender stereotypes analogous to human biases, but with different preferences. The distinct preferences of GPT-3.5-turbo and Llama2-70b-chat, along with additional analysis indicating GPT-4o-mini favors female subjects, may imply that the current alignment methods are insufficient for debiasing and could introduce new biases contradicting the traditional gender stereotypes. Our contribution includes a 73,500 prompts dataset constructed with a taxonomy of real-world occupations and a multi-step verification framework to evaluate model’s behavior regarding gender stereotype.</abstract>
<identifier type="citekey">zhang-etal-2025-hire</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.529/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>7891</start>
<end>7911</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Hire Me or Not? Examining Language Model’s Behavior with Occupation Attributes
%A Zhang, Damin
%A Zhang, Yi
%A Bihani, Geetanjali
%A Rayz, Julia
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhang-etal-2025-hire
%X With the impressive performance in various downstream tasks, large language models (LLMs) have been widely integrated into production pipelines, such as recruitment and recommendation systems. A known issue of models trained on natural language data is the presence of human biases, which can impact the fairness of the system. This paper investigates LLMs’ behavior with respect to gender stereotypes in the context of occupation decision making. Our framework is designed to investigate and quantify the presence of gender stereotypes in LLMs’ behavior via multi-round question answering. Inspired by prior work, we constructed a dataset using a standard occupation classification knowledge base released by authoritative agencies. We tested it on three families of LMs (RoBERTa, GPT, and Llama) and found that all models exhibit gender stereotypes analogous to human biases, but with different preferences. The distinct preferences of GPT-3.5-turbo and Llama2-70b-chat, along with additional analysis indicating GPT-4o-mini favors female subjects, may imply that the current alignment methods are insufficient for debiasing and could introduce new biases contradicting the traditional gender stereotypes. Our contribution includes a 73,500 prompts dataset constructed with a taxonomy of real-world occupations and a multi-step verification framework to evaluate model’s behavior regarding gender stereotype.
%U https://aclanthology.org/2025.coling-main.529/
%P 7891-7911
Markdown (Informal)
[Hire Me or Not? Examining Language Model’s Behavior with Occupation Attributes](https://aclanthology.org/2025.coling-main.529/) (Zhang et al., COLING 2025)
ACL
Damin Zhang, Yi Zhang, Geetanjali Bihani, and Julia Rayz. 2025. Hire Me or Not? Examining Language Model’s Behavior with Occupation Attributes. In Proceedings of the 31st International Conference on Computational Linguistics, pages 7891–7911, Abu Dhabi, UAE. Association for Computational Linguistics.