@inproceedings{kim-etal-2025-chronobias,
title = "{C}hrono{B}ias: A Benchmark for Evaluating Temporal Group Bias in the Time-sensitive Knowledge of Large Language Models",
author = "Kim, Kyungmin and
Choi, Youngbin and
Kim, Hyounghun and
Kim, Dongwoo and
Park, Sangdon",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.405/",
pages = "7658--7693",
ISBN = "979-8-89176-335-7",
abstract = "In this paper, we propose $\texttt{ChronoBias}$, a novel benchmark for evaluating $\textit{time-conditional group bias}$ in the $\textit{time-sensitive}$ knowledge of large language models (LLMs).Our benchmark is constructed via a template-based semi-automated generation method, balancing the quality-quantity trade-off in existing benchmark curation approaches.For knowledge that changes over time, $\textit{time-conditional group bias}$ exhibits varying patterns across time intervals, evident in both the best- and worst-performing groups and in the bias metric itself.In addition to $\textit{parametric knowledge bias}${--}which influences group bias across all time intervals{--}we identify $\textit{time-sensitivity bias}$ as an additional factor after a model{'}s knowledge cutoff, accounting for much of the variation in $\textit{time-conditional group bias}$ over time.Since both biases are irreducible, retrieval-augmented generation (RAG) can be a promising approach, as it can address post-cutoff knowledge and better leverage pretraining knowledge that is underrepresented in the model parameters.While RAG improves both overall performance and group bias, we observe that the disparate patterns of $\textit{time-conditional group bias}$ still persist.Therefore, through extensive experiments with various model configurations, we illustrate how accurate and fair RAG-based LLMs should behave and provide actionable guidelines toward constructing such ideal models."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-etal-2025-chronobias">
<titleInfo>
<title>ChronoBias: A Benchmark for Evaluating Temporal Group Bias in the Time-sensitive Knowledge of Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kyungmin</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youngbin</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyounghun</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongwoo</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sangdon</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>In this paper, we propose ChronoBias, a novel benchmark for evaluating time-conditional group bias in the time-sensitive knowledge of large language models (LLMs). Our benchmark is constructed via a template-based semi-automated generation method, balancing the quality-quantity trade-off in existing benchmark curation approaches. For knowledge that changes over time, time-conditional group bias exhibits varying patterns across time intervals, evident in both the best- and worst-performing groups and in the bias metric itself. In addition to parametric knowledge bias–which influences group bias across all time intervals–we identify time-sensitivity bias as an additional factor after a model’s knowledge cutoff, accounting for much of the variation in time-conditional group bias over time. Since both biases are irreducible, retrieval-augmented generation (RAG) can be a promising approach, as it can address post-cutoff knowledge and better leverage pretraining knowledge that is underrepresented in the model parameters. While RAG improves both overall performance and group bias, we observe that the disparate patterns of time-conditional group bias still persist. Therefore, through extensive experiments with various model configurations, we illustrate how accurate and fair RAG-based LLMs should behave and provide actionable guidelines toward constructing such ideal models.</abstract>
<identifier type="citekey">kim-etal-2025-chronobias</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.405/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>7658</start>
<end>7693</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ChronoBias: A Benchmark for Evaluating Temporal Group Bias in the Time-sensitive Knowledge of Large Language Models
%A Kim, Kyungmin
%A Choi, Youngbin
%A Kim, Hyounghun
%A Kim, Dongwoo
%A Park, Sangdon
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F kim-etal-2025-chronobias
%X In this paper, we propose ChronoBias, a novel benchmark for evaluating time-conditional group bias in the time-sensitive knowledge of large language models (LLMs). Our benchmark is constructed via a template-based semi-automated generation method, balancing the quality-quantity trade-off in existing benchmark curation approaches. For knowledge that changes over time, time-conditional group bias exhibits varying patterns across time intervals, evident in both the best- and worst-performing groups and in the bias metric itself. In addition to parametric knowledge bias–which influences group bias across all time intervals–we identify time-sensitivity bias as an additional factor after a model’s knowledge cutoff, accounting for much of the variation in time-conditional group bias over time. Since both biases are irreducible, retrieval-augmented generation (RAG) can be a promising approach, as it can address post-cutoff knowledge and better leverage pretraining knowledge that is underrepresented in the model parameters. While RAG improves both overall performance and group bias, we observe that the disparate patterns of time-conditional group bias still persist. Therefore, through extensive experiments with various model configurations, we illustrate how accurate and fair RAG-based LLMs should behave and provide actionable guidelines toward constructing such ideal models.
%U https://aclanthology.org/2025.findings-emnlp.405/
%P 7658-7693
Markdown (Informal)
[ChronoBias: A Benchmark for Evaluating Temporal Group Bias in the Time-sensitive Knowledge of Large Language Models](https://aclanthology.org/2025.findings-emnlp.405/) (Kim et al., Findings 2025)
ACL:
Kyungmin Kim, Youngbin Choi, Hyounghun Kim, Dongwoo Kim, and Sangdon Park. 2025. ChronoBias: A Benchmark for Evaluating Temporal Group Bias in the Time-sensitive Knowledge of Large Language Models. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 7658–7693, Suzhou, China. Association for Computational Linguistics.