@inproceedings{sun-etal-2023-delphi,
title = "{DELPHI}: Data for Evaluating {LLM}s{'} Performance in Handling Controversial Issues",
author = "Sun, David and
Abzaliev, Artem and
Kotek, Hadas and
Klein, Christopher and
Xiu, Zidi and
Williams, Jason",
editor = "Wang, Mingxuan and
Zitouni, Imed",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-industry.76",
doi = "10.18653/v1/2023.emnlp-industry.76",
pages = "820--827",
    abstract = "Controversy is a reflection of our zeitgeist, and an important aspect of any discourse. The rise of large language models (LLMs) as conversational systems has increased public reliance on these systems for answers to their various questions. Consequently, it is crucial to systematically examine how these models respond to questions pertaining to ongoing debates. However, few datasets exist that provide human-annotated labels reflecting contemporary discussions. To foster research in this area, we propose a novel construction of a controversial questions dataset, expanding upon the publicly released Quora Question Pairs Dataset. This dataset presents challenges concerning knowledge recency, safety, fairness, and bias. We evaluate different LLMs using a subset of this dataset, illuminating how they handle controversial issues and the stances they adopt. This research ultimately contributes to our understanding of LLMs{'} interaction with controversial issues, paving the way for improvements in their comprehension and handling of complex societal debates.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sun-etal-2023-delphi">
<titleInfo>
<title>DELPHI: Data for Evaluating LLMs’ Performance in Handling Controversial Issues</title>
</titleInfo>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Artem</namePart>
<namePart type="family">Abzaliev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hadas</namePart>
<namePart type="family">Kotek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Klein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zidi</namePart>
<namePart type="family">Xiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mingxuan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Controversy is a reflection of our zeitgeist, and an important aspect of any discourse. The rise of large language models (LLMs) as conversational systems has increased public reliance on these systems for answers to their various questions. Consequently, it is crucial to systematically examine how these models respond to questions pertaining to ongoing debates. However, few datasets exist that provide human-annotated labels reflecting contemporary discussions. To foster research in this area, we propose a novel construction of a controversial questions dataset, expanding upon the publicly released Quora Question Pairs Dataset. This dataset presents challenges concerning knowledge recency, safety, fairness, and bias. We evaluate different LLMs using a subset of this dataset, illuminating how they handle controversial issues and the stances they adopt. This research ultimately contributes to our understanding of LLMs’ interaction with controversial issues, paving the way for improvements in their comprehension and handling of complex societal debates.</abstract>
<identifier type="citekey">sun-etal-2023-delphi</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-industry.76</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-industry.76</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>820</start>
<end>827</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DELPHI: Data for Evaluating LLMs’ Performance in Handling Controversial Issues
%A Sun, David
%A Abzaliev, Artem
%A Kotek, Hadas
%A Klein, Christopher
%A Xiu, Zidi
%A Williams, Jason
%Y Wang, Mingxuan
%Y Zitouni, Imed
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F sun-etal-2023-delphi
%X Controversy is a reflection of our zeitgeist, and an important aspect of any discourse. The rise of large language models (LLMs) as conversational systems has increased public reliance on these systems for answers to their various questions. Consequently, it is crucial to systematically examine how these models respond to questions pertaining to ongoing debates. However, few datasets exist that provide human-annotated labels reflecting contemporary discussions. To foster research in this area, we propose a novel construction of a controversial questions dataset, expanding upon the publicly released Quora Question Pairs Dataset. This dataset presents challenges concerning knowledge recency, safety, fairness, and bias. We evaluate different LLMs using a subset of this dataset, illuminating how they handle controversial issues and the stances they adopt. This research ultimately contributes to our understanding of LLMs’ interaction with controversial issues, paving the way for improvements in their comprehension and handling of complex societal debates.
%R 10.18653/v1/2023.emnlp-industry.76
%U https://aclanthology.org/2023.emnlp-industry.76
%U https://doi.org/10.18653/v1/2023.emnlp-industry.76
%P 820-827
Markdown (Informal)
[DELPHI: Data for Evaluating LLMs’ Performance in Handling Controversial Issues](https://aclanthology.org/2023.emnlp-industry.76) (Sun et al., EMNLP 2023)
ACL
David Sun, Artem Abzaliev, Hadas Kotek, Christopher Klein, Zidi Xiu, and Jason Williams. 2023. DELPHI: Data for Evaluating LLMs’ Performance in Handling Controversial Issues. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 820–827, Singapore. Association for Computational Linguistics.