% ACL Anthology BibTeX record for a CoNLL 2026 paper (Welivita, Zeitoun, and Pu).
% Field values are brace-delimited; inner brace groups such as {LLM}, {---} and
% {\texttimes} protect acronyms and LaTeX macros from style-driven recasing.
% NOTE(review): `address` carries the location exactly as exported; confirm whether
% the target bibliography style expects the publisher's city here instead.
@inproceedings{welivita-etal-2026-generates,
  title     = {Who Generates More Empathetic Responses{---}Humans or {LLM}s? A Comparative Evaluation with Human and {LLM} Judges},
  author    = {Welivita, Anuradha and Zeitoun, Fawzia and Pu, Pearl},
  editor    = {Bonial, Claire and Berzak, Yevgeni},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.conll-main.21/},
  pages     = {358--381},
  isbn      = {979-8-89176-410-1},
  abstract  = {This paper compares the empathetic quality of responses generated by humans and large language models (LLMs). We evaluate four LLMs that were widely used at the time of study{---}GPT-4, LLaMA-2-70B-Chat, Gemini-1.0-Pro, and Mixtral-8{\texttimes}7B-Instruct{---}against a human baseline using a large-scale between-subjects study. A total of 1,000 human participants evaluated the empathetic quality of human- and LLM-generated responses to 2,000 dialogue prompts spanning 32 positive and negative emotions. To complement human judgments, we also employed an LLM-as-judge (GPT-4o-mini) to assess the same responses. Across emotions and evaluators, LLM-generated responses were rated as significantly more empathetic than human-written responses. We also observed that both human judges and the LLM-as-judge tended to rate responses generated by their own group more favorably, indicating self-favoring tendencies. These findings highlight both the strong performance of contemporary LLMs in empathetic responding and the need to interpret human- and LLM-based evaluations with care.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="welivita-etal-2026-generates">
    <titleInfo>
      <title>Who Generates More Empathetic Responses—Humans or LLMs? A Comparative Evaluation with Human and LLM Judges</title>
    </titleInfo>
    <!-- Authors, one personal name element each, in listed order. -->
    <name type="personal">
      <namePart type="given">Anuradha</namePart>
      <namePart type="family">Welivita</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fawzia</namePart>
      <namePart type="family">Zeitoun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pearl</namePart>
      <namePart type="family">Pu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 30th Conference on Computational Natural Language Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Claire</namePart>
        <namePart type="family">Bonial</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yevgeni</namePart>
        <namePart type="family">Berzak</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-410-1</identifier>
    </relatedItem>
    <abstract>This paper compares the empathetic quality of responses generated by humans and large language models (LLMs). We evaluate four LLMs that were widely used at the time of study—GPT-4, LLaMA-2-70B-Chat, Gemini-1.0-Pro, and Mixtral-8×7B-Instruct—against a human baseline using a large-scale between-subjects study. A total of 1,000 human participants evaluated the empathetic quality of human- and LLM-generated responses to 2,000 dialogue prompts spanning 32 positive and negative emotions. To complement human judgments, we also employed an LLM-as-judge (GPT-4o-mini) to assess the same responses. Across emotions and evaluators, LLM-generated responses were rated as significantly more empathetic than human-written responses. We also observed that both human judges and the LLM-as-judge tended to rate responses generated by their own group more favorably, indicating self-favoring tendencies. These findings highlight both the strong performance of contemporary LLMs in empathetic responding and the need to interpret human- and LLM-based evaluations with care.</abstract>
    <identifier type="citekey">welivita-etal-2026-generates</identifier>
    <location>
      <url>https://aclanthology.org/2026.conll-main.21/</url>
    </location>
    <!-- Position of the paper inside the host volume: issue date and page extent. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>358</start>
        <end>381</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Who Generates More Empathetic Responses—Humans or LLMs? A Comparative Evaluation with Human and LLM Judges
%A Welivita, Anuradha
%A Zeitoun, Fawzia
%A Pu, Pearl
%Y Bonial, Claire
%Y Berzak, Yevgeni
%S Proceedings of the 30th Conference on Computational Natural Language Learning
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-410-1
%F welivita-etal-2026-generates
%X This paper compares the empathetic quality of responses generated by humans and large language models (LLMs). We evaluate four LLMs that were widely used at the time of study—GPT-4, LLaMA-2-70B-Chat, Gemini-1.0-Pro, and Mixtral-8×7B-Instruct—against a human baseline using a large-scale between-subjects study. A total of 1,000 human participants evaluated the empathetic quality of human- and LLM-generated responses to 2,000 dialogue prompts spanning 32 positive and negative emotions. To complement human judgments, we also employed an LLM-as-judge (GPT-4o-mini) to assess the same responses. Across emotions and evaluators, LLM-generated responses were rated as significantly more empathetic than human-written responses. We also observed that both human judges and the LLM-as-judge tended to rate responses generated by their own group more favorably, indicating self-favoring tendencies. These findings highlight both the strong performance of contemporary LLMs in empathetic responding and the need to interpret human- and LLM-based evaluations with care.
%U https://aclanthology.org/2026.conll-main.21/
%P 358-381
Markdown (Informal)
[Who Generates More Empathetic Responses—Humans or LLMs? A Comparative Evaluation with Human and LLM Judges](https://aclanthology.org/2026.conll-main.21/) (Welivita et al., CoNLL 2026)
ACL