@inproceedings{siledar-etal-2024-one,
title = "One Prompt To Rule Them All: {LLM}s for Opinion Summary Evaluation",
author = "Siledar, Tejpalsingh and
Nath, Swaroop and
Muddu, Sankara and
Rangaraju, Rupasai and
Nath, Swaprava and
Bhattacharyya, Pushpak and
Banerjee, Suman and
Patil, Amey and
Singh, Sudhanshu and
Chelliah, Muthusamy and
Garera, Nikesh",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.655/",
doi = "10.18653/v1/2024.acl-long.655",
pages = "12119--12134",
abstract = "Evaluation of opinion summaries using conventional reference-based metrics often fails to provide a comprehensive assessment and exhibits limited correlation with human judgments. While Large Language Models (LLMs) have shown promise as reference-free metrics for NLG evaluation, their potential remains unexplored for opinion summary evaluation. Furthermore, the absence of sufficient opinion summary evaluation datasets hinders progress in this area. In response, we introduce the SUMMEVAL-OP dataset, encompassing 7 dimensions crucial to the evaluation of opinion summaries: fluency, coherence, relevance, faithfulness, aspect coverage, sentiment consistency, and specificity. We propose OP-I-PROMPT, a dimension-independent prompt, along with OP-PROMPTS, a dimension-dependent set of prompts for opinion summary evaluation. Our experiments demonstrate that OP-I-PROMPT emerges as a good alternative for evaluating opinion summaries, achieving an average Spearman correlation of 0.70 with human judgments, surpassing prior methodologies. Remarkably, we are the first to explore the efficacy of LLMs as evaluators, both on closed-source and open-source models, in the opinion summary evaluation domain."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="siledar-etal-2024-one">
<titleInfo>
<title>One Prompt To Rule Them All: LLMs for Opinion Summary Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tejpalsingh</namePart>
<namePart type="family">Siledar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Swaroop</namePart>
<namePart type="family">Nath</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sankara</namePart>
<namePart type="family">Muddu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rupasai</namePart>
<namePart type="family">Rangaraju</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Swaprava</namePart>
<namePart type="family">Nath</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suman</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amey</namePart>
<namePart type="family">Patil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudhanshu</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muthusamy</namePart>
<namePart type="family">Chelliah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikesh</namePart>
<namePart type="family">Garera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluation of opinion summaries using conventional reference-based metrics often fails to provide a comprehensive assessment and exhibits limited correlation with human judgments. While Large Language Models (LLMs) have shown promise as reference-free metrics for NLG evaluation, their potential remains unexplored for opinion summary evaluation. Furthermore, the absence of sufficient opinion summary evaluation datasets hinders progress in this area. In response, we introduce the SUMMEVAL-OP dataset, encompassing 7 dimensions crucial to the evaluation of opinion summaries: fluency, coherence, relevance, faithfulness, aspect coverage, sentiment consistency, and specificity. We propose OP-I-PROMPT, a dimension-independent prompt, along with OP-PROMPTS, a dimension-dependent set of prompts for opinion summary evaluation. Our experiments demonstrate that OP-I-PROMPT emerges as a good alternative for evaluating opinion summaries, achieving an average Spearman correlation of 0.70 with human judgments, surpassing prior methodologies. Remarkably, we are the first to explore the efficacy of LLMs as evaluators, both on closed-source and open-source models, in the opinion summary evaluation domain.</abstract>
<identifier type="citekey">siledar-etal-2024-one</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.655</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.655/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>12119</start>
<end>12134</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T One Prompt To Rule Them All: LLMs for Opinion Summary Evaluation
%A Siledar, Tejpalsingh
%A Nath, Swaroop
%A Muddu, Sankara
%A Rangaraju, Rupasai
%A Nath, Swaprava
%A Bhattacharyya, Pushpak
%A Banerjee, Suman
%A Patil, Amey
%A Singh, Sudhanshu
%A Chelliah, Muthusamy
%A Garera, Nikesh
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F siledar-etal-2024-one
%X Evaluation of opinion summaries using conventional reference-based metrics often fails to provide a comprehensive assessment and exhibits limited correlation with human judgments. While Large Language Models (LLMs) have shown promise as reference-free metrics for NLG evaluation, their potential remains unexplored for opinion summary evaluation. Furthermore, the absence of sufficient opinion summary evaluation datasets hinders progress in this area. In response, we introduce the SUMMEVAL-OP dataset, encompassing 7 dimensions crucial to the evaluation of opinion summaries: fluency, coherence, relevance, faithfulness, aspect coverage, sentiment consistency, and specificity. We propose OP-I-PROMPT, a dimension-independent prompt, along with OP-PROMPTS, a dimension-dependent set of prompts for opinion summary evaluation. Our experiments demonstrate that OP-I-PROMPT emerges as a good alternative for evaluating opinion summaries, achieving an average Spearman correlation of 0.70 with human judgments, surpassing prior methodologies. Remarkably, we are the first to explore the efficacy of LLMs as evaluators, both on closed-source and open-source models, in the opinion summary evaluation domain.
%R 10.18653/v1/2024.acl-long.655
%U https://aclanthology.org/2024.luhme-long.655/
%U https://doi.org/10.18653/v1/2024.acl-long.655
%P 12119-12134
Markdown (Informal)
[One Prompt To Rule Them All: LLMs for Opinion Summary Evaluation](https://aclanthology.org/2024.luhme-long.655/) (Siledar et al., ACL 2024)
ACL
- Tejpalsingh Siledar, Swaroop Nath, Sankara Muddu, Rupasai Rangaraju, Swaprava Nath, Pushpak Bhattacharyya, Suman Banerjee, Amey Patil, Sudhanshu Singh, Muthusamy Chelliah, and Nikesh Garera. 2024. One Prompt To Rule Them All: LLMs for Opinion Summary Evaluation. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 12119–12134, Bangkok, Thailand. Association for Computational Linguistics.