@inproceedings{shu-etal-2024-fusion,
title = "Fusion-Eval: Integrating Assistant Evaluators with {LLM}s",
author = "Shu, Lei and
Wichers, Nevan and
Luo, Liangchen and
Zhu, Yun and
Liu, Yinxiao and
Chen, Jindong and
Meng, Lei",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.18",
pages = "225--238",
abstract = "Evaluating natural language generation (NLG) systems automatically poses significant challenges.Recent studies have employed large language models (LLMs) as reference-free metrics for NLG evaluation, enhancing adaptability to new tasks tasks. However, these methods still show lower correspondence with human judgments compared to specialized neural evaluators.In this paper, we introduce {``}Fusion-Eval{''}, an innovative approach that leverages LLMs to integrate insights from various assistant evaluators. The LLM is given the example to evaluate along with scores from the assistant evaluators. Each of these evaluators specializes in assessing distinct aspects of responses.Fusion-Eval achieves a 0.962 system-level Kendall-Tau correlation with humans on SummEval and a 0.744 turn-level Spearman correlation on TopicalChat, which is significantly higher than baseline methods. These results highlight Fusion-Eval{'}s significant potential in the realm of natural language system evaluation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shu-etal-2024-fusion">
<titleInfo>
<title>Fusion-Eval: Integrating Assistant Evaluators with LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Shu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nevan</namePart>
<namePart type="family">Wichers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liangchen</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinxiao</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jindong</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Meng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluating natural language generation (NLG) systems automatically poses significant challenges.Recent studies have employed large language models (LLMs) as reference-free metrics for NLG evaluation, enhancing adaptability to new tasks tasks. However, these methods still show lower correspondence with human judgments compared to specialized neural evaluators.In this paper, we introduce “Fusion-Eval”, an innovative approach that leverages LLMs to integrate insights from various assistant evaluators. The LLM is given the example to evaluate along with scores from the assistant evaluators. Each of these evaluators specializes in assessing distinct aspects of responses.Fusion-Eval achieves a 0.962 system-level Kendall-Tau correlation with humans on SummEval and a 0.744 turn-level Spearman correlation on TopicalChat, which is significantly higher than baseline methods. These results highlight Fusion-Eval’s significant potential in the realm of natural language system evaluation.</abstract>
<identifier type="citekey">shu-etal-2024-fusion</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.18</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>225</start>
<end>238</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Fusion-Eval: Integrating Assistant Evaluators with LLMs
%A Shu, Lei
%A Wichers, Nevan
%A Luo, Liangchen
%A Zhu, Yun
%A Liu, Yinxiao
%A Chen, Jindong
%A Meng, Lei
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F shu-etal-2024-fusion
%X Evaluating natural language generation (NLG) systems automatically poses significant challenges.Recent studies have employed large language models (LLMs) as reference-free metrics for NLG evaluation, enhancing adaptability to new tasks tasks. However, these methods still show lower correspondence with human judgments compared to specialized neural evaluators.In this paper, we introduce “Fusion-Eval”, an innovative approach that leverages LLMs to integrate insights from various assistant evaluators. The LLM is given the example to evaluate along with scores from the assistant evaluators. Each of these evaluators specializes in assessing distinct aspects of responses.Fusion-Eval achieves a 0.962 system-level Kendall-Tau correlation with humans on SummEval and a 0.744 turn-level Spearman correlation on TopicalChat, which is significantly higher than baseline methods. These results highlight Fusion-Eval’s significant potential in the realm of natural language system evaluation.
%U https://aclanthology.org/2024.emnlp-industry.18
%P 225-238
Markdown (Informal)
[Fusion-Eval: Integrating Assistant Evaluators with LLMs](https://aclanthology.org/2024.emnlp-industry.18) (Shu et al., EMNLP 2024)
ACL
- Lei Shu, Nevan Wichers, Liangchen Luo, Yun Zhu, Yinxiao Liu, Jindong Chen, and Lei Meng. 2024. Fusion-Eval: Integrating Assistant Evaluators with LLMs. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 225–238, Miami, Florida, US. Association for Computational Linguistics.