@inproceedings{mrabet-demner-fushman-2020-holms,
title = "{HOLMS}: Alternative Summary Evaluation with Large Language Models",
author = "Mrabet, Yassine and
Demner-Fushman, Dina",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.498",
doi = "10.18653/v1/2020.coling-main.498",
pages = "5679--5688",
abstract = "Efficient document summarization requires evaluation measures that can not only rank a set of systems based on an average score, but also highlight which individual summary is better than another. However, despite the very active research on summarization approaches, few works have proposed new evaluation measures in the recent years. The standard measures relied upon for the development of summarization systems are most often ROUGE and BLEU which, despite being efficient in overall system ranking, remain lexical in nature and have a limited potential when it comes to training neural networks. In this paper, we present a new hybrid evaluation measure for summarization, called HOLMS, that combines both language models pre-trained on large corpora and lexical similarity measures. Through several experiments, we show that HOLMS outperforms ROUGE and BLEU substantially in its correlation with human judgments on several extractive summarization datasets for both linguistic quality and pyramid scores.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mrabet-demner-fushman-2020-holms">
  <titleInfo>
    <title>HOLMS: Alternative Summary Evaluation with Large Language Models</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Yassine</namePart>
    <namePart type="family">Mrabet</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Dina</namePart>
    <namePart type="family">Demner-Fushman</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2020-12</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <relatedItem type="host">
    <titleInfo>
      <title>Proceedings of the 28th International Conference on Computational Linguistics</title>
    </titleInfo>
    <originInfo>
      <publisher>International Committee on Computational Linguistics</publisher>
      <place>
        <placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">conference publication</genre>
  </relatedItem>
  <abstract>Efficient document summarization requires evaluation measures that can not only rank a set of systems based on an average score, but also highlight which individual summary is better than another. However, despite the very active research on summarization approaches, few works have proposed new evaluation measures in recent years. The standard measures relied upon for the development of summarization systems are most often ROUGE and BLEU, which, despite being efficient in overall system ranking, remain lexical in nature and have limited potential when it comes to training neural networks. In this paper, we present a new hybrid evaluation measure for summarization, called HOLMS, that combines both language models pre-trained on large corpora and lexical similarity measures. Through several experiments, we show that HOLMS outperforms ROUGE and BLEU substantially in its correlation with human judgments on several extractive summarization datasets for both linguistic quality and pyramid scores.</abstract>
  <identifier type="citekey">mrabet-demner-fushman-2020-holms</identifier>
  <identifier type="doi">10.18653/v1/2020.coling-main.498</identifier>
  <location>
    <url>https://aclanthology.org/2020.coling-main.498</url>
  </location>
  <part>
    <date>2020-12</date>
    <extent unit="page">
      <start>5679</start>
      <end>5688</end>
    </extent>
  </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HOLMS: Alternative Summary Evaluation with Large Language Models
%A Mrabet, Yassine
%A Demner-Fushman, Dina
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F mrabet-demner-fushman-2020-holms
%X Efficient document summarization requires evaluation measures that can not only rank a set of systems based on an average score, but also highlight which individual summary is better than another. However, despite the very active research on summarization approaches, few works have proposed new evaluation measures in recent years. The standard measures relied upon for the development of summarization systems are most often ROUGE and BLEU, which, despite being efficient in overall system ranking, remain lexical in nature and have limited potential when it comes to training neural networks. In this paper, we present a new hybrid evaluation measure for summarization, called HOLMS, that combines both language models pre-trained on large corpora and lexical similarity measures. Through several experiments, we show that HOLMS outperforms ROUGE and BLEU substantially in its correlation with human judgments on several extractive summarization datasets for both linguistic quality and pyramid scores.
%R 10.18653/v1/2020.coling-main.498
%U https://aclanthology.org/2020.coling-main.498
%U https://doi.org/10.18653/v1/2020.coling-main.498
%P 5679-5688
Markdown (Informal)
[HOLMS: Alternative Summary Evaluation with Large Language Models](https://aclanthology.org/2020.coling-main.498) (Mrabet & Demner-Fushman, COLING 2020)
ACL
Yassine Mrabet and Dina Demner-Fushman. 2020. [HOLMS: Alternative Summary Evaluation with Large Language Models](https://aclanthology.org/2020.coling-main.498). In *Proceedings of the 28th International Conference on Computational Linguistics*, pages 5679–5688, Barcelona, Spain (Online). International Committee on Computational Linguistics.
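
The abstract describes HOLMS as a hybrid measure that combines a language model pre-trained on large corpora with lexical similarity. As an illustration of that general idea only, and not the paper's actual formulation, here is a minimal Python sketch: the model choice (`bert-base-uncased`), the mean-pooling step, the unigram-F1 lexical stand-in, and the mixing weight `alpha` are all placeholder assumptions.

```python
# Illustrative sketch of a hybrid summary-evaluation score: a convex
# combination of (a) cosine similarity of mean-pooled LM embeddings and
# (b) a ROUGE-1-like unigram overlap F1. NOT the HOLMS formulation.
import torch
from transformers import AutoModel, AutoTokenizer

_tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model
_lm = AutoModel.from_pretrained("bert-base-uncased")

def lexical_f1(candidate: str, reference: str) -> float:
    """Unigram overlap F1: a simple stand-in for the lexical component."""
    c, r = candidate.lower().split(), reference.lower().split()
    overlap = len(set(c) & set(r))
    if not c or not r or not overlap:
        return 0.0
    prec, rec = overlap / len(set(c)), overlap / len(set(r))
    return 2 * prec * rec / (prec + rec)

def lm_similarity(candidate: str, reference: str) -> float:
    """Cosine similarity of mean-pooled hidden states (assumed pooling)."""
    with torch.no_grad():
        embs = []
        for text in (candidate, reference):
            ids = _tok(text, return_tensors="pt", truncation=True)
            embs.append(_lm(**ids).last_hidden_state.mean(dim=1))
        return torch.nn.functional.cosine_similarity(embs[0], embs[1]).item()

def hybrid_score(candidate: str, reference: str, alpha: float = 0.5) -> float:
    """Mix the two components; alpha is an arbitrary illustrative weight."""
    return alpha * lm_similarity(candidate, reference) \
        + (1 - alpha) * lexical_f1(candidate, reference)

# Example: hybrid_score("the cat sat on the mat", "a cat was sitting on the mat")
```

Unlike a purely lexical score, the embedding component is differentiable and rewards paraphrases with little word overlap, which is the motivation the abstract gives for moving beyond ROUGE and BLEU.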