@inproceedings{askari-etal-2025-assessing,
title = "Assessing {LLM}s for Zero-shot Abstractive Summarization Through the Lens of Relevance Paraphrasing",
author = "Askari, Hadi and
Chhabra, Anshuman and
Chen, Muhao and
Mohapatra, Prasant",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.116/",
doi = "10.18653/v1/2025.findings-naacl.116",
pages = "2187--2201",
ISBN = "979-8-89176-195-7",
abstract = "Large Language Models (LLMs) have achieved state-of-the-art performance at zero-shot generation of abstractive summaries for given articles. However, little is known about the robustness of such a process of zero-shot summarization.To bridge this gap, we propose *relevance paraphrasing*, a simple strategy that can be used to measure the robustness of LLMs as summarizers. The relevance paraphrasing approach identifies the most *relevant* sentences that contribute to generating an ideal summary, and then *paraphrases* these inputs to obtain a minimally perturbed dataset. Then, by evaluating model performance for summarization on both the original and perturbed datasets, we can assess the LLM{'}s one aspect of robustness. We conduct extensive experiments with relevance paraphrasing on 4 diverse datasets, as well as 4 LLMs of different sizes (GPT-3.5-Turbo, Llama-2-13B, Mistral-7B-v1, and Dolly-v2-7B). Our results indicate that LLMs are not consistent summarizers for the minimally perturbed articles, necessitating further improvements."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="askari-etal-2025-assessing">
<titleInfo>
<title>Assessing LLMs for Zero-shot Abstractive Summarization Through the Lens of Relevance Paraphrasing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hadi</namePart>
<namePart type="family">Askari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anshuman</namePart>
<namePart type="family">Chhabra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prasant</namePart>
<namePart type="family">Mohapatra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) have achieved state-of-the-art performance at zero-shot generation of abstractive summaries for given articles. However, little is known about the robustness of such a process of zero-shot summarization.To bridge this gap, we propose *relevance paraphrasing*, a simple strategy that can be used to measure the robustness of LLMs as summarizers. The relevance paraphrasing approach identifies the most *relevant* sentences that contribute to generating an ideal summary, and then *paraphrases* these inputs to obtain a minimally perturbed dataset. Then, by evaluating model performance for summarization on both the original and perturbed datasets, we can assess the LLM’s one aspect of robustness. We conduct extensive experiments with relevance paraphrasing on 4 diverse datasets, as well as 4 LLMs of different sizes (GPT-3.5-Turbo, Llama-2-13B, Mistral-7B-v1, and Dolly-v2-7B). Our results indicate that LLMs are not consistent summarizers for the minimally perturbed articles, necessitating further improvements.</abstract>
<identifier type="citekey">askari-etal-2025-assessing</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.116</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.116/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>2187</start>
<end>2201</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Assessing LLMs for Zero-shot Abstractive Summarization Through the Lens of Relevance Paraphrasing
%A Askari, Hadi
%A Chhabra, Anshuman
%A Chen, Muhao
%A Mohapatra, Prasant
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F askari-etal-2025-assessing
%X Large Language Models (LLMs) have achieved state-of-the-art performance at zero-shot generation of abstractive summaries for given articles. However, little is known about the robustness of such a process of zero-shot summarization.To bridge this gap, we propose *relevance paraphrasing*, a simple strategy that can be used to measure the robustness of LLMs as summarizers. The relevance paraphrasing approach identifies the most *relevant* sentences that contribute to generating an ideal summary, and then *paraphrases* these inputs to obtain a minimally perturbed dataset. Then, by evaluating model performance for summarization on both the original and perturbed datasets, we can assess the LLM’s one aspect of robustness. We conduct extensive experiments with relevance paraphrasing on 4 diverse datasets, as well as 4 LLMs of different sizes (GPT-3.5-Turbo, Llama-2-13B, Mistral-7B-v1, and Dolly-v2-7B). Our results indicate that LLMs are not consistent summarizers for the minimally perturbed articles, necessitating further improvements.
%R 10.18653/v1/2025.findings-naacl.116
%U https://aclanthology.org/2025.findings-naacl.116/
%U https://doi.org/10.18653/v1/2025.findings-naacl.116
%P 2187-2201
Markdown (Informal)
[Assessing LLMs for Zero-shot Abstractive Summarization Through the Lens of Relevance Paraphrasing](https://aclanthology.org/2025.findings-naacl.116/) (Askari et al., Findings 2025)
ACL