@inproceedings{puduppully-etal-2023-multi,
title = "Multi-Document Summarization with Centroid-Based Pretraining",
author = "Puduppully, Ratish Surendran and
Jain, Parag and
Chen, Nancy and
Steedman, Mark",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-short.13",
doi = "10.18653/v1/2023.acl-short.13",
pages = "128--138",
abstract = "In Multi-Document Summarization (MDS), the input can be modeled as a set of documents, and the output is its summary. In this paper, we focus on pretraining objectives for MDS. Specifically, we introduce a novel pretraining objective, which involves selecting the ROUGE-based centroid of each document cluster as a proxy for its summary. Our objective thus does not require human written summaries and can be utilized for pretraining on a dataset consisting solely of document sets. Through zero-shot, few-shot, and fully supervised experiments on multiple MDS datasets, we show that our model \textit{Centrum} is better or comparable to a state-of-the-art model. We make the pretrained and fine-tuned models freely available to the research community \url{https://github.com/ratishsp/centrum}.",
}
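
The abstract describes the centroid objective only in prose, so here is a minimal sketch of the selection step it names: score every document in a cluster against the others with ROUGE and keep the highest-scoring one as the pseudo-summary. This is not the authors' code (see https://github.com/ratishsp/centrum for the released models and implementation); it assumes Google's rouge_score package, and the helper name select_centroid and the averaging of ROUGE-1/2/Lsum F1 are illustrative choices.

from rouge_score import rouge_scorer  # pip install rouge-score

def select_centroid(cluster: list[str]) -> str:
    """Return the document whose mean ROUGE F1 against the rest of the
    cluster is highest; it stands in as the cluster's pseudo-summary."""
    if len(cluster) == 1:
        return cluster[0]
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeLsum"], use_stemmer=True
    )
    best_doc, best_score = cluster[0], float("-inf")
    for i, candidate in enumerate(cluster):
        others = [doc for j, doc in enumerate(cluster) if j != i]
        total = 0.0
        for reference in others:
            # scorer.score(target, prediction) -> {metric: Score(precision, recall, fmeasure)}
            scores = scorer.score(reference, candidate)
            total += sum(s.fmeasure for s in scores.values()) / len(scores)
        mean_score = total / len(others)
        if mean_score > best_score:
            best_doc, best_score = candidate, mean_score
    return best_doc

# Example: the document sharing the most n-gram overlap with the others
# is selected, so no human-written summary is needed for pretraining.
cluster = [
    "The storm hit the coast on Monday, causing floods.",
    "A storm caused floods along the coast on Monday.",
    "Officials opened shelters after Monday's coastal storm and floods.",
]
print(select_centroid(cluster))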
Ratish Surendran Puduppully, Parag Jain, Nancy Chen, and Mark Steedman. 2023. Multi-Document Summarization with Centroid-Based Pretraining. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 128–138, Toronto, Canada. Association for Computational Linguistics.