@inproceedings{sotudeh-etal-2021-tldr9,
title = "{TLDR}9+: A Large Scale Resource for Extreme Summarization of Social Media Posts",
author = "Sotudeh, Sajad and
Deilamsalehy, Hanieh and
Dernoncourt, Franck and
Goharian, Nazli",
editor = "Carenini, Giuseppe and
Cheung, Jackie Chi Kit and
Dong, Yue and
Liu, Fei and
Wang, Lu",
booktitle = "Proceedings of the Third Workshop on New Frontiers in Summarization",
month = nov,
year = "2021",
address = "Online and in Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.newsum-1.15/",
doi = "10.18653/v1/2021.newsum-1.15",
pages = "142--151",
abstract = "Recent models in developing summarization systems consist of millions of parameters and the model performance is highly dependent on the abundance of training data. While most existing summarization corpora contain data in the order of thousands to one million, generation of large-scale summarization datasets in order of couple of millions is yet to be explored. Practically, more data is better at generalizing the training patterns to unseen data. In this paper, we introduce TLDR9+ {--}a large-scale summarization dataset{--} containing over 9 million training instances extracted from Reddit discussion forum ([HTTP]). This dataset is specifically gathered to perform \textit{extreme} summarization (i.e., generating one-sentence summary in high compression and abstraction) and is more than twice larger than the previously proposed dataset. We go one step further and with the help of human annotations, we distill a more fine-grained dataset by sampling High-Quality instances from TLDR9+ and call it TLDRHQ dataset. We further pinpoint different state-of-the-art summarization models on our proposed datasets."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sotudeh-etal-2021-tldr9">
<titleInfo>
<title>TLDR9+: A Large Scale Resource for Extreme Summarization of Social Media Posts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sajad</namePart>
<namePart type="family">Sotudeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanieh</namePart>
<namePart type="family">Deilamsalehy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nazli</namePart>
<namePart type="family">Goharian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on New Frontiers in Summarization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Carenini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackie</namePart>
<namePart type="given">Chi</namePart>
<namePart type="given">Kit</namePart>
<namePart type="family">Cheung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online and in Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent models in developing summarization systems consist of millions of parameters and the model performance is highly dependent on the abundance of training data. While most existing summarization corpora contain data in the order of thousands to one million, generation of large-scale summarization datasets in order of couple of millions is yet to be explored. Practically, more data is better at generalizing the training patterns to unseen data. In this paper, we introduce TLDR9+ –a large-scale summarization dataset– containing over 9 million training instances extracted from Reddit discussion forum ([HTTP]). This dataset is specifically gathered to perform extreme summarization (i.e., generating one-sentence summary in high compression and abstraction) and is more than twice larger than the previously proposed dataset. We go one step further and with the help of human annotations, we distill a more fine-grained dataset by sampling High-Quality instances from TLDR9+ and call it TLDRHQ dataset. We further pinpoint different state-of-the-art summarization models on our proposed datasets.</abstract>
<identifier type="citekey">sotudeh-etal-2021-tldr9</identifier>
<identifier type="doi">10.18653/v1/2021.newsum-1.15</identifier>
<location>
<url>https://aclanthology.org/2021.newsum-1.15/</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>142</start>
<end>151</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TLDR9+: A Large Scale Resource for Extreme Summarization of Social Media Posts
%A Sotudeh, Sajad
%A Deilamsalehy, Hanieh
%A Dernoncourt, Franck
%A Goharian, Nazli
%Y Carenini, Giuseppe
%Y Cheung, Jackie Chi Kit
%Y Dong, Yue
%Y Liu, Fei
%Y Wang, Lu
%S Proceedings of the Third Workshop on New Frontiers in Summarization
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and in Dominican Republic
%F sotudeh-etal-2021-tldr9
%X Recent models in developing summarization systems consist of millions of parameters and the model performance is highly dependent on the abundance of training data. While most existing summarization corpora contain data in the order of thousands to one million, generation of large-scale summarization datasets in order of couple of millions is yet to be explored. Practically, more data is better at generalizing the training patterns to unseen data. In this paper, we introduce TLDR9+ –a large-scale summarization dataset– containing over 9 million training instances extracted from Reddit discussion forum ([HTTP]). This dataset is specifically gathered to perform extreme summarization (i.e., generating one-sentence summary in high compression and abstraction) and is more than twice larger than the previously proposed dataset. We go one step further and with the help of human annotations, we distill a more fine-grained dataset by sampling High-Quality instances from TLDR9+ and call it TLDRHQ dataset. We further pinpoint different state-of-the-art summarization models on our proposed datasets.
%R 10.18653/v1/2021.newsum-1.15
%U https://aclanthology.org/2021.newsum-1.15/
%U https://doi.org/10.18653/v1/2021.newsum-1.15
%P 142-151
Markdown (Informal)
[TLDR9+: A Large Scale Resource for Extreme Summarization of Social Media Posts](https://aclanthology.org/2021.newsum-1.15/) (Sotudeh et al., NewSum 2021)
ACL