@inproceedings{nguyen-etal-2023-loralay,
title = "{L}o{R}a{L}ay: A Multilingual and Multimodal Dataset for Long Range and Layout-Aware Summarization",
author = "Nguyen, Laura and
Scialom, Thomas and
Piwowarski, Benjamin and
Staiano, Jacopo",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.eacl-main.46",
doi = "10.18653/v1/2023.eacl-main.46",
pages = "636--651",
abstract = "Text Summarization is a popular task and an active area of research for the Natural Language Processing community. By definition, it requires to account for long input texts, a characteristic which poses computational challenges for neural models. Moreover, real-world documents come in a variety of complex, visually-rich, layouts. This information is of great relevance, whether to highlight salient content or to encode long-range interactions between textual passages. Yet, all publicly available summarization datasets only provide plain text content. To facilitate research on how to exploit visual/layout information to better capture long-range dependencies in summarization models, we present LoRaLay, a collection of datasets for long-range summarization with accompanying visual/layout information. We extend existing and popular English datasets (arXiv and PubMed) with layout information and propose four novel datasets {--} consistently built from scholar resources {--} covering French, Spanish, Portuguese, and Korean languages. Further, we propose new baselines merging layout-aware and long-range models {--} two orthogonal approaches {--} and obtain state-of-the-art results, showing the importance of combining both lines of research.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nguyen-etal-2023-loralay">
<titleInfo>
<title>LoRaLay: A Multilingual and Multimodal Dataset for Long Range and Layout-Aware Summarization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Scialom</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Piwowarski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacopo</namePart>
<namePart type="family">Staiano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Vlachos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabelle</namePart>
<namePart type="family">Augenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text Summarization is a popular task and an active area of research for the Natural Language Processing community. By definition, it requires to account for long input texts, a characteristic which poses computational challenges for neural models. Moreover, real-world documents come in a variety of complex, visually-rich, layouts. This information is of great relevance, whether to highlight salient content or to encode long-range interactions between textual passages. Yet, all publicly available summarization datasets only provide plain text content. To facilitate research on how to exploit visual/layout information to better capture long-range dependencies in summarization models, we present LoRaLay, a collection of datasets for long-range summarization with accompanying visual/layout information. We extend existing and popular English datasets (arXiv and PubMed) with layout information and propose four novel datasets – consistently built from scholar resources – covering French, Spanish, Portuguese, and Korean languages. Further, we propose new baselines merging layout-aware and long-range models – two orthogonal approaches – and obtain state-of-the-art results, showing the importance of combining both lines of research.</abstract>
<identifier type="citekey">nguyen-etal-2023-loralay</identifier>
<identifier type="doi">10.18653/v1/2023.eacl-main.46</identifier>
<location>
<url>https://aclanthology.org/2023.eacl-main.46</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>636</start>
<end>651</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LoRaLay: A Multilingual and Multimodal Dataset for Long Range and Layout-Aware Summarization
%A Nguyen, Laura
%A Scialom, Thomas
%A Piwowarski, Benjamin
%A Staiano, Jacopo
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F nguyen-etal-2023-loralay
%X Text Summarization is a popular task and an active area of research for the Natural Language Processing community. By definition, it requires to account for long input texts, a characteristic which poses computational challenges for neural models. Moreover, real-world documents come in a variety of complex, visually-rich, layouts. This information is of great relevance, whether to highlight salient content or to encode long-range interactions between textual passages. Yet, all publicly available summarization datasets only provide plain text content. To facilitate research on how to exploit visual/layout information to better capture long-range dependencies in summarization models, we present LoRaLay, a collection of datasets for long-range summarization with accompanying visual/layout information. We extend existing and popular English datasets (arXiv and PubMed) with layout information and propose four novel datasets – consistently built from scholar resources – covering French, Spanish, Portuguese, and Korean languages. Further, we propose new baselines merging layout-aware and long-range models – two orthogonal approaches – and obtain state-of-the-art results, showing the importance of combining both lines of research.
%R 10.18653/v1/2023.eacl-main.46
%U https://aclanthology.org/2023.eacl-main.46
%U https://doi.org/10.18653/v1/2023.eacl-main.46
%P 636-651
Markdown (Informal)
[LoRaLay: A Multilingual and Multimodal Dataset for Long Range and Layout-Aware Summarization](https://aclanthology.org/2023.eacl-main.46) (Nguyen et al., EACL 2023)
ACL