@inproceedings{gao-etal-2025-mixed,
title = "A Mixed-Language Multi-Document News Summarization Dataset and a Graphs-Based Extract-Generate Model",
author = "Gao, Shengxiang and
Nan, Fang and
Zhang, Yongbing and
Huang, Yuxin and
Tan, Kaiwen and
Yu, Zhengtao",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.468/",
doi = "10.18653/v1/2025.naacl-long.468",
pages = "9255--9265",
ISBN = "979-8-89176-189-6",
abstract = "Existing research on news summarization primarily focuses on single-language single-document (SLSD), single-language multi-document (SLMD) or cross-language single-document (CLSD). However, in real-world scenarios, news about an international event often involves multiple documents in different languages, i.e., mixed-language multi-document (MLMD). Therefore, summarizing MLMD news is of great significance. However, the lack of datasets for MLMD news summarization has constrained the development of research in this area. To fill this gap, we construct a mixed-language multi-document news summarization dataset (MLMD-news), which contains four different languages and 10,992 source document cluster and target summary pairs. Additionally, we propose a graph-based extract-generate model and benchmark various methods on the MLMD-news dataset and publicly release our dataset and code, aiming to advance research in summarization within MLMD scenarios."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gao-etal-2025-mixed">
<titleInfo>
<title>A Mixed-Language Multi-Document News Summarization Dataset and a Graphs-Based Extract-Generate Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shengxiang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fang</namePart>
<namePart type="family">Nan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongbing</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuxin</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaiwen</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengtao</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Existing research on news summarization primarily focuses on single-language single-document (SLSD), single-language multi-document (SLMD) or cross-language single-document (CLSD). However, in real-world scenarios, news about an international event often involves multiple documents in different languages, i.e., mixed-language multi-document (MLMD). Therefore, summarizing MLMD news is of great significance. However, the lack of datasets for MLMD news summarization has constrained the development of research in this area. To fill this gap, we construct a mixed-language multi-document news summarization dataset (MLMD-news), which contains four different languages and 10,992 source document cluster and target summary pairs. Additionally, we propose a graph-based extract-generate model and benchmark various methods on the MLMD-news dataset and publicly release our dataset and code, aiming to advance research in summarization within MLMD scenarios.</abstract>
<identifier type="citekey">gao-etal-2025-mixed</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.468</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.468/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>9255</start>
<end>9265</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Mixed-Language Multi-Document News Summarization Dataset and a Graphs-Based Extract-Generate Model
%A Gao, Shengxiang
%A Nan, Fang
%A Zhang, Yongbing
%A Huang, Yuxin
%A Tan, Kaiwen
%A Yu, Zhengtao
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F gao-etal-2025-mixed
%X Existing research on news summarization primarily focuses on single-language single-document (SLSD), single-language multi-document (SLMD) or cross-language single-document (CLSD). However, in real-world scenarios, news about an international event often involves multiple documents in different languages, i.e., mixed-language multi-document (MLMD). Therefore, summarizing MLMD news is of great significance. However, the lack of datasets for MLMD news summarization has constrained the development of research in this area. To fill this gap, we construct a mixed-language multi-document news summarization dataset (MLMD-news), which contains four different languages and 10,992 source document cluster and target summary pairs. Additionally, we propose a graph-based extract-generate model and benchmark various methods on the MLMD-news dataset and publicly release our dataset and code, aiming to advance research in summarization within MLMD scenarios.
%R 10.18653/v1/2025.naacl-long.468
%U https://aclanthology.org/2025.naacl-long.468/
%U https://doi.org/10.18653/v1/2025.naacl-long.468
%P 9255-9265
Markdown (Informal)
[A Mixed-Language Multi-Document News Summarization Dataset and a Graphs-Based Extract-Generate Model](https://aclanthology.org/2025.naacl-long.468/) (Gao et al., NAACL 2025)
ACL