@inproceedings{atharva-etal-2023-current,
title = "The Current Landscape of Multimodal Summarization",
author = "Atharva, Kumbhar and
Harsh, Kulkarni and
Atmaja, Mali and
Sheetal, Sonawane and
Prathamesh, Mulay",
editor = "Jyoti, D. Pawar and
Sobha, Lalitha Devi",
booktitle = "Proceedings of the 20th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2023",
address = "Goa University, Goa, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2023.icon-1.82",
pages = "797--806",
abstract = "In recent years, the rise of multimedia content on the internet has inundated users with a vast and diverse array of information, including images, videos, and textual data. Handling this flood of multimedia data necessitates advanced techniques capable of distilling this wealth of information into concise, meaningful summaries. Multimodal summarization, which involves generating summaries from multiple modalities such as text, images, and videos, has become a pivotal area of research in natural language processing, computer vision, and multimedia analysis. This survey paper offers an overview of the state-of-the-art techniques, methodologies, and challenges in the domain of multimodal summarization. We highlight the interdisciplinary advancements made in this field specifically on the lines of two main frontiers: 1) Multimodal Abstractive Summarization, and 2) Pre-training Language Models in Multimodal Summarization. By synthesizing insights from existing research, we aim to provide a holistic understanding of multimodal summarization techniques.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="atharva-etal-2023-current">
<titleInfo>
<title>The Current Landscape of Multimodal Summarization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kumbhar</namePart>
<namePart type="family">Atharva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kulkarni</namePart>
<namePart type="family">Harsh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mali</namePart>
<namePart type="family">Atmaja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sonawane</namePart>
<namePart type="family">Sheetal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mulay</namePart>
<namePart type="family">Prathamesh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">D</namePart>
<namePart type="given">Pawar</namePart>
<namePart type="family">Jyoti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lalitha</namePart>
<namePart type="given">Devi</namePart>
<namePart type="family">Sobha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Goa University, Goa, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In recent years, the rise of multimedia content on the internet has inundated users with a vast and diverse array of information, including images, videos, and textual data. Handling this flood of multimedia data necessitates advanced techniques capable of distilling this wealth of information into concise, meaningful summaries. Multimodal summarization, which involves generating summaries from multiple modalities such as text, images, and videos, has become a pivotal area of research in natural language processing, computer vision, and multimedia analysis. This survey paper offers an overview of the state-of-the-art techniques, methodologies, and challenges in the domain of multimodal summarization. We highlight the interdisciplinary advancements made in this field specifically on the lines of two main frontiers: 1) Multimodal Abstractive Summarization, and 2) Pre-training Language Models in Multimodal Summarization. By synthesizing insights from existing research, we aim to provide a holistic understanding of multimodal summarization techniques.</abstract>
<identifier type="citekey">atharva-etal-2023-current</identifier>
<location>
<url>https://aclanthology.org/2023.icon-1.82</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>797</start>
<end>806</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Current Landscape of Multimodal Summarization
%A Atharva, Kumbhar
%A Harsh, Kulkarni
%A Atmaja, Mali
%A Sheetal, Sonawane
%A Prathamesh, Mulay
%Y Jyoti, D. Pawar
%Y Sobha, Lalitha Devi
%S Proceedings of the 20th International Conference on Natural Language Processing (ICON)
%D 2023
%8 December
%I NLP Association of India (NLPAI)
%C Goa University, Goa, India
%F atharva-etal-2023-current
%X In recent years, the rise of multimedia content on the internet has inundated users with a vast and diverse array of information, including images, videos, and textual data. Handling this flood of multimedia data necessitates advanced techniques capable of distilling this wealth of information into concise, meaningful summaries. Multimodal summarization, which involves generating summaries from multiple modalities such as text, images, and videos, has become a pivotal area of research in natural language processing, computer vision, and multimedia analysis. This survey paper offers an overview of the state-of-the-art techniques, methodologies, and challenges in the domain of multimodal summarization. We highlight the interdisciplinary advancements made in this field specifically on the lines of two main frontiers: 1) Multimodal Abstractive Summarization, and 2) Pre-training Language Models in Multimodal Summarization. By synthesizing insights from existing research, we aim to provide a holistic understanding of multimodal summarization techniques.
%U https://aclanthology.org/2023.icon-1.82
%P 797-806
Markdown (Informal)
[The Current Landscape of Multimodal Summarization](https://aclanthology.org/2023.icon-1.82) (Atharva et al., ICON 2023)
ACL
- Kumbhar Atharva, Kulkarni Harsh, Mali Atmaja, Sonawane Sheetal, and Mulay Prathamesh. 2023. The Current Landscape of Multimodal Summarization. In Proceedings of the 20th International Conference on Natural Language Processing (ICON), pages 797–806, Goa University, Goa, India. NLP Association of India (NLPAI).