@inproceedings{hangchen-etal-2025-misp,
title = "{MISP}-Meeting: A Real-World Dataset with Multimodal Cues for Long-form Meeting Transcription and Summarization",
author = "HangChen, HangChen and
Yang, Chao-Han Huck and
Gu, Jia-Chen and
Siniscalchi, Sabato Marco and
Du, Jun",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.753/",
doi = "10.18653/v1/2025.acl-long.753",
pages = "15479--15492",
ISBN = "979-8-89176-251-0",
abstract = "We introduce MISP-Meeting, a new real-world, multimodal dataset that covers subject-oriented long-form content. MISP-Meeting integrates information from speech, vision, and text modalities to facilitate automatic meeting transcription and summarization (AMTS). Challenging conditions in human meetings, including far-field speech recognition, audio-visual understanding, and long-term summarization, have been carefully evaluated. We benchmark state-of-the-art automatic speech recognition (ASR) and large language models (LLMs) on this dataset, enhanced with multimodal cues. Experiments demonstrate that incorporating multimodal cues, such as lip movements and visual focus of attention, significantly enhances transcription accuracy, reducing the character error rate (CER) from 36.60{\%} to 20.27{\%} via guided source separation (GSS), fine-tuning, and audio-visual fusion. Furthermore, our summarization analysis reveals a direct correlation between ASR quality and summary coherence, underscoring the importance of robust multimodal modeling. Our dataset and codebase will be released as open source."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hangchen-etal-2025-misp">
<titleInfo>
<title>MISP-Meeting: A Real-World Dataset with Multimodal Cues for Long-form Meeting Transcription and Summarization</title>
</titleInfo>
<name type="personal">
<namePart type="given">HangChen</namePart>
<namePart type="family">HangChen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao-Han</namePart>
<namePart type="given">Huck</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jia-Chen</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sabato</namePart>
<namePart type="given">Marco</namePart>
<namePart type="family">Siniscalchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>We introduce MISP-Meeting, a new real-world, multimodal dataset that covers subject-oriented long-form content. MISP-Meeting integrates information from speech, vision, and text modalities to facilitate automatic meeting transcription and summarization (AMTS). Challenging conditions in human meetings, including far-field speech recognition, audio-visual understanding, and long-term summarization, have been carefully evaluated. We benchmark state-of-the-art automatic speech recognition (ASR) and large language models (LLMs) on this dataset, enhanced with multimodal cues. Experiments demonstrate that incorporating multimodal cues, such as lip movements and visual focus of attention, significantly enhances transcription accuracy, reducing the character error rate (CER) from 36.60% to 20.27% via guided source separation (GSS), fine-tuning, and audio-visual fusion. Furthermore, our summarization analysis reveals a direct correlation between ASR quality and summary coherence, underscoring the importance of robust multimodal modeling. Our dataset and codebase will be released as open source.</abstract>
<identifier type="citekey">hangchen-etal-2025-misp</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.753</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.753/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>15479</start>
<end>15492</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MISP-Meeting: A Real-World Dataset with Multimodal Cues for Long-form Meeting Transcription and Summarization
%A Chen, Hang
%A Yang, Chao-Han Huck
%A Gu, Jia-Chen
%A Siniscalchi, Sabato Marco
%A Du, Jun
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F hangchen-etal-2025-misp
%X We introduce MISP-Meeting, a new real-world, multimodal dataset that covers subject-oriented long-form content. MISP-Meeting integrates information from speech, vision, and text modalities to facilitate automatic meeting transcription and summarization (AMTS). Challenging conditions in human meetings, including far-field speech recognition, audio-visual understanding, and long-term summarization, have been carefully evaluated. We benchmark state-of-the-art automatic speech recognition (ASR) and large language models (LLMs) on this dataset, enhanced with multimodal cues. Experiments demonstrate that incorporating multimodal cues, such as lip movements and visual focus of attention, significantly enhances transcription accuracy, reducing the character error rate (CER) from 36.60% to 20.27% via guided source separation (GSS), fine-tuning, and audio-visual fusion. Furthermore, our summarization analysis reveals a direct correlation between ASR quality and summary coherence, underscoring the importance of robust multimodal modeling. Our dataset and codebase will be released as open source.
%R 10.18653/v1/2025.acl-long.753
%U https://aclanthology.org/2025.acl-long.753/
%U https://doi.org/10.18653/v1/2025.acl-long.753
%P 15479-15492
Markdown (Informal)
[MISP-Meeting: A Real-World Dataset with Multimodal Cues for Long-form Meeting Transcription and Summarization](https://aclanthology.org/2025.acl-long.753/) (Chen et al., ACL 2025)
ACL
Hang Chen, Chao-Han Huck Yang, Jia-Chen Gu, Sabato Marco Siniscalchi, and Jun Du. 2025. MISP-Meeting: A Real-World Dataset with Multimodal Cues for Long-form Meeting Transcription and Summarization. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 15479–15492, Vienna, Austria. Association for Computational Linguistics.