% BibTeX record for the paper "Multimodal Generation with Consistency Transferring"
% (Findings of NAACL 2025). The abstract is kept verbatim from the published record.
@inproceedings{qiu-etal-2025-multimodal,
    title     = {Multimodal Generation with Consistency Transferring},
    author    = {Qiu, Junxiang and
                 Lu, Jinda and
                 Wang, Shuo},
    editor    = {Chiruzzo, Luis and
                 Ritter, Alan and
                 Wang, Lu},
    booktitle = {Findings of the Association for Computational Linguistics: {NAACL} 2025},
    month     = apr,
    year      = {2025},
    address   = {Albuquerque, New Mexico},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.findings-naacl.31/},
    doi       = {10.18653/v1/2025.findings-naacl.31},
    pages     = {504--513},
    isbn      = {979-8-89176-195-7},
    abstract  = {Multimodal content generation has become an area of considerable interest. However, existing methods are hindered by limitations related to model constraints and training strategies: (1) Most current approaches rely on training models from scratch, resulting in inefficient training processes when extending these models; (2) There is a lack of constraints on adjacent steps within the models, leading to slow sampling and poor generation stability across various sampling methods. To address the issues, we introduce Multimodal Generation with Consistency Transferring (MGCT). The method introduces two key improvements: (1) A Model Consistency Transferring (MCT) strategy to acquire low-cost prior knowledge, increasing training efficiency and avoiding error accumulation; (2) A Layer Consistency Transferring (LCT) between adjacent steps, enhancing denoising capabilities at each step and improving model stability across various generation methods. These strategies ensure the consistency of jointly generated multimodal content and improving training efficiency. Experiments show that the algorithm enhances the model{'}s ability to capture actions and depict backgrounds more effectively. In both the AIST++ and Landscape datasets, it improves video generation speed by approximately 40{\%} and quality by about 39.3{\%}, while also achieving a slight 3{\%} improvement in audio quality over the baseline.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS record for the paper with citekey qiu-etal-2025-multimodal: title, three authors, issue date, host proceedings, abstract, identifiers, URL, and page extent. -->
<mods ID="qiu-etal-2025-multimodal">
<titleInfo>
<title>Multimodal Generation with Consistency Transferring</title>
</titleInfo>
<!-- Authors, one name element each, in listed order. -->
<name type="personal">
<namePart type="given">Junxiang</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinda</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuo</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing the paper, with its editors, publisher, place, genre, and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Multimodal content generation has become an area of considerable interest. However, existing methods are hindered by limitations related to model constraints and training strategies: (1) Most current approaches rely on training models from scratch, resulting in inefficient training processes when extending these models; (2) There is a lack of constraints on adjacent steps within the models, leading to slow sampling and poor generation stability across various sampling methods. To address the issues, we introduce Multimodal Generation with Consistency Transferring (MGCT). The method introduces two key improvements: (1) A Model Consistency Transferring (MCT) strategy to acquire low-cost prior knowledge, increasing training efficiency and avoiding error accumulation; (2) A Layer Consistency Transferring (LCT) between adjacent steps, enhancing denoising capabilities at each step and improving model stability across various generation methods. These strategies ensure the consistency of jointly generated multimodal content and improving training efficiency. Experiments show that the algorithm enhances the model’s ability to capture actions and depict backgrounds more effectively. In both the AIST++ and Landscape datasets, it improves video generation speed by approximately 40% and quality by about 39.3%, while also achieving a slight 3% improvement in audio quality over the baseline.</abstract>
<!-- Identifiers and canonical landing page for the paper itself. -->
<identifier type="citekey">qiu-etal-2025-multimodal</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.31</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.31/</url>
</location>
<!-- Position of the paper within the host volume: pages 504 to 513. -->
<part>
<date>2025-04</date>
<extent unit="page">
<start>504</start>
<end>513</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Generation with Consistency Transferring
%A Qiu, Junxiang
%A Lu, Jinda
%A Wang, Shuo
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F qiu-etal-2025-multimodal
%X Multimodal content generation has become an area of considerable interest. However, existing methods are hindered by limitations related to model constraints and training strategies: (1) Most current approaches rely on training models from scratch, resulting in inefficient training processes when extending these models; (2) There is a lack of constraints on adjacent steps within the models, leading to slow sampling and poor generation stability across various sampling methods. To address the issues, we introduce Multimodal Generation with Consistency Transferring (MGCT). The method introduces two key improvements: (1) A Model Consistency Transferring (MCT) strategy to acquire low-cost prior knowledge, increasing training efficiency and avoiding error accumulation; (2) A Layer Consistency Transferring (LCT) between adjacent steps, enhancing denoising capabilities at each step and improving model stability across various generation methods. These strategies ensure the consistency of jointly generated multimodal content and improving training efficiency. Experiments show that the algorithm enhances the model’s ability to capture actions and depict backgrounds more effectively. In both the AIST++ and Landscape datasets, it improves video generation speed by approximately 40% and quality by about 39.3%, while also achieving a slight 3% improvement in audio quality over the baseline.
%R 10.18653/v1/2025.findings-naacl.31
%U https://aclanthology.org/2025.findings-naacl.31/
%U https://doi.org/10.18653/v1/2025.findings-naacl.31
%P 504-513
Markdown (Informal)
[Multimodal Generation with Consistency Transferring](https://aclanthology.org/2025.findings-naacl.31/) (Qiu et al., Findings 2025)
ACL