@inproceedings{wang-etal-2025-reasoning-enhanced,
title = "Reasoning-Enhanced Domain-Adaptive Pretraining of Multimodal Large Language Models for Short Video Content Governance",
author = "Wang, Zixuan and
Sun, Yu and
Wang, Hongwei and
Jing, Baoyu and
Shen, Xiang and
Dong, Xin and
Hao, Zhuolin and
Xiong, Hongyu and
Song, Yang",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.77/",
doi = "10.18653/v1/2025.emnlp-industry.77",
pages = "1104--1112",
ISBN = "979-8-89176-333-3",
abstract = "Short video platforms are evolving rapidly, making the identification of inappropriate content increasingly critical.Existing approaches typically train separate and small classification models for each type of issue, which requires extensive human-labeled data and lacks cross-issue generalization.We propose a reasoning-enhanced multimodal large language model (MLLM) pretraining paradigm for unified inappropriate content detection. To address the distribution gap between short video content and the original pretraining data of MLLMs, as well as the complex issue definitions, we introduce three targeted pretraining tasks:(1) \textit{Caption}, to enhance the MLLM{'}s perception of video details;(2) \textit{Visual Question Answering (VQA)}, to deepen the MLLM{'}s understanding of issue definitions and annotation guidelines;(3) \textit{Chain-of-Thought (CoT)}, to enhance the MLLM{'}s reasoning capability.Experimental results show that our pretraining approach significantly improves the MLLM{'}s performance in both zero-shot and supervised fine-tuning (SFT) settings.In addition, our pretrained model demonstrates strong generalization capabilities to emergent, previously unseen issues."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2025-reasoning-enhanced">
<titleInfo>
<title>Reasoning-Enhanced Domain-Adaptive Pretraining of Multimodal Large Language Models for Short Video Content Governance</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zixuan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongwei</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baoyu</namePart>
<namePart type="family">Jing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuolin</namePart>
<namePart type="family">Hao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongyu</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saloni</namePart>
<namePart type="family">Potdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Rojas-Barahona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastien</namePart>
<namePart type="family">Montella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou (China)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-333-3</identifier>
</relatedItem>
<abstract>Short video platforms are evolving rapidly, making the identification of inappropriate content increasingly critical.Existing approaches typically train separate and small classification models for each type of issue, which requires extensive human-labeled data and lacks cross-issue generalization.We propose a reasoning-enhanced multimodal large language model (MLLM) pretraining paradigm for unified inappropriate content detection. To address the distribution gap between short video content and the original pretraining data of MLLMs, as well as the complex issue definitions, we introduce three targeted pretraining tasks:(1) Caption, to enhance the MLLM’s perception of video details;(2) Visual Question Answering (VQA), to deepen the MLLM’s understanding of issue definitions and annotation guidelines;(3) Chain-of-Thought (CoT), to enhance the MLLM’s reasoning capability.Experimental results show that our pretraining approach significantly improves the MLLM’s performance in both zero-shot and supervised fine-tuning (SFT) settings.In addition, our pretrained model demonstrates strong generalization capabilities to emergent, previously unseen issues.</abstract>
<identifier type="citekey">wang-etal-2025-reasoning-enhanced</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-industry.77</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-industry.77/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1104</start>
<end>1112</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Reasoning-Enhanced Domain-Adaptive Pretraining of Multimodal Large Language Models for Short Video Content Governance
%A Wang, Zixuan
%A Sun, Yu
%A Wang, Hongwei
%A Jing, Baoyu
%A Shen, Xiang
%A Dong, Xin
%A Hao, Zhuolin
%A Xiong, Hongyu
%A Song, Yang
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F wang-etal-2025-reasoning-enhanced
%X Short video platforms are evolving rapidly, making the identification of inappropriate content increasingly critical.Existing approaches typically train separate and small classification models for each type of issue, which requires extensive human-labeled data and lacks cross-issue generalization.We propose a reasoning-enhanced multimodal large language model (MLLM) pretraining paradigm for unified inappropriate content detection. To address the distribution gap between short video content and the original pretraining data of MLLMs, as well as the complex issue definitions, we introduce three targeted pretraining tasks:(1) Caption, to enhance the MLLM’s perception of video details;(2) Visual Question Answering (VQA), to deepen the MLLM’s understanding of issue definitions and annotation guidelines;(3) Chain-of-Thought (CoT), to enhance the MLLM’s reasoning capability.Experimental results show that our pretraining approach significantly improves the MLLM’s performance in both zero-shot and supervised fine-tuning (SFT) settings.In addition, our pretrained model demonstrates strong generalization capabilities to emergent, previously unseen issues.
%R 10.18653/v1/2025.emnlp-industry.77
%U https://aclanthology.org/2025.emnlp-industry.77/
%U https://doi.org/10.18653/v1/2025.emnlp-industry.77
%P 1104-1112
Markdown (Informal)
[Reasoning-Enhanced Domain-Adaptive Pretraining of Multimodal Large Language Models for Short Video Content Governance](https://aclanthology.org/2025.emnlp-industry.77/) (Wang et al., EMNLP 2025)
ACL
- Zixuan Wang, Yu Sun, Hongwei Wang, Baoyu Jing, Xiang Shen, Xin Dong, Zhuolin Hao, Hongyu Xiong, and Yang Song. 2025. Reasoning-Enhanced Domain-Adaptive Pretraining of Multimodal Large Language Models for Short Video Content Governance. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1104–1112, Suzhou (China). Association for Computational Linguistics.