@inproceedings{zhao-etal-2024-mrt,
title = "{MRT}: Multi-modal Short- and Long-range Temporal Convolutional Network for Time-sync Comment Video Behavior Prediction",
author = "Zhao, Weihao and
He, Weidong and
Wang, Hao and
Bi, Haoyang and
Wu, Han and
Zhu, Chen and
Xu, Tong and
Chen, Enhong",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1020",
pages = "11680--11691",
abstract = "As a fresh way to improve the user viewing experience, videos of time-sync comments have attracted a lot of interest. Many efforts have been made to explore the effectiveness of time-sync comments for various applications. However, due to the complexity of interactions among users, videos, and comments, it still remains challenging to understand users{'} behavior on time-sync comments. Along this line, we study the problem of time-sync comment behavior prediction with considerations of both historical behaviors and multi-modal information of visual frames and textual comments. Specifically, we propose a novel Multi-modal short- and long-Range Temporal Convolutional Network model, namely MRT. Firstly, we design two amplified Temporal Convolutional Networks with different sizes of receptive fields, to capture both short- and long-range surrounding contexts for each frame and time-sync comments. Then, we design a bottle-neck fusion module to obtain the multi-modal enhanced representation. Furthermore, we take the user preferences into consideration to generate the personalized multi-model semantic representation at each timestamp. Finally, we utilize the binary cross-entropy loss to optimize MRT on the basis of users{'} historical records. Through comparing with representative baselines, we demonstrate the effectiveness of MRT and qualitatively verify the necessity and utility of short- and long-range contextual and multi-modal information through extensive experiments.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2024-mrt">
<titleInfo>
<title>MRT: Multi-modal Short- and Long-range Temporal Convolutional Network for Time-sync Comment Video Behavior Prediction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Weihao</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weidong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoyang</namePart>
<namePart type="family">Bi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Han</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enhong</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As a fresh way to improve the user viewing experience, videos of time-sync comments have attracted a lot of interest. Many efforts have been made to explore the effectiveness of time-sync comments for various applications. However, due to the complexity of interactions among users, videos, and comments, it still remains challenging to understand users’ behavior on time-sync comments. Along this line, we study the problem of time-sync comment behavior prediction with considerations of both historical behaviors and multi-modal information of visual frames and textual comments. Specifically, we propose a novel Multi-modal short- and long-Range Temporal Convolutional Network model, namely MRT. Firstly, we design two amplified Temporal Convolutional Networks with different sizes of receptive fields, to capture both short- and long-range surrounding contexts for each frame and time-sync comments. Then, we design a bottle-neck fusion module to obtain the multi-modal enhanced representation. Furthermore, we take the user preferences into consideration to generate the personalized multi-model semantic representation at each timestamp. Finally, we utilize the binary cross-entropy loss to optimize MRT on the basis of users’ historical records. Through comparing with representative baselines, we demonstrate the effectiveness of MRT and qualitatively verify the necessity and utility of short- and long-range contextual and multi-modal information through extensive experiments.</abstract>
<identifier type="citekey">zhao-etal-2024-mrt</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1020</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>11680</start>
<end>11691</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MRT: Multi-modal Short- and Long-range Temporal Convolutional Network for Time-sync Comment Video Behavior Prediction
%A Zhao, Weihao
%A He, Weidong
%A Wang, Hao
%A Bi, Haoyang
%A Wu, Han
%A Zhu, Chen
%A Xu, Tong
%A Chen, Enhong
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F zhao-etal-2024-mrt
%X As a fresh way to improve the user viewing experience, videos of time-sync comments have attracted a lot of interest. Many efforts have been made to explore the effectiveness of time-sync comments for various applications. However, due to the complexity of interactions among users, videos, and comments, it still remains challenging to understand users’ behavior on time-sync comments. Along this line, we study the problem of time-sync comment behavior prediction with considerations of both historical behaviors and multi-modal information of visual frames and textual comments. Specifically, we propose a novel Multi-modal short- and long-Range Temporal Convolutional Network model, namely MRT. Firstly, we design two amplified Temporal Convolutional Networks with different sizes of receptive fields, to capture both short- and long-range surrounding contexts for each frame and time-sync comments. Then, we design a bottle-neck fusion module to obtain the multi-modal enhanced representation. Furthermore, we take the user preferences into consideration to generate the personalized multi-model semantic representation at each timestamp. Finally, we utilize the binary cross-entropy loss to optimize MRT on the basis of users’ historical records. Through comparing with representative baselines, we demonstrate the effectiveness of MRT and qualitatively verify the necessity and utility of short- and long-range contextual and multi-modal information through extensive experiments.
%U https://aclanthology.org/2024.lrec-main.1020
%P 11680-11691
Markdown (Informal)
[MRT: Multi-modal Short- and Long-range Temporal Convolutional Network for Time-sync Comment Video Behavior Prediction](https://aclanthology.org/2024.lrec-main.1020) (Zhao et al., LREC-COLING 2024)
ACL
- Weihao Zhao, Weidong He, Hao Wang, Haoyang Bi, Han Wu, Chen Zhu, Tong Xu, and Enhong Chen. 2024. MRT: Multi-modal Short- and Long-range Temporal Convolutional Network for Time-sync Comment Video Behavior Prediction. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 11680–11691, Torino, Italia. ELRA and ICCL.