% Wang et al., "VideoLLM Knows When to Speak", Findings of ACL: EMNLP 2025.
% The url field points at the ACL Anthology record 2025.findings-emnlp.336.
% Only the model name is brace-protected in the title, so styles stay free
% to apply their own casing to the remaining words.
@inproceedings{wang-etal-2025-videollm,
  author    = {Wang, Yueqian and Meng, Xiaojun and Wang, Yuxuan and Liang, Jianxin and Wei, Jiansheng and Zhang, Huishuai and Zhao, Dongyan},
  title     = {{VideoLLM} Knows When to Speak: Enhancing Time-Sensitive Video Comprehension with Video-Text Duet Interaction Format},
  editor    = {Christodoulopoulos, Christos and Chakraborty, Tanmoy and Rose, Carolyn and Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  pages     = {6338--6359},
  isbn      = {979-8-89176-335-7},
  url       = {https://aclanthology.org/2025.findings-emnlp.336/},
}