@inproceedings{zhao-etal-2025-exploring,
title = "Exploring Fine-Grained Human Motion Video Captioning",
author = "Zhao, Bingchan and
Liu, Xinyi and
Yu, Zhuocheng and
Yang, Tongchen and
Song, Yifan and
Jin, Mingyu and
Li, Sujian and
Wang, Yizhou",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.351/",
pages = "5247--5264",
abstract = "Detailed descriptions of human motion are crucial for effective fitness training, which highlights the importance of research in fine-grained human motion video captioning. Existing video captioning models often fail to capture the nuanced semantics of videos, resulting in the generated descriptions that are coarse and lack details, especially when depicting human motions. To benchmark the Body Fitness Training scenario, in this paper, we construct a fine-grained human motion video captioning dataset named BoFiT and design a state-of-the-art baseline model named BoFiT-Gen (Body Fitness Training Text Generation). BoFiT-Gen makes use of computer vision techniques to extract angular representations of human motions from videos and LLMs to generate fine-grained descriptions of human motions via prompting. Results show that BoFiT-Gen outperforms previous methods on comprehensive metrics. We aim for this dataset to serve as a useful evaluation set for visio-linguistic models and drive further progress in this field. Our dataset is released at https://github.com/colmon46/bofit."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2025-exploring">
<titleInfo>
<title>Exploring Fine-Grained Human Motion Video Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bingchan</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyi</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuocheng</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tongchen</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifan</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingyu</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yizhou</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Detailed descriptions of human motion are crucial for effective fitness training, which highlights the importance of research in fine-grained human motion video captioning. Existing video captioning models often fail to capture the nuanced semantics of videos, resulting in the generated descriptions that are coarse and lack details, especially when depicting human motions. To benchmark the Body Fitness Training scenario, in this paper, we construct a fine-grained human motion video captioning dataset named BoFiT and design a state-of-the-art baseline model named BoFiT-Gen (Body Fitness Training Text Generation). BoFiT-Gen makes use of computer vision techniques to extract angular representations of human motions from videos and LLMs to generate fine-grained descriptions of human motions via prompting. Results show that BoFiT-Gen outperforms previous methods on comprehensive metrics. We aim for this dataset to serve as a useful evaluation set for visio-linguistic models and drive further progress in this field. Our dataset is released at https://github.com/colmon46/bofit.</abstract>
<identifier type="citekey">zhao-etal-2025-exploring</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.351/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>5247</start>
<end>5264</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring Fine-Grained Human Motion Video Captioning
%A Zhao, Bingchan
%A Liu, Xinyi
%A Yu, Zhuocheng
%A Yang, Tongchen
%A Song, Yifan
%A Jin, Mingyu
%A Li, Sujian
%A Wang, Yizhou
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhao-etal-2025-exploring
%X Detailed descriptions of human motion are crucial for effective fitness training, which highlights the importance of research in fine-grained human motion video captioning. Existing video captioning models often fail to capture the nuanced semantics of videos, resulting in the generated descriptions that are coarse and lack details, especially when depicting human motions. To benchmark the Body Fitness Training scenario, in this paper, we construct a fine-grained human motion video captioning dataset named BoFiT and design a state-of-the-art baseline model named BoFiT-Gen (Body Fitness Training Text Generation). BoFiT-Gen makes use of computer vision techniques to extract angular representations of human motions from videos and LLMs to generate fine-grained descriptions of human motions via prompting. Results show that BoFiT-Gen outperforms previous methods on comprehensive metrics. We aim for this dataset to serve as a useful evaluation set for visio-linguistic models and drive further progress in this field. Our dataset is released at https://github.com/colmon46/bofit.
%U https://aclanthology.org/2025.coling-main.351/
%P 5247-5264
Markdown (Informal)
[Exploring Fine-Grained Human Motion Video Captioning](https://aclanthology.org/2025.coling-main.351/) (Zhao et al., COLING 2025)
ACL
- Bingchan Zhao, Xinyi Liu, Zhuocheng Yu, Tongchen Yang, Yifan Song, Mingyu Jin, Sujian Li, and Yizhou Wang. 2025. Exploring Fine-Grained Human Motion Video Captioning. In Proceedings of the 31st International Conference on Computational Linguistics, pages 5247–5264, Abu Dhabi, UAE. Association for Computational Linguistics.