% BibTeX record for citation key feng-etal-2025-tc (conference paper, Findings of ACL 2025).
@inproceedings{feng-etal-2025-tc,
  title     = {{TC-Bench}: Benchmarking Temporal Compositionality in Conditional Video Generation},
  author    = {Feng, Weixi and
               Li, Jiachen and
               Saxon, Michael and
               Fu, Tsu-Jui and
               Chen, Wenhu and
               Wang, William Yang},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.241/},
  doi       = {10.18653/v1/2025.findings-acl.241},
  pages     = {4638--4662},
  isbn      = {979-8-89176-256-5},
  abstract  = {Video generation has many unique challenges beyond those of image generation. The temporal dimension introduces extensive possible variations across frames, over which consistency and continuity may be violated. In this work, we evaluate the emergence of new concepts and relation transitions as time progresses in a video, which we refer to as Temporal Compositionality. We propose TC-Bench, a benchmark of meticulously crafted text prompts, ground truth videos, and new evaluation metrics. The prompts articulate the initial and final states of scenes, effectively reducing ambiguities for frame development. In addition, by collecting corresponding ground-truth videos, the benchmark can be used for text-to-video and image-to-video generation. We develop new metrics to measure the completeness of component transitions, which demonstrate significantly higher correlations with human judgments than existing metrics. Our experiments reveal that contemporary video generators are still weak in prompt understanding and achieve less than 20{\%} of the compositional changes, highlighting enormous improvement space. Our analysis indicates that current video generation models struggle to interpret descriptions of compositional changes and synthesize various components across different time steps.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for citekey feng-etal-2025-tc -->
  <mods ID="feng-etal-2025-tc">
    <titleInfo>
      <title>TC-Bench: Benchmarking Temporal Compositionality in Conditional Video Generation</title>
    </titleInfo>

    <!-- Authors, in listed order -->
    <name type="personal">
      <namePart type="given">Weixi</namePart>
      <namePart type="family">Feng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiachen</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michael</namePart>
      <namePart type="family">Saxon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tsu-Jui</namePart>
      <namePart type="family">Fu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenhu</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">William</namePart>
      <namePart type="given">Yang</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the containing volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>

    <abstract>Video generation has many unique challenges beyond those of image generation. The temporal dimension introduces extensive possible variations across frames, over which consistency and continuity may be violated. In this work, we evaluate the emergence of new concepts and relation transitions as time progresses in a video, which we refer to as Temporal Compositionality. We propose TC-Bench, a benchmark of meticulously crafted text prompts, ground truth videos, and new evaluation metrics. The prompts articulate the initial and final states of scenes, effectively reducing ambiguities for frame development. In addition, by collecting corresponding ground-truth videos, the benchmark can be used for text-to-video and image-to-video generation. We develop new metrics to measure the completeness of component transitions, which demonstrate significantly higher correlations with human judgments than existing metrics. Our experiments reveal that contemporary video generators are still weak in prompt understanding and achieve less than 20% of the compositional changes, highlighting enormous improvement space. Our analysis indicates that current video generation models struggle to interpret descriptions of compositional changes and synthesize various components across different time steps.</abstract>

    <!-- Identifiers and landing page for the paper itself -->
    <identifier type="citekey">feng-etal-2025-tc</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-acl.241</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-acl.241/</url>
    </location>

    <!-- Page range of the paper within the host volume -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>4638</start>
        <end>4662</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T TC-Bench: Benchmarking Temporal Compositionality in Conditional Video Generation
%A Feng, Weixi
%A Li, Jiachen
%A Saxon, Michael
%A Fu, Tsu-Jui
%A Chen, Wenhu
%A Wang, William Yang
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F feng-etal-2025-tc
%X Video generation has many unique challenges beyond those of image generation. The temporal dimension introduces extensive possible variations across frames, over which consistency and continuity may be violated. In this work, we evaluate the emergence of new concepts and relation transitions as time progresses in a video, which we refer to as Temporal Compositionality. We propose TC-Bench, a benchmark of meticulously crafted text prompts, ground truth videos, and new evaluation metrics. The prompts articulate the initial and final states of scenes, effectively reducing ambiguities for frame development. In addition, by collecting corresponding ground-truth videos, the benchmark can be used for text-to-video and image-to-video generation. We develop new metrics to measure the completeness of component transitions, which demonstrate significantly higher correlations with human judgments than existing metrics. Our experiments reveal that contemporary video generators are still weak in prompt understanding and achieve less than 20% of the compositional changes, highlighting enormous improvement space. Our analysis indicates that current video generation models struggle to interpret descriptions of compositional changes and synthesize various components across different time steps.
%R 10.18653/v1/2025.findings-acl.241
%U https://aclanthology.org/2025.findings-acl.241/
%U https://doi.org/10.18653/v1/2025.findings-acl.241
%P 4638-4662
Markdown (Informal)
[TC-Bench: Benchmarking Temporal Compositionality in Conditional Video Generation](https://aclanthology.org/2025.findings-acl.241/) (Feng et al., Findings 2025)
ACL