% Conference paper in the Findings of ACL 2026 volume (pages 38356 to 38371).
% Case-sensitive words in title and booktitle are protected with whole-word
% braces so that sentence-casing styles keep their capitalisation.
@inproceedings{oguz-etal-2026-dualfact,
    title = "{DualFact+}: A Multimodal Fact Verification Framework for Procedural Video Captioning",
    author = "Oguz, Cennet and
      Hamidullah, Yasser and
      Van Genabith, Josef and
      Ostermann, Simon",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.findings-acl.1912/",
    pages = "38356--38371",
    isbn = "979-8-89176-395-1",
    abstract = "Evaluating factual correctness in procedural video captions is challenging because captions must reflect both the abstract procedural roles (e.g., actions, ingredients, tools, locations) and their visual execution. Existing evaluation metrics, which rely on lexical overlap or holistic semantic similarity, often miss role-specific omissions and misclassify visually present but task-irrelevant content as hallucinations. We introduce DualFact+, a role-aware, fact-level evaluation framework that distinguishes conceptual facts, encoding ontology-based role typing of procedural steps (Action, Object or Ingredient, Tool, Location), from contextual facts, encoding video-grounded predicate{--}argument relations that specify how these roles are instantiated during execution. To enable complete and role-consistent evaluation, DualFact+ incorporates visually grounded implicit arguments and contrastive fact sets, and operates in two complementary modes: DualFact-C for text-based verification and DualFact-V for video-grounded verification. Experiments on YouCook3-Fact and CraftBench-Fact show that state-of-the-art captioning models produce fluent but often incomplete descriptions with systematic role-level errors. DualFact+ achieves stronger correlation with human factuality judgments than standard lexical and embedding-based metrics, highlighting the importance of role-aware evaluation for procedural video understanding."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS record for a single paper; the ID matches the citekey identifier further down. -->
<mods ID="oguz-etal-2026-dualfact">
<titleInfo>
<title>DualFact+: A Multimodal Fact Verification Framework for Procedural Video Captioning</title>
</titleInfo>
<!-- Paper authors, one name element each, all with the role term "author". -->
<name type="personal">
<namePart type="given">Cennet</namePart>
<namePart type="family">Oguz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yasser</namePart>
<namePart type="family">Hamidullah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josef</namePart>
<namePart type="family">Van Genabith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Ostermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year and month. -->
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing the paper, with its editors, publisher, place and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<!-- Second given-name part holds the middle initial, stored here without a trailing period. -->
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Evaluating factual correctness in procedural video captions is challenging because captions must reflect both the abstract procedural roles (e.g., actions, ingredients, tools, locations) and their visual execution. Existing evaluation metrics, which rely on lexical overlap or holistic semantic similarity, often miss role-specific omissions and misclassify visually present but task-irrelevant content as hallucinations. We introduce DualFact+, a role-aware, fact-level evaluation framework that distinguishes conceptual facts, encoding ontology-based role typing of procedural steps (Action, Object or Ingredient, Tool, Location), from contextual facts, encoding video-grounded predicate–argument relations that specify how these roles are instantiated during execution. To enable complete and role-consistent evaluation, DualFact+ incorporates visually grounded implicit arguments and contrastive fact sets, and operates in two complementary modes: DualFact-C for text-based verification and DualFact-V for video-grounded verification. Experiments on YouCook3-Fact and CraftBench-Fact show that state-of-the-art captioning models produce fluent but often incomplete descriptions with systematic role-level errors. DualFact+ achieves stronger correlation with human factuality judgments than standard lexical and embedding-based metrics, highlighting the importance of role-aware evaluation for procedural video understanding.</abstract>
<!-- Citation key; same value as the ID attribute on the mods element. -->
<identifier type="citekey">oguz-etal-2026-dualfact</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1912/</url>
</location>
<!-- Position of the paper within the host volume: date plus first and last page. -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>38356</start>
<end>38371</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DualFact+: A Multimodal Fact Verification Framework for Procedural Video Captioning
%A Oguz, Cennet
%A Hamidullah, Yasser
%A Van Genabith, Josef
%A Ostermann, Simon
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F oguz-etal-2026-dualfact
%X Evaluating factual correctness in procedural video captions is challenging because captions must reflect both the abstract procedural roles (e.g., actions, ingredients, tools, locations) and their visual execution. Existing evaluation metrics, which rely on lexical overlap or holistic semantic similarity, often miss role-specific omissions and misclassify visually present but task-irrelevant content as hallucinations. We introduce DualFact+, a role-aware, fact-level evaluation framework that distinguishes conceptual facts, encoding ontology-based role typing of procedural steps (Action, Object or Ingredient, Tool, Location), from contextual facts, encoding video-grounded predicate–argument relations that specify how these roles are instantiated during execution. To enable complete and role-consistent evaluation, DualFact+ incorporates visually grounded implicit arguments and contrastive fact sets, and operates in two complementary modes: DualFact-C for text-based verification and DualFact-V for video-grounded verification. Experiments on YouCook3-Fact and CraftBench-Fact show that state-of-the-art captioning models produce fluent but often incomplete descriptions with systematic role-level errors. DualFact+ achieves stronger correlation with human factuality judgments than standard lexical and embedding-based metrics, highlighting the importance of role-aware evaluation for procedural video understanding.
%U https://aclanthology.org/2026.findings-acl.1912/
%P 38356-38371
Markdown (Informal)
[DualFact+: A Multimodal Fact Verification Framework for Procedural Video Captioning](https://aclanthology.org/2026.findings-acl.1912/) (Oguz et al., Findings 2026)
ACL