@inproceedings{babakov-etal-2025-causalgraphbench,
title = "{C}ausal{G}raph{B}ench: a Benchmark for Evaluating Language Models capabilities of Causal Graph discovery",
author = "Babakov, Nikolay and
Reiter, Ehud and
Bugar{\'i}n-Diz, Alberto",
editor = "Zhao, Jin and
Wang, Mingyang and
Liu, Zhu",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-srw.16/",
doi = "10.18653/v1/2025.acl-srw.16",
pages = "240--258",
ISBN = "979-8-89176-254-1",
abstract = "This paper introduces CausalGraphBench, a benchmark designed to evaluate the ability of large language models (LLMs) to construct Causal Graphs (CGs), a critical component of reasoning models like Bayesian Networks. The benchmark comprises 35 CGs sourced from publicly available repositories and academic papers, each enriched with detailed metadata to facilitate systematic and consistent evaluation. We explore various LLM-driven methods for CG discovery, analyzing their performance across different graph sizes and complexity levels. Additionally, we examine the effects of data contamination on the quality of the generated CGs.Our findings reveal that methods relying on approaches with a limited number of queries to LLM, particularly those leveraging the full graph context, consistently outperform query-intensive and exhaustive approaches, which tend to overemphasize local relationships. Across all methods, performance declines as graph size increases."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="babakov-etal-2025-causalgraphbench">
<titleInfo>
<title>CausalGraphBench: a Benchmark for Evaluating Language Models capabilities of Causal Graph discovery</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolay</namePart>
<namePart type="family">Babakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Bugarín-Diz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jin</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingyang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-254-1</identifier>
</relatedItem>
<abstract>This paper introduces CausalGraphBench, a benchmark designed to evaluate the ability of large language models (LLMs) to construct Causal Graphs (CGs), a critical component of reasoning models like Bayesian Networks. The benchmark comprises 35 CGs sourced from publicly available repositories and academic papers, each enriched with detailed metadata to facilitate systematic and consistent evaluation. We explore various LLM-driven methods for CG discovery, analyzing their performance across different graph sizes and complexity levels. Additionally, we examine the effects of data contamination on the quality of the generated CGs. Our findings reveal that methods relying on a limited number of queries to the LLM, particularly those leveraging the full graph context, consistently outperform query-intensive and exhaustive approaches, which tend to overemphasize local relationships. Across all methods, performance declines as graph size increases.</abstract>
<identifier type="citekey">babakov-etal-2025-causalgraphbench</identifier>
<identifier type="doi">10.18653/v1/2025.acl-srw.16</identifier>
<location>
<url>https://aclanthology.org/2025.acl-srw.16/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>240</start>
<end>258</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CausalGraphBench: a Benchmark for Evaluating Language Models capabilities of Causal Graph discovery
%A Babakov, Nikolay
%A Reiter, Ehud
%A Bugarín-Diz, Alberto
%Y Zhao, Jin
%Y Wang, Mingyang
%Y Liu, Zhu
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-254-1
%F babakov-etal-2025-causalgraphbench
%X This paper introduces CausalGraphBench, a benchmark designed to evaluate the ability of large language models (LLMs) to construct Causal Graphs (CGs), a critical component of reasoning models like Bayesian Networks. The benchmark comprises 35 CGs sourced from publicly available repositories and academic papers, each enriched with detailed metadata to facilitate systematic and consistent evaluation. We explore various LLM-driven methods for CG discovery, analyzing their performance across different graph sizes and complexity levels. Additionally, we examine the effects of data contamination on the quality of the generated CGs. Our findings reveal that methods relying on a limited number of queries to the LLM, particularly those leveraging the full graph context, consistently outperform query-intensive and exhaustive approaches, which tend to overemphasize local relationships. Across all methods, performance declines as graph size increases.
%R 10.18653/v1/2025.acl-srw.16
%U https://aclanthology.org/2025.acl-srw.16/
%U https://doi.org/10.18653/v1/2025.acl-srw.16
%P 240-258
Markdown (Informal)
[CausalGraphBench: a Benchmark for Evaluating Language Models capabilities of Causal Graph discovery](https://aclanthology.org/2025.acl-srw.16/) (Babakov et al., ACL 2025)