@inproceedings{dalal-etal-2023-calm,
title = "{CALM}-Bench: A Multi-task Benchmark for Evaluating Causality-Aware Language Models",
author = "Dalal, Dhairya and
Buitelaar, Paul and
Arcan, Mihael",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-eacl.23",
doi = "10.18653/v1/2023.findings-eacl.23",
pages = "296--311",
abstract = "Causal reasoning is a critical component of human cognition and is required across a range of question-answering (QA) tasks (such as abductive reasoning, commonsense QA, and procedural reasoning). Research on causal QA has been underdefined, task-specific, and limited in complexity. Recent advances in foundation language models (such as BERT, ERNIE, and T5) have shown the efficacy of pre-trained models across diverse QA tasks. However, there is limited research exploring the causal reasoning capabilities of those language models and no standard evaluation benchmark. To unify causal QA research, we propose CALM-Bench, a multi-task benchmark for evaluating causality-aware language models (CALM). We present a standardized definition of causal QA tasks and show empirically that causal reasoning can be generalized and transferred across different QA tasks. Additionally, we share a strong multi-task baseline model which outperforms single-task fine-tuned models on the CALM-Bench tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="dalal-etal-2023-calm">
    <titleInfo>
      <title>CALM-Bench: A Multi-task Benchmark for Evaluating Causality-Aware Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Dhairya</namePart>
      <namePart type="family">Dalal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Paul</namePart>
      <namePart type="family">Buitelaar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mihael</namePart>
      <namePart type="family">Arcan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Causal reasoning is a critical component of human cognition and is required across a range of question-answering (QA) tasks (such as abductive reasoning, commonsense QA, and procedural reasoning). Research on causal QA has been underdefined, task-specific, and limited in complexity. Recent advances in foundation language models (such as BERT, ERNIE, and T5) have shown the efficacy of pre-trained models across diverse QA tasks. However, there is limited research exploring the causal reasoning capabilities of those language models and no standard evaluation benchmark. To unify causal QA research, we propose CALM-Bench, a multi-task benchmark for evaluating causality-aware language models (CALM). We present a standardized definition of causal QA tasks and show empirically that causal reasoning can be generalized and transferred across different QA tasks. Additionally, we share a strong multi-task baseline model which outperforms single-task fine-tuned models on the CALM-Bench tasks.</abstract>
    <identifier type="citekey">dalal-etal-2023-calm</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-eacl.23</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-eacl.23</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>296</start>
        <end>311</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T CALM-Bench: A Multi-task Benchmark for Evaluating Causality-Aware Language Models
%A Dalal, Dhairya
%A Buitelaar, Paul
%A Arcan, Mihael
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F dalal-etal-2023-calm
%X Causal reasoning is a critical component of human cognition and is required across a range of question-answering (QA) tasks (such as abductive reasoning, commonsense QA, and procedural reasoning). Research on causal QA has been underdefined, task-specific, and limited in complexity. Recent advances in foundation language models (such as BERT, ERNIE, and T5) have shown the efficacy of pre-trained models across diverse QA tasks. However, there is limited research exploring the causal reasoning capabilities of those language models and no standard evaluation benchmark. To unify causal QA research, we propose CALM-Bench, a multi-task benchmark for evaluating causality-aware language models (CALM). We present a standardized definition of causal QA tasks and show empirically that causal reasoning can be generalized and transferred across different QA tasks. Additionally, we share a strong multi-task baseline model which outperforms single-task fine-tuned models on the CALM-Bench tasks.
%R 10.18653/v1/2023.findings-eacl.23
%U https://aclanthology.org/2023.findings-eacl.23
%U https://doi.org/10.18653/v1/2023.findings-eacl.23
%P 296-311
Markdown (Informal)
[CALM-Bench: A Multi-task Benchmark for Evaluating Causality-Aware Language Models](https://aclanthology.org/2023.findings-eacl.23) (Dalal et al., Findings 2023)

ACL
Dhairya Dalal, Paul Buitelaar, and Mihael Arcan. 2023. CALM-Bench: A Multi-task Benchmark for Evaluating Causality-Aware Language Models. In Findings of the Association for Computational Linguistics: EACL 2023, pages 296–311, Dubrovnik, Croatia. Association for Computational Linguistics.