@inproceedings{magnifico-2025-automated,
  title     = {Automated classification of causal relations. Evaluating different {LLM} performances.},
  author    = {Magnifico, Giacomo},
  editor    = {Velichkov, Boris and
               Nikolova-Koleva, Ivelina and
               Slavcheva, Milena},
  booktitle = {Proceedings of the 9th Student Research Workshop associated with the International Conference Recent Advances in Natural Language Processing},
  month     = sep,
  year      = {2025},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd., Shoumen, Bulgaria},
  url       = {https://aclanthology.org/2025.ranlp-stud.4/},
  pages     = {27--36},
  abstract  = {The search for formal causal relations in natural language faces inherent limitations due to the lack of mathematically and logically informed datasets. Thus, the exploration of causal relations in natural language leads to the analysis of formal-logic-adjacent language patterns. Thanks to the recent advancements of generative LLMs, this research niche is expanding within the field of natural language processing and evaluation. In this work, we conduct an evaluation of 9 models produced by different AI developing companies in order to answer the question ``Are LLMs capable of discerning between different types of causal relations?''. The SciExpl dataset is chosen as a natural language corpus, and we develop three different prompt types aligned with zero-shot, few-shot, and chain-of-thought standards to evaluate the performance of the LLMs. Claude 3.7 Sonnet and Gemini 2.5 Flash Preview emerge as the best models for the task, with the respective highest F1 scores of 0.842 (few-shot prompting) and 0.846 (chain-of-thought prompting).},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="magnifico-2025-automated">
<titleInfo>
<title>Automated classification of causal relations. Evaluating different LLM performances.</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giacomo</namePart>
<namePart type="family">Magnifico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 9th Student Research Workshop associated with the International Conference Recent Advances in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Boris</namePart>
<namePart type="family">Velichkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivelina</namePart>
<namePart type="family">Nikolova-Koleva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Milena</namePart>
<namePart type="family">Slavcheva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The search for formal causal relations in natural language faces inherent limitations due to the lack of mathematically and logically informed datasets. Thus, the exploration of causal relations in natural language leads to the analysis of formal-logic-adjacent language patterns. Thanks to the recent advancements of generative LLMs, this research niche is expanding within the field of natural language processing and evaluation. In this work, we conduct an evaluation of 9 models produced by different AI developing companies in order to answer the question “Are LLMs capable of discerning between different types of causal relations?”. The SciExpl dataset is chosen as a natural language corpus, and we develop three different prompt types aligned with zero-shot, few-shot, and chain-of-thought standards to evaluate the performance of the LLMs. Claude 3.7 Sonnet and Gemini 2.5 Flash Preview emerge as the best models for the task, with the respective highest F1 scores of 0.842 (few-shot prompting) and 0.846 (chain-of-thought prompting).</abstract>
<identifier type="citekey">magnifico-2025-automated</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-stud.4/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>27</start>
<end>36</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automated classification of causal relations. Evaluating different LLM performances.
%A Magnifico, Giacomo
%Y Velichkov, Boris
%Y Nikolova-Koleva, Ivelina
%Y Slavcheva, Milena
%S Proceedings of the 9th Student Research Workshop associated with the International Conference Recent Advances in Natural Language Processing
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F magnifico-2025-automated
%X The search for formal causal relations in natural language faces inherent limitations due to the lack of mathematically and logically informed datasets. Thus, the exploration of causal relations in natural language leads to the analysis of formal-logic-adjacent language patterns. Thanks to the recent advancements of generative LLMs, this research niche is expanding within the field of natural language processing and evaluation. In this work, we conduct an evaluation of 9 models produced by different AI developing companies in order to answer the question “Are LLMs capable of discerning between different types of causal relations?”. The SciExpl dataset is chosen as a natural language corpus, and we develop three different prompt types aligned with zero-shot, few-shot, and chain-of-thought standards to evaluate the performance of the LLMs. Claude 3.7 Sonnet and Gemini 2.5 Flash Preview emerge as the best models for the task, with the respective highest F1 scores of 0.842 (few-shot prompting) and 0.846 (chain-of-thought prompting).
%U https://aclanthology.org/2025.ranlp-stud.4/
%P 27-36
Markdown (Informal)
[Automated classification of causal relations. Evaluating different LLM performances.](https://aclanthology.org/2025.ranlp-stud.4/) (Magnifico, RANLP 2025)
ACL