@inproceedings{malul-etal-2025-lchaim,
title = "{LCHAIM} - Investigating Long Context Reasoning in {H}ebrew",
author = "Malul, Ehud and
Perets, Oriel and
Mor, Ziv and
Kassel, Yigal and
Sulem, Elior",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.413/",
doi = "10.18653/v1/2025.findings-acl.413",
pages = "7928--7939",
ISBN = "979-8-89176-256-5",
abstract = "Natural Language Inference (NLI) has gained significant attention recently due to its importance in understanding how machines comprehend and reason about language. While English has received tremendous interest, Morphologically Rich Languages (MRLs) like Hebrew, require more research. In this paper, we address the evaluation of Hebrew NLI models by introducing LCHAIM, a dataset designed to evaluate these models on tasks involving long premises and complex reasoning. The dataset, created by translating and validating the English ConTRoL dataset, consists of 8,325 context-hypothesis pairs that require coreferential, temporal, logical and analytical reasoning. Our experiments show the difficulty of contextual reasoning in Hebrew, as evidenced by the performance of different models. Fine-tuning the LongHero model on both the shorter premise Hebrew NLI and the LCHAIM datasets yielded a mean accuracy of 52{\%}, that is 35{\%} less than human performance. Similarly, Large language Models (LLMs) like Gemma-9B, Dicta-LM-2.0-7B, and GPT-4o achieved a top mean accuracy of 60.12{\%} in few-shot setting."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="malul-etal-2025-lchaim">
<titleInfo>
<title>LCHAIM - Investigating Long Context Reasoning in Hebrew</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Malul</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oriel</namePart>
<namePart type="family">Perets</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziv</namePart>
<namePart type="family">Mor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yigal</namePart>
<namePart type="family">Kassel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elior</namePart>
<namePart type="family">Sulem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Natural Language Inference (NLI) has gained significant attention recently due to its importance in understanding how machines comprehend and reason about language. While English has received tremendous interest, Morphologically Rich Languages (MRLs) like Hebrew, require more research. In this paper, we address the evaluation of Hebrew NLI models by introducing LCHAIM, a dataset designed to evaluate these models on tasks involving long premises and complex reasoning. The dataset, created by translating and validating the English ConTRoL dataset, consists of 8,325 context-hypothesis pairs that require coreferential, temporal, logical and analytical reasoning. Our experiments show the difficulty of contextual reasoning in Hebrew, as evidenced by the performance of different models. Fine-tuning the LongHero model on both the shorter premise Hebrew NLI and the LCHAIM datasets yielded a mean accuracy of 52%, that is 35% less than human performance. Similarly, Large language Models (LLMs) like Gemma-9B, Dicta-LM-2.0-7B, and GPT-4o achieved a top mean accuracy of 60.12% in few-shot setting.</abstract>
<identifier type="citekey">malul-etal-2025-lchaim</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.413</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.413/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>7928</start>
<end>7939</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LCHAIM - Investigating Long Context Reasoning in Hebrew
%A Malul, Ehud
%A Perets, Oriel
%A Mor, Ziv
%A Kassel, Yigal
%A Sulem, Elior
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F malul-etal-2025-lchaim
%X Natural Language Inference (NLI) has gained significant attention recently due to its importance in understanding how machines comprehend and reason about language. While English has received tremendous interest, Morphologically Rich Languages (MRLs) like Hebrew, require more research. In this paper, we address the evaluation of Hebrew NLI models by introducing LCHAIM, a dataset designed to evaluate these models on tasks involving long premises and complex reasoning. The dataset, created by translating and validating the English ConTRoL dataset, consists of 8,325 context-hypothesis pairs that require coreferential, temporal, logical and analytical reasoning. Our experiments show the difficulty of contextual reasoning in Hebrew, as evidenced by the performance of different models. Fine-tuning the LongHero model on both the shorter premise Hebrew NLI and the LCHAIM datasets yielded a mean accuracy of 52%, that is 35% less than human performance. Similarly, Large language Models (LLMs) like Gemma-9B, Dicta-LM-2.0-7B, and GPT-4o achieved a top mean accuracy of 60.12% in few-shot setting.
%R 10.18653/v1/2025.findings-acl.413
%U https://aclanthology.org/2025.findings-acl.413/
%U https://doi.org/10.18653/v1/2025.findings-acl.413
%P 7928-7939
Markdown (Informal)
[LCHAIM - Investigating Long Context Reasoning in Hebrew](https://aclanthology.org/2025.findings-acl.413/) (Malul et al., Findings 2025)
ACL
- Ehud Malul, Oriel Perets, Ziv Mor, Yigal Kassel, and Elior Sulem. 2025. LCHAIM - Investigating Long Context Reasoning in Hebrew. In Findings of the Association for Computational Linguistics: ACL 2025, pages 7928–7939, Vienna, Austria. Association for Computational Linguistics.