@inproceedings{khodadad-etal-2026-evaluating,
title = "Evaluating Multi-Hop Reasoning in Large Language Models: A Chemistry-Centric Benchmark",
author = "Khodadad, Mohammad and
Kasmaee, Ali Shiraee and
Astaraki, Mahdi and
Sherck, Nicholas and
Mahyar, Hamidreza and
Samiee, Soheila",
editor = "Demberg, Vera and
Inui, Kentaro and
M{\`a}rquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.322/",
pages = "6117--6143",
isbn = "979-8-89176-386-9",
abstract = "We introduce ChemComp, the first chemistry-focused benchmark for evaluating compositional multi-hop reasoning in large language models (LLMs). Our automated pipeline constructs benchmarks from proprietary or public data by integrating generative reasoning models, chemical named-entity recognition, and external knowledge bases to build knowledge graphs. Applied to recent chemistry literature, this approach minimizes overlap with LLM pretraining data. The resulting dataset comprises 1,188 multi-hop questions, refined through domain-expert feedback and robust evaluation protocols. Using ChemComp, we systematically compare LLM performance with and without retrieval augmentation, including an idealized gold-context scenario. Our results show that even state-of-the-art models struggle with compositional reasoning: retrieval significantly improves accuracy, yet reasoning errors persist even under perfect retrieval. These findings highlight the limitations of current LLMs and the critical role of retrieval-augmented methods in scientific reasoning. Furthermore, our pipeline is generalizable with fine-tuning, enabling the creation of challenging multi-hop reasoning benchmarks across domains and proprietary datasets."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khodadad-etal-2026-evaluating">
<titleInfo>
<title>Evaluating Multi-Hop Reasoning in Large Language Models: A Chemistry-Centric Benchmark</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="family">Khodadad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="given">Shiraee</namePart>
<namePart type="family">Kasmaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mahdi</namePart>
<namePart type="family">Astaraki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Sherck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamidreza</namePart>
<namePart type="family">Mahyar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soheila</namePart>
<namePart type="family">Samiee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Màrquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>We introduce ChemComp, the first chemistry-focused benchmark for evaluating compositional multi-hop reasoning in large language models (LLMs). Our automated pipeline constructs benchmarks from proprietary or public data by integrating generative reasoning models, chemical named-entity recognition, and external knowledge bases to build knowledge graphs. Applied to recent chemistry literature, this approach minimizes overlap with LLM pretraining data. The resulting dataset comprises 1,188 multi-hop questions, refined through domain-expert feedback and robust evaluation protocols. Using ChemComp, we systematically compare LLM performance with and without retrieval augmentation, including an idealized gold-context scenario. Our results show that even state-of-the-art models struggle with compositional reasoning: retrieval significantly improves accuracy, yet reasoning errors persist even under perfect retrieval. These findings highlight the limitations of current LLMs and the critical role of retrieval-augmented methods in scientific reasoning. Furthermore, our pipeline is generalizable with fine-tuning, enabling the creation of challenging multi-hop reasoning benchmarks across domains and proprietary datasets.</abstract>
<identifier type="citekey">khodadad-etal-2026-evaluating</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.322/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>6117</start>
<end>6143</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Multi-Hop Reasoning in Large Language Models: A Chemistry-Centric Benchmark
%A Khodadad, Mohammad
%A Kasmaee, Ali Shiraee
%A Astaraki, Mahdi
%A Sherck, Nicholas
%A Mahyar, Hamidreza
%A Samiee, Soheila
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Màrquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F khodadad-etal-2026-evaluating
%X We introduce ChemComp, the first chemistry-focused benchmark for evaluating compositional multi-hop reasoning in large language models (LLMs). Our automated pipeline constructs benchmarks from proprietary or public data by integrating generative reasoning models, chemical named-entity recognition, and external knowledge bases to build knowledge graphs. Applied to recent chemistry literature, this approach minimizes overlap with LLM pretraining data. The resulting dataset comprises 1,188 multi-hop questions, refined through domain-expert feedback and robust evaluation protocols. Using ChemComp, we systematically compare LLM performance with and without retrieval augmentation, including an idealized gold-context scenario. Our results show that even state-of-the-art models struggle with compositional reasoning: retrieval significantly improves accuracy, yet reasoning errors persist even under perfect retrieval. These findings highlight the limitations of current LLMs and the critical role of retrieval-augmented methods in scientific reasoning. Furthermore, our pipeline is generalizable with fine-tuning, enabling the creation of challenging multi-hop reasoning benchmarks across domains and proprietary datasets.
%U https://aclanthology.org/2026.findings-eacl.322/
%P 6117-6143
Markdown (Informal)
[Evaluating Multi-Hop Reasoning in Large Language Models: A Chemistry-Centric Benchmark](https://aclanthology.org/2026.findings-eacl.322/) (Khodadad et al., Findings 2026)
ACL