@inproceedings{ho-etal-2026-identifying,
title = "Identifying Where Large Language Models Struggle in Answering Complex Questions",
author = "Ho, Xanh and
Boudin, Florian and
Sugawara, Saku and
Duong, Khoa and
Aizawa, Akiko",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.11/",
pages = "112--123",
ISBN = "979-8-89176-423-1",
abstract = "We design experiments to identify where Large Language Models (LLMs) struggle when answering complex questions. Our focus is on two key stages, mirroring the human QA process: 1) \textit{question decomposition}, where the model breaks down a complex question into sub-questions and 2) \textit{subproblem solving}, where it addresses each sub-question to obtain the final response. We preprocess and expand three multi-hop datasets to create experimental datasets featuring explicit and implicit multi-hop questions, crowdsourced and templated questions, and varying numbers of hops. Our results show that larger models (Llama 3.1 70B and o1) excel at decomposing explicit multi-hop questions but struggle with implicit ones, while smaller models (e.g., Llama 3.1 8B) have difficulty with both. In the sub-problem solving stage, all models perform well on simple questions with context. Furthermore, we found no correlation between accuracy in the question decomposition stage and final QA performance (direct response), highlighting a key difference between human and LLM reasoning."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ho-etal-2026-identifying">
<titleInfo>
<title>Identifying Where Large Language Models Struggle in Answering Complex Questions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xanh</namePart>
<namePart type="family">Ho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Florian</namePart>
<namePart type="family">Boudin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saku</namePart>
<namePart type="family">Sugawara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khoa</namePart>
<namePart type="family">Duong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akiko</namePart>
<namePart type="family">Aizawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>We design experiments to identify where Large Language Models (LLMs) struggle when answering complex questions. Our focus is on two key stages, mirroring the human QA process: 1) question decomposition, where the model breaks down a complex question into sub-questions and 2) subproblem solving, where it addresses each sub-question to obtain the final response. We preprocess and expand three multi-hop datasets to create experimental datasets featuring explicit and implicit multi-hop questions, crowdsourced and templated questions, and varying numbers of hops. Our results show that larger models (Llama 3.1 70B and o1) excel at decomposing explicit multi-hop questions but struggle with implicit ones, while smaller models (e.g., Llama 3.1 8B) have difficulty with both. In the sub-problem solving stage, all models perform well on simple questions with context. Furthermore, we found no correlation between accuracy in the question decomposition stage and final QA performance (direct response), highlighting a key difference between human and LLM reasoning.</abstract>
<identifier type="citekey">ho-etal-2026-identifying</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.11/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>112</start>
<end>123</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Identifying Where Large Language Models Struggle in Answering Complex Questions
%A Ho, Xanh
%A Boudin, Florian
%A Sugawara, Saku
%A Duong, Khoa
%A Aizawa, Akiko
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F ho-etal-2026-identifying
%X We design experiments to identify where Large Language Models (LLMs) struggle when answering complex questions. Our focus is on two key stages, mirroring the human QA process: 1) question decomposition, where the model breaks down a complex question into sub-questions and 2) subproblem solving, where it addresses each sub-question to obtain the final response. We preprocess and expand three multi-hop datasets to create experimental datasets featuring explicit and implicit multi-hop questions, crowdsourced and templated questions, and varying numbers of hops. Our results show that larger models (Llama 3.1 70B and o1) excel at decomposing explicit multi-hop questions but struggle with implicit ones, while smaller models (e.g., Llama 3.1 8B) have difficulty with both. In the sub-problem solving stage, all models perform well on simple questions with context. Furthermore, we found no correlation between accuracy in the question decomposition stage and final QA performance (direct response), highlighting a key difference between human and LLM reasoning.
%U https://aclanthology.org/2026.gem-main.11/
%P 112-123
Markdown (Informal)
[Identifying Where Large Language Models Struggle in Answering Complex Questions](https://aclanthology.org/2026.gem-main.11/) (Ho et al., GEM 2026)
ACL