@inproceedings{biran-etal-2024-hopping,
title = "Hopping Too Late: Exploring the Limitations of Large Language Models on Multi-Hop Queries",
author = "Biran, Eden and
Gottesman, Daniela and
Yang, Sohee and
Geva, Mor and
Globerson, Amir",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.781",
doi = "10.18653/v1/2024.emnlp-main.781",
pages = "14113--14130",
abstract = "Large language models (LLMs) can solve complex multi-step problems, but little is known about how these computations are implemented internally. Motivated by this, we study how LLMs answer multi-hop queries such as {``}The spouse of the performer of Imagine is{''}. These queries require two information extraction steps: a latent one for resolving the first hop ({``}the performer of Imagine{''}) into the bridge entity (John Lennon), and another for resolving the second hop ({``}the spouse of John Lennon{''}) into the target entity (Yoko Ono). Understanding how the latent step is computed internally is key to understanding the overall computation. By carefully analyzing the internal computations of transformer-based LLMs, we discover that the bridge entity is resolved in the early layers of the model. Then, only after this resolution, the two-hop query is solved in the later layers. Because the second hop commences in later layers, there could be cases where these layers no longer encode the necessary knowledge for correctly predicting the answer. Motivated by this, we propose a novel {``}back-patching{''} analysis method whereby a hidden representation from a later layer is patched back to an earlier layer. We find that in up to 66{\%} of previously incorrect cases there exists a back-patch that results in the correct generation of the answer, showing that the later layers indeed sometimes lack the needed functionality. Overall our methods and findings open further opportunities for understanding and improving latent reasoning in transformer-based LLMs.",
}
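
For readers who want to experiment with the "back-patching" analysis described in the abstract, the sketch below shows the general idea on a small HuggingFace decoder-only model: take the final-token hidden state at a later layer and substitute it as the input to an earlier layer on a second forward pass. The model choice, layer indices, and hook wiring are illustrative assumptions, not the authors' released implementation.

```python
# Minimal sketch of back-patching: copy the residual-stream state of the
# last prompt token from a *later* layer and feed it in as the input to an
# *earlier* layer on a second pass. Model, layer indices, and prompt are
# placeholders, not the paper's exact setup.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # stand-in; the paper analyzes larger LLMs
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

prompt = "The spouse of the performer of Imagine is"
inputs = tok(prompt, return_tensors="pt")

src_layer, tgt_layer = 10, 3  # later -> earlier (hypothetical choices)

# 1) Clean run: record the last-token hidden state at src_layer.
#    hidden_states[0] is the embedding output; hidden_states[i] is the
#    output of transformer block i-1.
with torch.no_grad():
    clean = model(**inputs, output_hidden_states=True)
patched_state = clean.hidden_states[src_layer][:, -1, :].clone()

# 2) Patched run: a forward pre-hook overwrites the input of block
#    tgt_layer at the last position with the later-layer state.
def back_patch(module, args, kwargs):
    hidden = args[0].clone()
    hidden[:, -1, :] = patched_state
    return (hidden, *args[1:]), kwargs

handle = model.transformer.h[tgt_layer].register_forward_pre_hook(
    back_patch, with_kwargs=True
)
with torch.no_grad():
    patched = model(**inputs)
handle.remove()

# Does the patched run now surface the answer as the next token?
print(tok.decode(patched.logits[0, -1].argmax()))
```

In the paper's analysis, a query counts as repaired if some (source, target) layer pair yields the correct answer, so in practice one would sweep `src_layer` and `tgt_layer` over all valid pairs rather than fix them as above.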