@inproceedings{sashida-etal-2026-revealing,
title = "Revealing Redundant Syntax in Large Language Models through Multi-Hop Dependency Paths",
author = "Sashida, Masaki and
Kojima, Takeshi and
Iwasawa, Yusuke and
Matsuo, Yutaka",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.214/",
pages = "4114--4137",
ISBN = "979-8-89176-386-9",
abstract = "Prior work on attention{--}syntax alignment has largely focused on single-hop Universal Dependency edges (DPs). In this paper, we treat short multi-hop dependency paths (MDPs) (e.g., ``obl+case'') as first-class units and analyze them alongside DPs. Across three pretrained autoregressive LMs (GPT-2 XL, Llama 3 8B, Qwen3-8B) and one encoder baseline (BERT-large), we extract 2{--}3 hop MDPs from UD-parsed English and quantify head{--}relation alignment with an Unlabeled Attachment Score (UAS){--}style metric modified for causal masking in decoder-only models. Rank visualizations reveal both overlap and specialization: we observe heads that align with both DPs and MDPs, as well as heads that appear specialized for one route. To test functional relevance, we first identify heads by UAS and then apply an undifferentiated (uniform) attention ablation to those heads; we evaluate the impact on BLiMP and LAMBADA. Ablating the top 10{\%} of all heads shows that MDP-selected heads induce larger drops than DP-selected heads and that the union ({``}Mix'') of DP- and MDP-selected heads yields the largest drops. For GPT-2 XL, the observed drops are (BLiMP: $\Delta$DP = 1.35 pp, $\Delta$MDP = 4.81 pp, $\Delta$Mix = 7.11 pp; LAMBADA: $\Delta$DP = 4.70 pp, $\Delta$MDP = 25.17 pp, $\Delta$Mix = 32.99 pp), all exceeding size-matched random controls. These results indicate that models can route information consistent with syntactic dependencies via both DP and MDP pathways, with MDPs playing a distinct and measurable role in some settings under our interventions."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sashida-etal-2026-revealing">
    <titleInfo>
      <title>Revealing Redundant Syntax in Large Language Models through Multi-Hop Dependency Paths</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Masaki</namePart>
      <namePart type="family">Sashida</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Takeshi</namePart>
      <namePart type="family">Kojima</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yusuke</namePart>
      <namePart type="family">Iwasawa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yutaka</namePart>
      <namePart type="family">Matsuo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Vera</namePart>
        <namePart type="family">Demberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kentaro</namePart>
        <namePart type="family">Inui</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lluís</namePart>
        <namePart type="family">Marquez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-386-9</identifier>
    </relatedItem>
    <abstract>Prior work on attention–syntax alignment has largely focused on single-hop Universal Dependency edges (DPs). In this paper, we treat short multi-hop dependency paths (MDPs) (e.g., “obl+case”) as first-class units and analyze them alongside DPs. Across three pretrained autoregressive LMs (GPT-2 XL, Llama 3 8B, Qwen3-8B) and one encoder baseline (BERT-large), we extract 2–3 hop MDPs from UD-parsed English and quantify head–relation alignment with an Unlabeled Attachment Score (UAS)–style metric modified for causal masking in decoder-only models. Rank visualizations reveal both overlap and specialization: we observe heads that align with both DPs and MDPs, as well as heads that appear specialized for one route. To test functional relevance, we first identify heads by UAS and then apply an undifferentiated (uniform) attention ablation to those heads; we evaluate the impact on BLiMP and LAMBADA. Ablating the top 10% of all heads shows that MDP-selected heads induce larger drops than DP-selected heads and that the union (“Mix”) of DP- and MDP-selected heads yields the largest drops. For GPT-2 XL, the observed drops are (BLiMP: ΔDP = 1.35 pp, ΔMDP = 4.81 pp, ΔMix = 7.11 pp; LAMBADA: ΔDP = 4.70 pp, ΔMDP = 25.17 pp, ΔMix = 32.99 pp), all exceeding size-matched random controls. These results indicate that models can route information consistent with syntactic dependencies via both DP and MDP pathways, with MDPs playing a distinct and measurable role in some settings under our interventions.</abstract>
    <identifier type="citekey">sashida-etal-2026-revealing</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-eacl.214/</url>
    </location>
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>4114</start>
        <end>4137</end>
      </extent>
    </part>
  </mods>
</modsCollection>
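The abstract above describes a UAS-style head–relation alignment score adapted for causal masking in decoder-only models. As a minimal sketch of what such a metric could look like — assuming a per-head attention matrix and UD-derived token pairs; the function name `score_head_alignment` and the convention of querying from the later token of each pair are illustrative assumptions, not the authors' code:

```python
# Hedged sketch of a UAS-style head-relation alignment score under causal
# masking. Names and conventions are illustrative assumptions; the paper's
# exact metric may differ in details (direction handling, ties,
# subword-to-word alignment).
import numpy as np

def score_head_alignment(attn: np.ndarray, pairs: list[tuple[int, int]]) -> float:
    """attn: (seq_len, seq_len) row-stochastic attention matrix for one head,
    rows indexed by query position. pairs: (dependent, governor) token-index
    pairs; for a 2-3 hop MDP such as "obl+case", a pair links the two
    endpoints of the path rather than a single UD edge."""
    hits, total = 0, 0
    for dep, gov in pairs:
        # Under causal masking a token only sees earlier positions, so the
        # later token of the pair must act as the query.
        src, tgt = (dep, gov) if gov < dep else (gov, dep)
        if src == tgt:
            continue  # degenerate pair; not scorable
        total += 1
        if int(attn[src, : src + 1].argmax()) == tgt:
            hits += 1
    return hits / total if total else 0.0
```

Averaging such a score over a UD-parsed corpus and ranking heads by it would yield the per-head rankings that the abstract's "rank visualizations" refer to; for the encoder baseline (BERT-large), the causal restriction would simply be dropped.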
%0 Conference Proceedings
%T Revealing Redundant Syntax in Large Language Models through Multi-Hop Dependency Paths
%A Sashida, Masaki
%A Kojima, Takeshi
%A Iwasawa, Yusuke
%A Matsuo, Yutaka
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F sashida-etal-2026-revealing
%X Prior work on attention–syntax alignment has largely focused on single-hop Universal Dependency edges (DPs). In this paper, we treat short multi-hop dependency paths (MDPs) (e.g., “obl+case”) as first-class units and analyze them alongside DPs. Across three pretrained autoregressive LMs (GPT-2 XL, Llama 3 8B, Qwen3-8B) and one encoder baseline (BERT-large), we extract 2–3 hop MDPs from UD-parsed English and quantify head–relation alignment with an Unlabeled Attachment Score (UAS)–style metric modified for causal masking in decoder-only models. Rank visualizations reveal both overlap and specialization: we observe heads that align with both DPs and MDPs, as well as heads that appear specialized for one route. To test functional relevance, we first identify heads by UAS and then apply an undifferentiated (uniform) attention ablation to those heads; we evaluate the impact on BLiMP and LAMBADA. Ablating the top 10% of all heads shows that MDP-selected heads induce larger drops than DP-selected heads and that the union (“Mix”) of DP- and MDP-selected heads yields the largest drops. For GPT-2 XL, the observed drops are (BLiMP: ΔDP = 1.35 pp, ΔMDP = 4.81 pp, ΔMix = 7.11 pp; LAMBADA: ΔDP = 4.70 pp, ΔMDP = 25.17 pp, ΔMix = 32.99 pp), all exceeding size-matched random controls. These results indicate that models can route information consistent with syntactic dependencies via both DP and MDP pathways, with MDPs playing a distinct and measurable role in some settings under our interventions.
%U https://aclanthology.org/2026.findings-eacl.214/
%P 4114-4137
Markdown (Informal)
[Revealing Redundant Syntax in Large Language Models through Multi-Hop Dependency Paths](https://aclanthology.org/2026.findings-eacl.214/) (Sashida et al., Findings 2026)
ACL
Masaki Sashida, Takeshi Kojima, Yusuke Iwasawa, and Yutaka Matsuo. 2026. Revealing Redundant Syntax in Large Language Models through Multi-Hop Dependency Paths. In *Findings of the Association for Computational Linguistics: EACL 2026*, pages 4114–4137, Rabat, Morocco. Association for Computational Linguistics.
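The intervention the abstract describes, "undifferentiated (uniform) attention ablation", is also simple to sketch. Below is a minimal illustration under stated assumptions — the function `uniform_ablate`, its tensor layout, and the post-softmax patch point are hypothetical; the paper may intervene at a different point in the attention computation:

```python
# Hedged sketch of uniform attention ablation for selected heads in a
# decoder-only model: each ablated head's attention distribution is replaced
# by a uniform distribution over the causally visible positions.
import torch

def uniform_ablate(attn: torch.Tensor, heads: list[int]) -> torch.Tensor:
    """attn: (batch, n_heads, seq_len, seq_len) post-softmax attention
    weights; heads: indices of heads (within one layer) to ablate."""
    b, h, q, k = attn.shape
    # Row i of the causal mask allows attention only to positions j <= i.
    causal = torch.tril(torch.ones(q, k, device=attn.device))
    uniform = causal / causal.sum(dim=-1, keepdim=True)  # row-normalized
    out = attn.clone()
    out[:, heads] = uniform  # broadcast over the batch for selected heads
    return out
```

In practice such a patch would be applied inside each layer's attention module (e.g., via a forward hook) for the heads selected by the DP, MDP, or Mix rankings, and the model re-evaluated on BLiMP and LAMBADA to measure the accuracy drops the abstract reports.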