@inproceedings{yamakoshi-etal-2025-evaluating,
title = "Evaluating distillation methods for data-efficient syntax learning",
author = "Yamakoshi, Takateru and
Griffiths, Thomas L. and
McCoy, R. Thomas and
Hawkins, Robert D.",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.801/",
pages = "14834--14847",
ISBN = "979-8-89176-335-7",
abstract = "Data-efficient training requires strong inductive biases. To the extent that transformer attention matrices encode syntactic relationships, we would predict that knowledge distillation (KD) targeting attention should selectively accelerate syntax acquisition relative to conventional logit-based KD. To test this hypothesis, we train GPT-2 student models on datasets ranging from 10K to 5M sentences using both distillation methods, evaluating them on both syntactic benchmarks and perplexity. Surprisingly, while logit-based KD dramatically improves data-efficiency, attention-based KD provides minimal benefit even for syntactic tasks. This suggests that output distributions provide sufficient supervisory signal for syntax acquisition, indicating that syntactic knowledge may be distributed throughout the network rather than localized in attention patterns."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yamakoshi-etal-2025-evaluating">
<titleInfo>
<title>Evaluating distillation methods for data-efficient syntax learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Takateru</namePart>
<namePart type="family">Yamakoshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Griffiths</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">R</namePart>
<namePart type="given">Thomas</namePart>
<namePart type="family">McCoy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Hawkins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Data-efficient training requires strong inductive biases. To the extent that transformer attention matrices encode syntactic relationships, we would predict that knowledge distillation (KD) targeting attention should selectively accelerate syntax acquisition relative to conventional logit-based KD. To test this hypothesis, we train GPT-2 student models on datasets ranging from 10K to 5M sentences using both distillation methods, evaluating them on both syntactic benchmarks and perplexity. Surprisingly, while logit-based KD dramatically improves data-efficiency, attention-based KD provides minimal benefit even for syntactic tasks. This suggests that output distributions provide sufficient supervisory signal for syntax acquisition, indicating that syntactic knowledge may be distributed throughout the network rather than localized in attention patterns.</abstract>
<identifier type="citekey">yamakoshi-etal-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.801/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>14834</start>
<end>14847</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating distillation methods for data-efficient syntax learning
%A Yamakoshi, Takateru
%A Griffiths, Thomas L.
%A McCoy, R. Thomas
%A Hawkins, Robert D.
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F yamakoshi-etal-2025-evaluating
%X Data-efficient training requires strong inductive biases. To the extent that transformer attention matrices encode syntactic relationships, we would predict that knowledge distillation (KD) targeting attention should selectively accelerate syntax acquisition relative to conventional logit-based KD. To test this hypothesis, we train GPT-2 student models on datasets ranging from 10K to 5M sentences using both distillation methods, evaluating them on both syntactic benchmarks and perplexity. Surprisingly, while logit-based KD dramatically improves data-efficiency, attention-based KD provides minimal benefit even for syntactic tasks. This suggests that output distributions provide sufficient supervisory signal for syntax acquisition, indicating that syntactic knowledge may be distributed throughout the network rather than localized in attention patterns.
%U https://aclanthology.org/2025.findings-emnlp.801/
%P 14834-14847
Markdown (Informal)
[Evaluating distillation methods for data-efficient syntax learning](https://aclanthology.org/2025.findings-emnlp.801/) (Yamakoshi et al., Findings 2025)
ACL