@inproceedings{torstensson-holmstrom-2025-grammar,
title = "A Grammar-Based Method for Instilling Empirical Dependency Structure in {LLM}s",
author = {Torstensson, Olle and
Holmstr{\"o}m, Oskar},
editor = "Trosterud, Trond and
Wiechetek, Linda and
Pirinen, Flammie",
booktitle = "Proceedings of the 9th Workshop on Constraint Grammar and Finite State NLP",
month = mar,
year = "2025",
address = "Tallinn, Estonia",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2025.cgmta-1.7/",
pages = "45--49",
ISBN = "978-9908-53-113-7",
abstract = "We investigate whether synthetic pretraining data generated from a formal grammar modeling syntactic dependencies can improve English language models. Building upon the structured pretraining data approach of Papadimitriou and Jurafsky (2023), we develop a grammar that more closely mirrors empirical dependency structures. Our results are negative {--} this type of pretraining significantly degrades model performance, with both our and their pretraining approach performing worse than no pretraining at all. We analyze potential explanations for these findings and discuss implications for future work on structured-data pretraining."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="torstensson-holmstrom-2025-grammar">
<titleInfo>
<title>A Grammar-Based Method for Instilling Empirical Dependency Structure in LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Olle</namePart>
<namePart type="family">Torstensson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oskar</namePart>
<namePart type="family">Holmström</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 9th Workshop on Constraint Grammar and Finite State NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trond</namePart>
<namePart type="family">Trosterud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linda</namePart>
<namePart type="family">Wiechetek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flammie</namePart>
<namePart type="family">Pirinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tallinn, Estonia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-9908-53-113-7</identifier>
</relatedItem>
<abstract>We investigate whether synthetic pretraining data generated from a formal grammar modeling syntactic dependencies can improve English language models. Building upon the structured pretraining data approach of Papadimitriou and Jurafsky (2023), we develop a grammar that more closely mirrors empirical dependency structures. Our results are negative – this type of pretraining significantly degrades model performance, with both our and their pretraining approach performing worse than no pretraining at all. We analyze potential explanations for these findings and discuss implications for future work on structured-data pretraining.</abstract>
<identifier type="citekey">torstensson-holmstrom-2025-grammar</identifier>
<location>
<url>https://aclanthology.org/2025.cgmta-1.7/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>45</start>
<end>49</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Grammar-Based Method for Instilling Empirical Dependency Structure in LLMs
%A Torstensson, Olle
%A Holmström, Oskar
%Y Trosterud, Trond
%Y Wiechetek, Linda
%Y Pirinen, Flammie
%S Proceedings of the 9th Workshop on Constraint Grammar and Finite State NLP
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-113-7
%F torstensson-holmstrom-2025-grammar
%X We investigate whether synthetic pretraining data generated from a formal grammar modeling syntactic dependencies can improve English language models. Building upon the structured pretraining data approach of Papadimitriou and Jurafsky (2023), we develop a grammar that more closely mirrors empirical dependency structures. Our results are negative – this type of pretraining significantly degrades model performance, with both our and their pretraining approach performing worse than no pretraining at all. We analyze potential explanations for these findings and discuss implications for future work on structured-data pretraining.
%U https://aclanthology.org/2025.cgmta-1.7/
%P 45-49
Markdown (Informal)
[A Grammar-Based Method for Instilling Empirical Dependency Structure in LLMs](https://aclanthology.org/2025.cgmta-1.7/) (Torstensson & Holmström, cgmta 2025)
ACL