@inproceedings{mackintosh-etal-2025-evaluating,
title = "Evaluating {C}x{G} Generalisation in {LLM}s via Construction-Based {NLI} Fine Tuning",
author = "Mackintosh, Tom and
Tayyar Madabushi, Harish and
Bonial, Claire",
editor = "Bonial, Claire and
Torgbi, Melissa and
Weissweiler, Leonie and
Blodgett, Austin and
Beuls, Katrien and
Van Eecke, Paul and
Tayyar Madabushi, Harish",
booktitle = "Proceedings of the Second International Workshop on Construction Grammars and NLP",
month = sep,
year = "2025",
address = {D{\"u}sseldorf, Germany},
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.cxgsnlp-1.19/",
pages = "180--189",
ISBN = "979-8-89176-318-0",
    abstract = "We probe large language models' ability to learn deep form-meaning mappings as defined by construction grammars. We introduce the ConTest-NLI benchmark of 80k sentences covering eight English constructions from highly lexicalized to highly schematic. Our pipeline generates diverse synthetic NLI triples via templating and the application of a model-in-the-loop filter. This provides aspects of human validation to ensure challenge and label reliability. Zero-shot tests on leading LLMs reveal a 24{\%} drop in accuracy between naturalistic (88{\%}) and adversarial data (64{\%}), with schematic patterns proving hardest. Fine-tuning on a subset of ConTest-NLI yields up to 9{\%} improvement, yet our results highlight persistent abstraction gaps in current LLMs and offer a scalable framework for evaluating construction-informed learning."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mackintosh-etal-2025-evaluating">
<titleInfo>
<title>Evaluating CxG Generalisation in LLMs via Construction-Based NLI Fine Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Mackintosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harish</namePart>
<namePart type="family">Tayyar Madabushi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claire</namePart>
<namePart type="family">Bonial</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second International Workshop on Construction Grammars and NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claire</namePart>
<namePart type="family">Bonial</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Melissa</namePart>
<namePart type="family">Torgbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leonie</namePart>
<namePart type="family">Weissweiler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Austin</namePart>
<namePart type="family">Blodgett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katrien</namePart>
<namePart type="family">Beuls</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Van Eecke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harish</namePart>
<namePart type="family">Tayyar Madabushi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Düsseldorf, Germany</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-318-0</identifier>
</relatedItem>
  <abstract>We probe large language models’ ability to learn deep form-meaning mappings as defined by construction grammars. We introduce the ConTest-NLI benchmark of 80k sentences covering eight English constructions from highly lexicalized to highly schematic. Our pipeline generates diverse synthetic NLI triples via templating and the application of a model-in-the-loop filter. This provides aspects of human validation to ensure challenge and label reliability. Zero-shot tests on leading LLMs reveal a 24% drop in accuracy between naturalistic (88%) and adversarial data (64%), with schematic patterns proving hardest. Fine-tuning on a subset of ConTest-NLI yields up to 9% improvement, yet our results highlight persistent abstraction gaps in current LLMs and offer a scalable framework for evaluating construction-informed learning.</abstract>
<identifier type="citekey">mackintosh-etal-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.cxgsnlp-1.19/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>180</start>
<end>189</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating CxG Generalisation in LLMs via Construction-Based NLI Fine Tuning
%A Mackintosh, Tom
%A Tayyar Madabushi, Harish
%A Bonial, Claire
%Y Bonial, Claire
%Y Torgbi, Melissa
%Y Weissweiler, Leonie
%Y Blodgett, Austin
%Y Beuls, Katrien
%Y Van Eecke, Paul
%Y Tayyar Madabushi, Harish
%S Proceedings of the Second International Workshop on Construction Grammars and NLP
%D 2025
%8 September
%I Association for Computational Linguistics
%C Düsseldorf, Germany
%@ 979-8-89176-318-0
%F mackintosh-etal-2025-evaluating
%X We probe large language models’ ability to learn deep form-meaning mappings as defined by construction grammars. We introduce the ConTest-NLI benchmark of 80k sentences covering eight English constructions from highly lexicalized to highly schematic. Our pipeline generates diverse synthetic NLI triples via templating and the application of a model-in-the-loop filter. This provides aspects of human validation to ensure challenge and label reliability. Zero-shot tests on leading LLMs reveal a 24% drop in accuracy between naturalistic (88%) and adversarial data (64%), with schematic patterns proving hardest. Fine-tuning on a subset of ConTest-NLI yields up to 9% improvement, yet our results highlight persistent abstraction gaps in current LLMs and offer a scalable framework for evaluating construction-informed learning.
%U https://aclanthology.org/2025.cxgsnlp-1.19/
%P 180-189
Markdown (Informal)
[Evaluating CxG Generalisation in LLMs via Construction-Based NLI Fine Tuning](https://aclanthology.org/2025.cxgsnlp-1.19/) (Mackintosh et al., CxGsNLP 2025)
ACL