BibTeX
@inproceedings{pistotti-etal-2025-exploring,
title = "Exploring Gaps in the {APS}: Direct Minimal Pair Analysis in {LLM} Syntactic Assessments",
author = "Pistotti, Timothy and
Brown, Jason and
Witbrock, Michael J.",
editor = "Bernard, Timoth{\'e}e and
Mickus, Timothee",
booktitle = "Proceedings of the Second Workshop on the Bridges and Gaps between Formal and Computational Linguistics (BriGap-2)",
month = sep,
year = "2025",
address = {D{\"u}sseldorf, Germany},
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.brigap-1.4/",
pages = "20--25",
ISBN = "979-8-89176-317-3",
abstract = "Recent studies probing the Argument from the Poverty of the Stimulus (APS) have applied Large Language Models (LLMs) to test the learnability of complex syntax through surprisal-based metrics. However, divergent conclusions raise questions concerning the insights these metrics offer. While Wilcox et al. (2024) used direct minimal pair comparisons (the ``wh-effect'') to demonstrate that models successfully generalise knowledge of filler-gap dependencies, Lan et al. (2024) used a Difference-in-Differences (DiD) metric and found that models largely fail on parasitic gaps (PGs). This paper argues that the direct minimal pair approach offers greater diagnostic transparency. We demonstrate this by generating a full 8-permutation paradigm of refined PG stimuli and evaluating the GPT-2 model used in previous studies with a systematic Wilcox-style wh-effect analysis. Our results show that GPT-2 succeeds across all four tested conditions, indicating robust knowledge of filler-gap licensing principles even in complex PG environments. This finding, which contrasts with the more ambiguous results from DiD-style metrics, suggests that the choice of evaluation metric is critical for assessing an LLM{'}s syntactic competence."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pistotti-etal-2025-exploring">
<titleInfo>
<title>Exploring Gaps in the APS: Direct Minimal Pair Analysis in LLM Syntactic Assessments</title>
</titleInfo>
<name type="personal">
<namePart type="given">Timothy</namePart>
<namePart type="family">Pistotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="family">Brown</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="given">J</namePart>
<namePart type="family">Witbrock</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on the Bridges and Gaps between Formal and Computational Linguistics (BriGap-2)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Timothée</namePart>
<namePart type="family">Bernard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timothee</namePart>
<namePart type="family">Mickus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Düsseldorf, Germany</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-317-3</identifier>
</relatedItem>
<abstract>Recent studies probing the Argument from the Poverty of the Stimulus (APS) have applied Large Language Models (LLMs) to test the learnability of complex syntax through surprisal-based metrics. However, divergent conclusions raise questions concerning the insights these metrics offer. While Wilcox et al. (2024) used direct minimal pair comparisons (the “wh-effect”) to demonstrate that models successfully generalise knowledge of filler-gap dependencies, Lan et al. (2024) used a Difference-in-Differences (DiD) metric and found that models largely fail on parasitic gaps (PGs). This paper argues that the direct minimal pair approach offers greater diagnostic transparency. We demonstrate this by generating a full 8-permutation paradigm of refined PG stimuli and evaluating the GPT-2 model used in previous studies with a systematic Wilcox-style wh-effect analysis. Our results show that GPT-2 succeeds across all four tested conditions, indicating robust knowledge of filler-gap licensing principles even in complex PG environments. This finding, which contrasts with the more ambiguous results from DiD-style metrics, suggests that the choice of evaluation metric is critical for assessing an LLM’s syntactic competence.</abstract>
<identifier type="citekey">pistotti-etal-2025-exploring</identifier>
<location>
<url>https://aclanthology.org/2025.brigap-1.4/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>20</start>
<end>25</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Exploring Gaps in the APS: Direct Minimal Pair Analysis in LLM Syntactic Assessments
%A Pistotti, Timothy
%A Brown, Jason
%A Witbrock, Michael J.
%Y Bernard, Timothée
%Y Mickus, Timothee
%S Proceedings of the Second Workshop on the Bridges and Gaps between Formal and Computational Linguistics (BriGap-2)
%D 2025
%8 September
%I Association for Computational Linguistics
%C Düsseldorf, Germany
%@ 979-8-89176-317-3
%F pistotti-etal-2025-exploring
%X Recent studies probing the Argument from the Poverty of the Stimulus (APS) have applied Large Language Models (LLMs) to test the learnability of complex syntax through surprisal-based metrics. However, divergent conclusions raise questions concerning the insights these metrics offer. While Wilcox et al. (2024) used direct minimal pair comparisons (the “wh-effect”) to demonstrate that models successfully generalise knowledge of filler-gap dependencies, Lan et al. (2024) used a Difference-in-Differences (DiD) metric and found that models largely fail on parasitic gaps (PGs). This paper argues that the direct minimal pair approach offers greater diagnostic transparency. We demonstrate this by generating a full 8-permutation paradigm of refined PG stimuli and evaluating the GPT-2 model used in previous studies with a systematic Wilcox-style wh-effect analysis. Our results show that GPT-2 succeeds across all four tested conditions, indicating robust knowledge of filler-gap licensing principles even in complex PG environments. This finding, which contrasts with the more ambiguous results from DiD-style metrics, suggests that the choice of evaluation metric is critical for assessing an LLM’s syntactic competence.
%U https://aclanthology.org/2025.brigap-1.4/
%P 20-25
Markdown (Informal)
[Exploring Gaps in the APS: Direct Minimal Pair Analysis in LLM Syntactic Assessments](https://aclanthology.org/2025.brigap-1.4/) (Pistotti et al., BriGap 2025)
ACL
Timothy Pistotti, Jason Brown, and Michael J. Witbrock. 2025. Exploring Gaps in the APS: Direct Minimal Pair Analysis in LLM Syntactic Assessments. In Proceedings of the Second Workshop on the Bridges and Gaps between Formal and Computational Linguistics (BriGap-2), pages 20–25, Düsseldorf, Germany. Association for Computational Linguistics.
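
A note on the method (informal): the abstract describes surprisal-based minimal-pair tests, in which a model's syntactic knowledge is probed by comparing the surprisal (negative log-probability) of a critical region across minimally different sentences. The sketch below illustrates the general idea using the Hugging Face transformers GPT-2 checkpoint. The example sentences, region boundaries, and the DiD-style summary are illustrative assumptions for exposition only; they are not the paper's stimuli or its exact metrics.

# Hypothetical sketch of a surprisal-based minimal-pair comparison with GPT-2,
# in the spirit of the wh-effect analysis described in the abstract above.
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

def region_surprisal(context: str, region: str) -> float:
    """Summed surprisal (nats) of the tokens in `region`, given `context`.

    Assumes the BPE tokenization of context + region splits cleanly at the
    boundary (true here because each region starts with a space).
    """
    ctx_len = tokenizer(context, return_tensors="pt").input_ids.shape[1]
    ids = tokenizer(context + region, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(ids).logits
    log_probs = torch.log_softmax(logits[0, :-1], dim=-1)  # row t predicts token t+1
    targets = ids[0, 1:]
    start = ctx_len - 1  # index in `targets` of the first region token
    surprisals = -log_probs[torch.arange(start, targets.numel()), targets[start:]]
    return surprisals.sum().item()

# Illustrative +/- filler prefixes (not the paper's stimuli).
plus_filler = "I know what the attorney filed"    # wh-filler licenses a gap
minus_filler = "I know that the attorney filed"   # no filler: object expected
gap_region = " yesterday."                        # gap site (object omitted)
object_region = " the brief"                      # filled object position

# Direct minimal-pair contrasts (Wilcox-style "wh-effects"): a filler should
# make a gap less surprising and a filled object more surprising.
wh_at_gap = region_surprisal(minus_filler, gap_region) - region_surprisal(plus_filler, gap_region)
wh_at_object = region_surprisal(plus_filler, object_region) - region_surprisal(minus_filler, object_region)

# One possible DiD-style summary collapses the two contrasts into a single
# interaction term; definitions of such metrics vary across studies.
did = wh_at_gap + wh_at_object

print(f"wh-effect at gap site: {wh_at_gap:+.2f} nats (expected > 0)")
print(f"wh-effect at object:   {wh_at_object:+.2f} nats (expected > 0)")
print(f"DiD-style interaction: {did:+.2f} nats")

A positive value for both direct contrasts would mirror the paper's finding that GPT-2 applies filler-gap licensing in both directions; the single collapsed interaction number illustrates why, as the abstract argues, a DiD-style metric can obscure which of the component contrasts drives (or fails to drive) the result.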