@article{suijkerbuijk-etal-2025-blimp,
  title     = {{BLiMP-NL}: A Corpus of {Dutch} Minimal Pairs and Acceptability Judgments for Language Model Evaluation},
  author    = {Suijkerbuijk, Michelle and
               Prins, Zo{\"e} and
               de Heer Kloots, Marianne and
               Zuidema, Willem and
               Frank, Stefan L.},
  journal   = {Computational Linguistics},
  volume    = {51},
  number    = {4},
  month     = dec,
  year      = {2025},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://aclanthology.org/2025.cl-4.6/},
  doi       = {10.1162/coli_a_00559},
  pages     = {1267--1301},
  abstract  = {We present a corpus of 8,400 Dutch sentence pairs, intended primarily for the grammatical evaluation of language models. Each pair consists of a grammatical sentence and a minimally different ungrammatical sentence. The corpus covers 84 paradigms, classified into 22 syntactic phenomena. Ten sentence pairs of each paradigm were created by hand, while the remaining 90 were generated semi-automatically and manually validated afterwards. Nine of the 10 hand-crafted sentences of each paradigm are rated for acceptability by at least 30 participants each, and for the same 9 sentences reading times are recorded per word, through self-paced reading. Here, we report on the construction of the dataset, the measured acceptability ratings and reading times, as well as the extent to which a variety of language models can be used to predict both the ground-truth grammaticality and human acceptability ratings.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="suijkerbuijk-etal-2025-blimp">
<titleInfo>
<title>BLiMP-NL: A Corpus of Dutch Minimal Pairs and Acceptability Judgments for Language Model Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michelle</namePart>
<namePart type="family">Suijkerbuijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zoë</namePart>
<namePart type="family">Prins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianne</namePart>
<namePart type="family">de Heer Kloots</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Willem</namePart>
<namePart type="family">Zuidema</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Frank</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>We present a corpus of 8,400 Dutch sentence pairs, intended primarily for the grammatical evaluation of language models. Each pair consists of a grammatical sentence and a minimally different ungrammatical sentence. The corpus covers 84 paradigms, classified into 22 syntactic phenomena. Ten sentence pairs of each paradigm were created by hand, while the remaining 90 were generated semi-automatically and manually validated afterwards. Nine of the 10 hand-crafted sentences of each paradigm are rated for acceptability by at least 30 participants each, and for the same 9 sentences reading times are recorded per word, through self-paced reading. Here, we report on the construction of the dataset, the measured acceptability ratings and reading times, as well as the extent to which a variety of language models can be used to predict both the ground-truth grammaticality and human acceptability ratings.</abstract>
<identifier type="citekey">suijkerbuijk-etal-2025-blimp</identifier>
<identifier type="doi">10.1162/coli_a_00559</identifier>
<location>
<url>https://aclanthology.org/2025.cl-4.6/</url>
</location>
<part>
<date>2025-12</date>
<detail type="volume"><number>51</number></detail>
<detail type="issue"><number>4</number></detail>
<extent unit="page">
<start>1267</start>
<end>1301</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T BLiMP-NL: A Corpus of Dutch Minimal Pairs and Acceptability Judgments for Language Model Evaluation
%A Suijkerbuijk, Michelle
%A Prins, Zoë
%A de Heer Kloots, Marianne
%A Zuidema, Willem
%A Frank, Stefan L.
%J Computational Linguistics
%D 2025
%8 December
%V 51
%N 4
%I MIT Press
%C Cambridge, MA
%F suijkerbuijk-etal-2025-blimp
%X We present a corpus of 8,400 Dutch sentence pairs, intended primarily for the grammatical evaluation of language models. Each pair consists of a grammatical sentence and a minimally different ungrammatical sentence. The corpus covers 84 paradigms, classified into 22 syntactic phenomena. Ten sentence pairs of each paradigm were created by hand, while the remaining 90 were generated semi-automatically and manually validated afterwards. Nine of the 10 hand-crafted sentences of each paradigm are rated for acceptability by at least 30 participants each, and for the same 9 sentences reading times are recorded per word, through self-paced reading. Here, we report on the construction of the dataset, the measured acceptability ratings and reading times, as well as the extent to which a variety of language models can be used to predict both the ground-truth grammaticality and human acceptability ratings.
%R 10.1162/coli_a_00559
%U https://aclanthology.org/2025.cl-4.6/
%U https://doi.org/10.1162/coli_a_00559
%P 1267-1301
Markdown (Informal)
[BLiMP-NL: A Corpus of Dutch Minimal Pairs and Acceptability Judgments for Language Model Evaluation](https://aclanthology.org/2025.cl-4.6/) (Suijkerbuijk et al., CL 2025)
ACL