@inproceedings{liu-reiter-2026-linguistically,
title = "Linguistically-Informed Evaluation of {LLM}s on Acceptability Judgments in a Forced-Choice Paradigm",
author = "Liu, Ziyue and
Reiter, Nils",
editor = "T.Y.S.S., Santosh and
Rodriguez, Juan Diego and
de Gibert, Ona",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-srw.103/",
pages = "1177--1189",
ISBN = "979-8-89176-393-7",
abstract = "Evaluating the grammatical abilities of large language models (LLMs) is important for both NLP and linguistic theory. We investigate the ability of large language models (LLMs) to perform acceptability judgments in a forced-choice paradigm. We evaluate a subset of LLMs on 150 minimal sentence pairs sampled from Linguistic Inquiry and categorized using BLiMP linguistic phenomena. Our results show that while LLMs approximate human judgments, performance varies across models and phenomenon types, with stronger alignment on morphosyntactic phenomena than on linguistically and semantically demanding phenomena. Prompting strategies have minimal impact."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-reiter-2026-linguistically">
<titleInfo>
<title>Linguistically-Informed Evaluation of LLMs on Acceptability Judgments in a Forced-Choice Paradigm</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ziyue</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nils</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Santosh</namePart>
<namePart type="family">T.Y.S.S.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Diego</namePart>
<namePart type="family">Rodriguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ona</namePart>
<namePart type="family">de Gibert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-393-7</identifier>
</relatedItem>
<abstract>Evaluating the grammatical abilities of large language models (LLMs) is important for both NLP and linguistic theory. We investigate the ability of large language models (LLMs) to perform acceptability judgments in a forced-choice paradigm. We evaluate a subset of LLMs on 150 minimal sentence pairs sampled from Linguistic Inquiry and categorized using BLiMP linguistic phenomena. Our results show that while LLMs approximate human judgments, performance varies across models and phenomenon types, with stronger alignment on morphosyntactic phenomena than on linguistically and semantically demanding phenomena. Prompting strategies have minimal impact.</abstract>
<identifier type="citekey">liu-reiter-2026-linguistically</identifier>
<location>
<url>https://aclanthology.org/2026.acl-srw.103/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1177</start>
<end>1189</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Linguistically-Informed Evaluation of LLMs on Acceptability Judgments in a Forced-Choice Paradigm
%A Liu, Ziyue
%A Reiter, Nils
%Y T.Y.S.S., Santosh
%Y Rodriguez, Juan Diego
%Y de Gibert, Ona
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-393-7
%F liu-reiter-2026-linguistically
%X Evaluating the grammatical abilities of large language models (LLMs) is important for both NLP and linguistic theory. We investigate the ability of large language models (LLMs) to perform acceptability judgments in a forced-choice paradigm. We evaluate a subset of LLMs on 150 minimal sentence pairs sampled from Linguistic Inquiry and categorized using BLiMP linguistic phenomena. Our results show that while LLMs approximate human judgments, performance varies across models and phenomenon types, with stronger alignment on morphosyntactic phenomena than on linguistically and semantically demanding phenomena. Prompting strategies have minimal impact.
%U https://aclanthology.org/2026.acl-srw.103/
%P 1177-1189
Markdown (Informal)
[Linguistically-Informed Evaluation of LLMs on Acceptability Judgments in a Forced-Choice Paradigm](https://aclanthology.org/2026.acl-srw.103/) (Liu & Reiter, ACL 2026)
ACL