% NAACL 2025 long paper; ACL Anthology ID 2025.naacl-long.380.
% "LLMs" is brace-protected as a whole word so styles that re-case titles keep it intact.
@inproceedings{ide-etal-2025-make,
  title     = {How to Make the Most of {LLMs}' Grammatical Knowledge for Acceptability Judgments},
  author    = {Ide, Yusuke and
               Nishida, Yuto and
               Vasselli, Justin and
               Oba, Miyu and
               Sakai, Yusuke and
               Kamigaito, Hidetaka and
               Watanabe, Taro},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-long.380/},
  doi       = {10.18653/v1/2025.naacl-long.380},
  pages     = {7416--7432},
  isbn      = {979-8-89176-189-6},
  abstract  = {The grammatical knowledge of language models (LMs) is often measured using a benchmark of linguistic minimal pairs, where LMs are presented with a pair of acceptable and unacceptable sentences and required to judge which is more acceptable. Conventional approaches compare sentence probabilities directly, but large language models (LLMs) provide nuanced evaluation methods using prompts and templates. We therefore investigate how to derive the most accurate acceptability judgments from LLMs to comprehensively evaluate their grammatical knowledge. Through extensive experiments in both English and Chinese, we compare nine judgment methods and demonstrate that two of them, in-template LP (a probability readout method) and Yes/No probability computing (a prompting-based method), achieve higher accuracy than the conventional approach. Our analysis reveals that the top two methods excel in different linguistic phenomena, suggesting they access different aspects of the LLMs' grammatical knowledge. We find that ensembling the two methods achieves even higher accuracy. Consequently, we recommend these techniques, either individually or ensembled, as more effective alternatives to conventional approaches for assessing grammatical knowledge in LLMs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey ide-etal-2025-make. -->
  <mods ID="ide-etal-2025-make">
    <titleInfo>
      <title>How to Make the Most of LLMs’ Grammatical Knowledge for Acceptability Judgments</title>
    </titleInfo>
    <!-- Authors, in the order listed in the record. -->
    <name type="personal">
      <namePart type="given">Yusuke</namePart>
      <namePart type="family">Ide</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuto</namePart>
      <namePart type="family">Nishida</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Justin</namePart>
      <namePart type="family">Vasselli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Miyu</namePart>
      <namePart type="family">Oba</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yusuke</namePart>
      <namePart type="family">Sakai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hidetaka</namePart>
      <namePart type="family">Kamigaito</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Taro</namePart>
      <namePart type="family">Watanabe</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-189-6</identifier>
    </relatedItem>
    <abstract>The grammatical knowledge of language models (LMs) is often measured using a benchmark of linguistic minimal pairs, where LMs are presented with a pair of acceptable and unacceptable sentences and required to judge which is more acceptable. Conventional approaches compare sentence probabilities directly, but large language models (LLMs) provide nuanced evaluation methods using prompts and templates. We therefore investigate how to derive the most accurate acceptability judgments from LLMs to comprehensively evaluate their grammatical knowledge. Through extensive experiments in both English and Chinese, we compare nine judgment methods and demonstrate that two of them, in-template LP (a probability readout method) and Yes/No probability computing (a prompting-based method), achieve higher accuracy than the conventional approach. Our analysis reveals that the top two methods excel in different linguistic phenomena, suggesting they access different aspects of the LLMs’ grammatical knowledge. We find that ensembling the two methods achieves even higher accuracy. Consequently, we recommend these techniques, either individually or ensembled, as more effective alternatives to conventional approaches for assessing grammatical knowledge in LLMs.</abstract>
    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">ide-etal-2025-make</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-long.380</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-long.380/</url>
    </location>
    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>7416</start>
        <end>7432</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T How to Make the Most of LLMs’ Grammatical Knowledge for Acceptability Judgments
%A Ide, Yusuke
%A Nishida, Yuto
%A Vasselli, Justin
%A Oba, Miyu
%A Sakai, Yusuke
%A Kamigaito, Hidetaka
%A Watanabe, Taro
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F ide-etal-2025-make
%X The grammatical knowledge of language models (LMs) is often measured using a benchmark of linguistic minimal pairs, where LMs are presented with a pair of acceptable and unacceptable sentences and required to judge which is more acceptable. Conventional approaches compare sentence probabilities directly, but large language models (LLMs) provide nuanced evaluation methods using prompts and templates. We therefore investigate how to derive the most accurate acceptability judgments from LLMs to comprehensively evaluate their grammatical knowledge. Through extensive experiments in both English and Chinese, we compare nine judgment methods and demonstrate that two of them, in-template LP (a probability readout method) and Yes/No probability computing (a prompting-based method), achieve higher accuracy than the conventional approach. Our analysis reveals that the top two methods excel in different linguistic phenomena, suggesting they access different aspects of the LLMs’ grammatical knowledge. We find that ensembling the two methods achieves even higher accuracy. Consequently, we recommend these techniques, either individually or ensembled, as more effective alternatives to conventional approaches for assessing grammatical knowledge in LLMs.
%R 10.18653/v1/2025.naacl-long.380
%U https://aclanthology.org/2025.naacl-long.380/
%U https://doi.org/10.18653/v1/2025.naacl-long.380
%P 7416-7432
Markdown (Informal)
[How to Make the Most of LLMs’ Grammatical Knowledge for Acceptability Judgments](https://aclanthology.org/2025.naacl-long.380/) (Ide et al., NAACL 2025)
ACL
- Yusuke Ide, Yuto Nishida, Justin Vasselli, Miyu Oba, Yusuke Sakai, Hidetaka Kamigaito, and Taro Watanabe. 2025. How to Make the Most of LLMs’ Grammatical Knowledge for Acceptability Judgments. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 7416–7432, Albuquerque, New Mexico. Association for Computational Linguistics.