@inproceedings{someya-oseki-2023-jblimp,
title = "{JBL}i{MP}: {J}apanese Benchmark of Linguistic Minimal Pairs",
author = "Someya, Taiga and
Oseki, Yohei",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-eacl.117",
doi = "10.18653/v1/2023.findings-eacl.117",
pages = "1581--1594",
abstract = "In this paper, we introduce JBLiMP (Japanese Benchmark of Linguistic Minimal Pairs), a novel dataset for targeted syntactic evaluations of language models in Japanese. JBLiMP consists of 331 minimal pairs, which are created based on acceptability judgments extracted from journal articles in theoretical linguistics. These minimal pairs are grouped into 11 categories, each covering a different linguistic phenomenon. JBLiMP is unique in that it successfully combines two important features independently observed in existing datasets: (i) coverage of complex linguistic phenomena (cf. CoLA) and (ii) presentation of sentences as minimal pairs (cf. BLiMP). In addition, JBLiMP is the first dataset for targeted syntactic evaluations of language models in Japanese, thus allowing the comparison of syntactic knowledge of language models across different languages. We then evaluate the syntactic knowledge of several language models on JBLiMP: GPT-2, LSTM, and n-gram language models. The results demonstrated that all the architectures achieved comparable overall accuracies around 75{\%}. Error analyses by linguistic phenomenon further revealed that these language models successfully captured local dependencies like nominal structures, but not long-distance dependencies such as verbal agreement and binding.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="someya-oseki-2023-jblimp">
<titleInfo>
<title>JBLiMP: Japanese Benchmark of Linguistic Minimal Pairs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Taiga</namePart>
<namePart type="family">Someya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yohei</namePart>
<namePart type="family">Oseki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Vlachos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabelle</namePart>
<namePart type="family">Augenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we introduce JBLiMP (Japanese Benchmark of Linguistic Minimal Pairs), a novel dataset for targeted syntactic evaluations of language models in Japanese. JBLiMP consists of 331 minimal pairs, which are created based on acceptability judgments extracted from journal articles in theoretical linguistics. These minimal pairs are grouped into 11 categories, each covering a different linguistic phenomenon. JBLiMP is unique in that it successfully combines two important features independently observed in existing datasets: (i) coverage of complex linguistic phenomena (cf. CoLA) and (ii) presentation of sentences as minimal pairs (cf. BLiMP). In addition, JBLiMP is the first dataset for targeted syntactic evaluations of language models in Japanese, thus allowing the comparison of syntactic knowledge of language models across different languages. We then evaluate the syntactic knowledge of several language models on JBLiMP: GPT-2, LSTM, and n-gram language models. The results demonstrated that all the architectures achieved comparable overall accuracies around 75%. Error analyses by linguistic phenomenon further revealed that these language models successfully captured local dependencies like nominal structures, but not long-distance dependencies such as verbal agreement and binding.</abstract>
<identifier type="citekey">someya-oseki-2023-jblimp</identifier>
<identifier type="doi">10.18653/v1/2023.findings-eacl.117</identifier>
<location>
<url>https://aclanthology.org/2023.findings-eacl.117</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>1581</start>
<end>1594</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T JBLiMP: Japanese Benchmark of Linguistic Minimal Pairs
%A Someya, Taiga
%A Oseki, Yohei
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F someya-oseki-2023-jblimp
%X In this paper, we introduce JBLiMP (Japanese Benchmark of Linguistic Minimal Pairs), a novel dataset for targeted syntactic evaluations of language models in Japanese. JBLiMP consists of 331 minimal pairs, which are created based on acceptability judgments extracted from journal articles in theoretical linguistics. These minimal pairs are grouped into 11 categories, each covering a different linguistic phenomenon. JBLiMP is unique in that it successfully combines two important features independently observed in existing datasets: (i) coverage of complex linguistic phenomena (cf. CoLA) and (ii) presentation of sentences as minimal pairs (cf. BLiMP). In addition, JBLiMP is the first dataset for targeted syntactic evaluations of language models in Japanese, thus allowing the comparison of syntactic knowledge of language models across different languages. We then evaluate the syntactic knowledge of several language models on JBLiMP: GPT-2, LSTM, and n-gram language models. The results demonstrated that all the architectures achieved comparable overall accuracies around 75%. Error analyses by linguistic phenomenon further revealed that these language models successfully captured local dependencies like nominal structures, but not long-distance dependencies such as verbal agreement and binding.
%R 10.18653/v1/2023.findings-eacl.117
%U https://aclanthology.org/2023.findings-eacl.117
%U https://doi.org/10.18653/v1/2023.findings-eacl.117
%P 1581-1594
Markdown (Informal)
[JBLiMP: Japanese Benchmark of Linguistic Minimal Pairs](https://aclanthology.org/2023.findings-eacl.117) (Someya & Oseki, Findings 2023)
ACL