% Journal article record for the Holmes benchmark paper (TACL, volume 12, 2024).
%
% The title contains the symbol macro \recorder (the glyph U+2315). It is not a
% LaTeX kernel command: load the `wasysym` package in the document to typeset
% the glyph. The preamble declaration below only installs an empty fallback via
% \providecommand, so the bibliography still compiles when that package is
% absent; it never overrides an existing definition.
%
% {Holmes} is brace-protected so styles that recase titles keep the proper noun.
@preamble{ "\providecommand{\recorder}{}" }

@article{waldis-etal-2024-holmes,
  author    = {Waldis, Andreas and
               Perlitz, Yotam and
               Choshen, Leshem and
               Hou, Yufang and
               Gurevych, Iryna},
  title     = {{Holmes} {\ensuremath{\recorder}} A Benchmark to Assess the Linguistic Competence of Language Models},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {12},
  pages     = {1616--1647},
  year      = {2024},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/tacl_a_00718},
  url       = {https://aclanthology.org/2024.tacl-1.88/},
  abstract  = {We introduce Holmes, a new benchmark designed to assess language models' (LMs') linguistic competence{---}their unconscious understanding of linguistic phenomena. Specifically, we use classifier-based probing to examine LMs' internal representations regarding distinct linguistic phenomena (e.g., part-of-speech tagging). As a result, we meet recent calls to disentangle LMs' linguistic competence from other cognitive abilities, such as following instructions in prompting-based evaluations. Composing Holmes, we review over 270 probing studies and include more than 200 datasets to assess syntax, morphology, semantics, reasoning, and discourse phenomena. Analyzing over 50 LMs reveals that, aligned with known trends, their linguistic competence correlates with model size. However, surprisingly, model architecture and instruction tuning also significantly influence performance, particularly in morphology and syntax. Finally, we propose FlashHolmes, a streamlined version that reduces the computation load while maintaining high-ranking precision.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="waldis-etal-2024-holmes">
<titleInfo>
<title>Holmes ⌕ A Benchmark to Assess the Linguistic Competence of Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Waldis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotam</namePart>
<namePart type="family">Perlitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leshem</namePart>
<namePart type="family">Choshen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yufang</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iryna</namePart>
<namePart type="family">Gurevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>We introduce Holmes, a new benchmark designed to assess language models’ (LMs’) linguistic competence—their unconscious understanding of linguistic phenomena. Specifically, we use classifier-based probing to examine LMs’ internal representations regarding distinct linguistic phenomena (e.g., part-of-speech tagging). As a result, we meet recent calls to disentangle LMs’ linguistic competence from other cognitive abilities, such as following instructions in prompting-based evaluations. Composing Holmes, we review over 270 probing studies and include more than 200 datasets to assess syntax, morphology, semantics, reasoning, and discourse phenomena. Analyzing over 50 LMs reveals that, aligned with known trends, their linguistic competence correlates with model size. However, surprisingly, model architecture and instruction tuning also significantly influence performance, particularly in morphology and syntax. Finally, we propose FlashHolmes, a streamlined version that reduces the computation load while maintaining high-ranking precision.</abstract>
<identifier type="citekey">waldis-etal-2024-holmes</identifier>
<identifier type="doi">10.1162/tacl_a_00718</identifier>
<location>
<url>https://aclanthology.org/2024.tacl-1.88/</url>
</location>
<part>
<date>2024</date>
<detail type="volume"><number>12</number></detail>
<extent unit="page">
<start>1616</start>
<end>1647</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Holmes ⌕ A Benchmark to Assess the Linguistic Competence of Language Models
%A Waldis, Andreas
%A Perlitz, Yotam
%A Choshen, Leshem
%A Hou, Yufang
%A Gurevych, Iryna
%J Transactions of the Association for Computational Linguistics
%D 2024
%V 12
%I MIT Press
%C Cambridge, MA
%F waldis-etal-2024-holmes
%X We introduce Holmes, a new benchmark designed to assess language models’ (LMs’) linguistic competence—their unconscious understanding of linguistic phenomena. Specifically, we use classifier-based probing to examine LMs’ internal representations regarding distinct linguistic phenomena (e.g., part-of-speech tagging). As a result, we meet recent calls to disentangle LMs’ linguistic competence from other cognitive abilities, such as following instructions in prompting-based evaluations. Composing Holmes, we review over 270 probing studies and include more than 200 datasets to assess syntax, morphology, semantics, reasoning, and discourse phenomena. Analyzing over 50 LMs reveals that, aligned with known trends, their linguistic competence correlates with model size. However, surprisingly, model architecture and instruction tuning also significantly influence performance, particularly in morphology and syntax. Finally, we propose FlashHolmes, a streamlined version that reduces the computation load while maintaining high-ranking precision.
%R 10.1162/tacl_a_00718
%U https://aclanthology.org/2024.tacl-1.88/
%U https://doi.org/10.1162/tacl_a_00718
%P 1616-1647
Markdown (Informal)
[Holmes ⌕ A Benchmark to Assess the Linguistic Competence of Language Models](https://aclanthology.org/2024.tacl-1.88/) (Waldis et al., TACL 2024)
ACL