@inproceedings{ray-choudhury-kalra-2023-implications,
title = "Implications of Annotation Artifacts in Edge Probing Test Datasets",
author = "Ray Choudhury, Sagnik and
Kalra, Jushaan",
editor = "Jiang, Jing and
Reitter, David and
Deng, Shumin",
booktitle = "Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.conll-1.39",
doi = "10.18653/v1/2023.conll-1.39",
pages = "575--586",
abstract = "Edge probing tests are classification tasks that test for grammatical knowledge encoded in token representations coming from contextual encoders such as large language models (LLMs). Many LLM encoders have shown high performance in EP tests, leading to conjectures about their ability to encode linguistic knowledge. However, a large body of research claims that the tests necessarily do not measure the LLM{'}s capacity to encode knowledge, but rather reflect the classifiers{'} ability to learn the problem. Much of this criticism stems from the fact that often the classifiers have very similar accuracy when an LLM vs a random encoder is used. Consequently, several modifications to the tests have been suggested, including information theoretic probes. We show that commonly used edge probing test datasets have various biases including memorization. When these biases are removed, the LLM encoders do show a significant difference from the random ones, even with the simple non-information theoretic probes.",
}
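The setup the abstract critiques is easy to state in code. The sketch below is our own minimal illustration, not the authors' pipeline: it assumes HuggingFace `transformers` and PyTorch, uses `bert-base-uncased` and a hypothetical two-example span-labeling dataset as placeholders, trains a linear probe on frozen token representations from a pretrained encoder and from a randomly initialized copy of the same architecture, and compares the two accuracies.

```python
# Minimal edge probing sketch (our illustration, not the paper's code).
# Assumes HuggingFace `transformers` and PyTorch; the model name and the
# two-example dataset below are hypothetical placeholders.
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel, AutoTokenizer

MODEL = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
pretrained = AutoModel.from_pretrained(MODEL).eval()         # trained weights
random_init = AutoModel.from_config(
    AutoConfig.from_pretrained(MODEL)).eval()                # same arch, random weights

# Toy span-labeling data: (sentence, (start, end) word span, label id).
# Real edge probing suites use tasks like NER or semantic role labeling.
DATA = [("the cat sat on the mat", (1, 2), 0),
        ("she quickly ran home", (1, 2), 1)]

def span_features(encoder, sentence, span):
    """Mean-pool frozen token representations over a span (word-piece
    alignment is simplified: each word is assumed to be one piece)."""
    batch = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():                                    # encoder stays frozen
        hidden = encoder(**batch).last_hidden_state[0]       # (seq_len, dim)
    start, end = span
    return hidden[start + 1:end + 1].mean(dim=0)             # +1 skips [CLS]

def probe_accuracy(encoder, epochs=50):
    """Fit a linear probe on the frozen features; return its accuracy."""
    xs = torch.stack([span_features(encoder, s, sp) for s, sp, _ in DATA])
    ys = torch.tensor([y for _, _, y in DATA])
    probe = nn.Linear(xs.shape[-1], 2)
    opt = torch.optim.Adam(probe.parameters(), lr=1e-2)
    for _ in range(epochs):
        opt.zero_grad()
        nn.functional.cross_entropy(probe(xs), ys).backward()
        opt.step()
    return (probe(xs).argmax(-1) == ys).float().mean().item()

print("pretrained probe accuracy:", probe_accuracy(pretrained))
print("random-init probe accuracy:", probe_accuracy(random_init))
```

On toy data like this both probes fit perfectly, which is precisely the confound the abstract describes: high probe accuracy can reflect what the classifier learns or memorizes rather than what the encoder represents. Per the paper, the pretrained-vs-random gap becomes significant once dataset biases such as memorization are removed from the test sets.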
Markdown (Informal)
[Implications of Annotation Artifacts in Edge Probing Test Datasets](https://aclanthology.org/2023.conll-1.39) (Ray Choudhury & Kalra, CoNLL 2023)
ACL
Sagnik Ray Choudhury and Jushaan Kalra. 2023. [Implications of Annotation Artifacts in Edge Probing Test Datasets](https://aclanthology.org/2023.conll-1.39). In *Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)*, pages 575–586, Singapore. Association for Computational Linguistics.