@inproceedings{mcknight-fyshe-2024-characterizing,
title = "Characterizing Human and Zero-Shot {GPT}-3.5 Object-Similarity Judgments",
author = "McKnight, D. and
Fyshe, Alona",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.242",
doi = "10.18653/v1/2024.findings-naacl.242",
pages = "3810--3828",
abstract = "Recent advancements in large language models{'} (LLMs) capabilities have yielded few-shot, human-comparable performance on a range of tasks. At the same time, researchers expend significant effort and resources gathering human annotations. At some point, LLMs may be able to perform some simple annotation tasks, but studies of LLM annotation accuracy and behavior are sparse. In this paper, we characterize OpenAI{'}s GPT-3.5{'}s judgment on a behavioral task for implicit object categorization. We characterize the embedding spaces of models trained on human vs. GPT responses and give similarities and differences between them, finding many similar dimensions. We also find that despite these similar dimensions, augmenting humans{'} responses with GPT ones drives model divergence across the sizes of datasets tested.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mcknight-fyshe-2024-characterizing">
<titleInfo>
<title>Characterizing Human and Zero-Shot GPT-3.5 Object-Similarity Judgments</title>
</titleInfo>
<name type="personal">
<namePart type="given">D.</namePart>
<namePart type="family">McKnight</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alona</namePart>
<namePart type="family">Fyshe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent advancements in large language models’ (LLMs) capabilities have yielded few-shot, human-comparable performance on a range of tasks. At the same time, researchers expend significant effort and resources gathering human annotations. At some point, LLMs may be able to perform some simple annotation tasks, but studies of LLM annotation accuracy and behavior are sparse. In this paper, we characterize OpenAI’s GPT-3.5’s judgment on a behavioral task for implicit object categorization. We characterize the embedding spaces of models trained on human vs. GPT responses and give similarities and differences between them, finding many similar dimensions. We also find that despite these similar dimensions, augmenting humans’ responses with GPT ones drives model divergence across the sizes of datasets tested.</abstract>
<identifier type="citekey">mcknight-fyshe-2024-characterizing</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.242</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.242</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>3810</start>
<end>3828</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Characterizing Human and Zero-Shot GPT-3.5 Object-Similarity Judgments
%A McKnight, D.
%A Fyshe, Alona
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F mcknight-fyshe-2024-characterizing
%X Recent advancements in large language models’ (LLMs) capabilities have yielded few-shot, human-comparable performance on a range of tasks. At the same time, researchers expend significant effort and resources gathering human annotations. At some point, LLMs may be able to perform some simple annotation tasks, but studies of LLM annotation accuracy and behavior are sparse. In this paper, we characterize OpenAI’s GPT-3.5’s judgment on a behavioral task for implicit object categorization. We characterize the embedding spaces of models trained on human vs. GPT responses and give similarities and differences between them, finding many similar dimensions. We also find that despite these similar dimensions, augmenting humans’ responses with GPT ones drives model divergence across the sizes of datasets tested.
%R 10.18653/v1/2024.findings-naacl.242
%U https://aclanthology.org/2024.findings-naacl.242
%U https://doi.org/10.18653/v1/2024.findings-naacl.242
%P 3810-3828
Markdown (Informal)
[Characterizing Human and Zero-Shot GPT-3.5 Object-Similarity Judgments](https://aclanthology.org/2024.findings-naacl.242) (McKnight & Fyshe, Findings 2024)
ACL