@inproceedings{toney-2024-kind,
title = "What Kind of Sourcery is This? Evaluating {GPT}-4{'}s Performance on Linking Scientific Fact to Citations",
author = "Toney, Autumn",
editor = "Kumar, Sachin and
Balachandran, Vidhisha and
Park, Chan Young and
Shi, Weijia and
Hayati, Shirley Anugrah and
Tsvetkov, Yulia and
Smith, Noah and
Hajishirzi, Hannaneh and
Kang, Dongyeop and
Jurgens, David",
booktitle = "Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U)",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.customnlp4u-1.19",
pages = "257--268",
abstract = "From document summarization to code generation, chabots have disrupted various aspects of scientific research and writing. While chabots are useful research resources for ideation, information retrieval, and editing, their generative pre-trained transformer (GPT) models{'} underlying knowledge infrastructure is opaque. This has raised questions about the reliability of generative chatbot responses, as GPT models are known to respond with misleading information that appears to be accurate. Prior research has investigated the utility of OpenAI{'}s public chatbot, ChatGPT, to generate reliable bibliographic information with a focus on small-scale medical-related scientific facts. We present an expanded study that analyzes GPT-4{'}s ability to accurately identify 1,326 scientific facts and link them to academic sources. Using both the API and UI service, we experimented with open-ended and close-ended prompts to establish an understanding of GPT-4{'}s general ability at this domain-specific task, as well as study the real-world scenario of an average user interacting with ChatGPT using its UI. GPT-4 accurately identified 96{\%} of the scientific facts and generated relevant and existent academic citations with 78{\%} accuracy. Using the claims that GPT-4 mislabeled and provided incorrect sources via the API, we prompt two public GPTs customized for academic writing to evaluate if they correctly label the scientific claims and provide accurate sources. We find that these GPTs are able to accurately label 38{\%} of the mislabeled claims, with 95{\%} of the corresponding citations being accurate and relevant.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="toney-2024-kind">
<titleInfo>
<title>What Kind of Sourcery is This? Evaluating GPT-4’s Performance on Linking Scientific Fact to Citations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Autumn</namePart>
<namePart type="family">Toney</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sachin</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vidhisha</namePart>
<namePart type="family">Balachandran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chan</namePart>
<namePart type="given">Young</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weijia</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shirley</namePart>
<namePart type="given">Anugrah</namePart>
<namePart type="family">Hayati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulia</namePart>
<namePart type="family">Tsvetkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noah</namePart>
<namePart type="family">Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannaneh</namePart>
<namePart type="family">Hajishirzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongyeop</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>From document summarization to code generation, chatbots have disrupted various aspects of scientific research and writing. While chatbots are useful research resources for ideation, information retrieval, and editing, their generative pre-trained transformer (GPT) models’ underlying knowledge infrastructure is opaque. This has raised questions about the reliability of generative chatbot responses, as GPT models are known to respond with misleading information that appears to be accurate. Prior research has investigated the utility of OpenAI’s public chatbot, ChatGPT, to generate reliable bibliographic information, with a focus on small-scale medical-related scientific facts. We present an expanded study that analyzes GPT-4’s ability to accurately identify 1,326 scientific facts and link them to academic sources. Using both the API and UI service, we experimented with open-ended and closed-ended prompts to establish an understanding of GPT-4’s general ability at this domain-specific task, as well as to study the real-world scenario of an average user interacting with ChatGPT through its UI. GPT-4 accurately identified 96% of the scientific facts and generated relevant and existent academic citations with 78% accuracy. Using the claims that GPT-4 mislabeled and for which it provided incorrect sources via the API, we prompted two public GPTs customized for academic writing to evaluate whether they correctly label the scientific claims and provide accurate sources. We find that these GPTs accurately label 38% of the mislabeled claims, with 95% of the corresponding citations being accurate and relevant.</abstract>
<identifier type="citekey">toney-2024-kind</identifier>
<location>
<url>https://aclanthology.org/2024.customnlp4u-1.19</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>257</start>
<end>268</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T What Kind of Sourcery is This? Evaluating GPT-4’s Performance on Linking Scientific Fact to Citations
%A Toney, Autumn
%Y Kumar, Sachin
%Y Balachandran, Vidhisha
%Y Park, Chan Young
%Y Shi, Weijia
%Y Hayati, Shirley Anugrah
%Y Tsvetkov, Yulia
%Y Smith, Noah
%Y Hajishirzi, Hannaneh
%Y Kang, Dongyeop
%Y Jurgens, David
%S Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U)
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F toney-2024-kind
%X From document summarization to code generation, chatbots have disrupted various aspects of scientific research and writing. While chatbots are useful research resources for ideation, information retrieval, and editing, their generative pre-trained transformer (GPT) models’ underlying knowledge infrastructure is opaque. This has raised questions about the reliability of generative chatbot responses, as GPT models are known to respond with misleading information that appears to be accurate. Prior research has investigated the utility of OpenAI’s public chatbot, ChatGPT, to generate reliable bibliographic information, with a focus on small-scale medical-related scientific facts. We present an expanded study that analyzes GPT-4’s ability to accurately identify 1,326 scientific facts and link them to academic sources. Using both the API and UI service, we experimented with open-ended and closed-ended prompts to establish an understanding of GPT-4’s general ability at this domain-specific task, as well as to study the real-world scenario of an average user interacting with ChatGPT through its UI. GPT-4 accurately identified 96% of the scientific facts and generated relevant and existent academic citations with 78% accuracy. Using the claims that GPT-4 mislabeled and for which it provided incorrect sources via the API, we prompted two public GPTs customized for academic writing to evaluate whether they correctly label the scientific claims and provide accurate sources. We find that these GPTs accurately label 38% of the mislabeled claims, with 95% of the corresponding citations being accurate and relevant.
%U https://aclanthology.org/2024.customnlp4u-1.19
%P 257-268
Markdown (Informal)
[What Kind of Sourcery is This? Evaluating GPT-4’s Performance on Linking Scientific Fact to Citations](https://aclanthology.org/2024.customnlp4u-1.19) (Toney, CustomNLP4U 2024)
ACL
Autumn Toney. 2024. [What Kind of Sourcery is This? Evaluating GPT-4’s Performance on Linking Scientific Fact to Citations](https://aclanthology.org/2024.customnlp4u-1.19). In Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U), pages 257–268, Miami, Florida, USA. Association for Computational Linguistics.