@inproceedings{ram-2025-citation,
title = "Citation Drift: Measuring Reference Stability in Multi-Turn {LLM} Conversations",
author = "Ram, Gokul Srinath Seetha",
editor = "Accomazzi, Alberto and
Ghosal, Tirthankar and
Grezes, Felix and
Lockhart, Kelly",
booktitle = "Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications",
month = dec,
year = "2025",
address = "Mumbai, India and virtual",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wasp-main.20/",
pages = "186--191",
ISBN = "979-8-89176-310-4",
abstract = "Large Language Models (LLMs) are increasingly used for scientific writing and research assistance, yet their ability to maintain consistent citations across multi-turn conversations remains unexplored. This paper introduces the concept of citation drift{---}the phenomenon where references mutate, disappear, or get fabricated during extended LLM interactions. We analyze 240 conversations across four LLaMA models using 36 authentic scientific papers from six domains and find significant citation instability. LLaMA-4-Maverick-17B achieves the highest stability (0.481) and lowest fabrication entropy, while LLaMA-4-Scout-17B fabricates up to 85.6{\%} of citations. We introduce five new metrics{---}stability, fabrication rate, drift rate, drift entropy, and willingness-to-cite{---}providing a standardized framework for evaluating factual reliability in scientific dialogue systems. Our benchmark offers reproducible, model-agnostic evaluation tools for assessing citation reliability in AI-assisted research workflows."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ram-2025-citation">
    <titleInfo>
      <title>Citation Drift: Measuring Reference Stability in Multi-Turn LLM Conversations</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Gokul</namePart>
      <namePart type="given">Srinath</namePart>
      <namePart type="given">Seetha</namePart>
      <namePart type="family">Ram</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Alberto</namePart>
        <namePart type="family">Accomazzi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tirthankar</namePart>
        <namePart type="family">Ghosal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Felix</namePart>
        <namePart type="family">Grezes</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kelly</namePart>
        <namePart type="family">Lockhart</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Mumbai, India and virtual</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-310-4</identifier>
    </relatedItem>
    <abstract>Large Language Models (LLMs) are increasingly used for scientific writing and research assistance, yet their ability to maintain consistent citations across multi-turn conversations remains unexplored. This paper introduces the concept of citation drift—the phenomenon where references mutate, disappear, or get fabricated during extended LLM interactions. We analyze 240 conversations across four LLaMA models using 36 authentic scientific papers from six domains and find significant citation instability. LLaMA-4-Maverick-17B achieves the highest stability (0.481) and lowest fabrication entropy, while LLaMA-4-Scout-17B fabricates up to 85.6% of citations. We introduce five new metrics—stability, fabrication rate, drift rate, drift entropy, and willingness-to-cite—providing a standardized framework for evaluating factual reliability in scientific dialogue systems. Our benchmark offers reproducible, model-agnostic evaluation tools for assessing citation reliability in AI-assisted research workflows.</abstract>
    <identifier type="citekey">ram-2025-citation</identifier>
    <location>
      <url>https://aclanthology.org/2025.wasp-main.20/</url>
    </location>
    <part>
      <date>2025-12</date>
      <extent unit="page">
        <start>186</start>
        <end>191</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Citation Drift: Measuring Reference Stability in Multi-Turn LLM Conversations
%A Ram, Gokul Srinath Seetha
%Y Accomazzi, Alberto
%Y Ghosal, Tirthankar
%Y Grezes, Felix
%Y Lockhart, Kelly
%S Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India and virtual
%@ 979-8-89176-310-4
%F ram-2025-citation
%X Large Language Models (LLMs) are increasingly used for scientific writing and research assistance, yet their ability to maintain consistent citations across multi-turn conversations remains unexplored. This paper introduces the concept of citation drift—the phenomenon where references mutate, disappear, or get fabricated during extended LLM interactions. We analyze 240 conversations across four LLaMA models using 36 authentic scientific papers from six domains and find significant citation instability. LLaMA-4-Maverick-17B achieves the highest stability (0.481) and lowest fabrication entropy, while LLaMA-4-Scout-17B fabricates up to 85.6% of citations. We introduce five new metrics—stability, fabrication rate, drift rate, drift entropy, and willingness-to-cite—providing a standardized framework for evaluating factual reliability in scientific dialogue systems. Our benchmark offers reproducible, model-agnostic evaluation tools for assessing citation reliability in AI-assisted research workflows.
%U https://aclanthology.org/2025.wasp-main.20/
%P 186-191
Markdown (Informal)
[Citation Drift: Measuring Reference Stability in Multi-Turn LLM Conversations](https://aclanthology.org/2025.wasp-main.20/) (Ram, WASP 2025)
ACL
Gokul Srinath Seetha Ram. 2025. Citation Drift: Measuring Reference Stability in Multi-Turn LLM Conversations. In Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications, pages 186–191, Mumbai, India and virtual. Association for Computational Linguistics.