@inproceedings{ramprasad-etal-2024-analyzing,
title = "Analyzing {LLM} Behavior in Dialogue Summarization: Unveiling Circumstantial Hallucination Trends",
author = "Ramprasad, Sanjana and
Ferracane, Elisa and
Lipton, Zachary",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.acl-long.677",
doi = "10.18653/v1/2024.acl-long.677",
pages = "12549--12561",
abstract = "Recent advancements in large language models (LLMs) have significantly advanced the capabilities of summarization systems.However, they continue to face a persistent challenge: hallucination. While prior work has extensively examined LLMs in news domains, evaluation of dialogue summarization has primarily focused on BART-based models, resulting in a notable gap in understanding LLM effectiveness.Our work seeks to address this gap by benchmarking LLMs for dialogue summarization faithfulness using human annotations,focusing on identifying and categorizing span-level inconsistencies.Specifically, we evaluate two prominent LLMs: GPT-4 and Alpaca-13B.Our evaluation reveals that LLMs often generate plausible, but not fully supported inferences based on conversation contextual cues, a trait absent in older models. As a result, we propose a refined taxonomy of errors, introducing a novel category termed {``}Contextual Inference{''} to address this aspect of LLM behavior. Using our taxonomy, we compare the behavioral differences between LLMs and older fine-tuned models. Additionally, we systematically assess the efficacy of automatic error detection methods on LLM summaries and find that they struggle to detect these nuanced errors effectively. To address this, we introduce two prompt-based approaches for fine-grained error detection. Our methods outperform existing metrics, particularly in identifying the novel {``}Contextual Inference{''} error type.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ramprasad-etal-2024-analyzing">
<titleInfo>
<title>Analyzing LLM Behavior in Dialogue Summarization: Unveiling Circumstantial Hallucination Trends</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sanjana</namePart>
<namePart type="family">Ramprasad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elisa</namePart>
<namePart type="family">Ferracane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zachary</namePart>
<namePart type="family">Lipton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent advancements in large language models (LLMs) have significantly advanced the capabilities of summarization systems.However, they continue to face a persistent challenge: hallucination. While prior work has extensively examined LLMs in news domains, evaluation of dialogue summarization has primarily focused on BART-based models, resulting in a notable gap in understanding LLM effectiveness.Our work seeks to address this gap by benchmarking LLMs for dialogue summarization faithfulness using human annotations,focusing on identifying and categorizing span-level inconsistencies.Specifically, we evaluate two prominent LLMs: GPT-4 and Alpaca-13B.Our evaluation reveals that LLMs often generate plausible, but not fully supported inferences based on conversation contextual cues, a trait absent in older models. As a result, we propose a refined taxonomy of errors, introducing a novel category termed “Contextual Inference” to address this aspect of LLM behavior. Using our taxonomy, we compare the behavioral differences between LLMs and older fine-tuned models. Additionally, we systematically assess the efficacy of automatic error detection methods on LLM summaries and find that they struggle to detect these nuanced errors effectively. To address this, we introduce two prompt-based approaches for fine-grained error detection. Our methods outperform existing metrics, particularly in identifying the novel “Contextual Inference” error type.</abstract>
<identifier type="citekey">ramprasad-etal-2024-analyzing</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.677</identifier>
<location>
<url>https://aclanthology.org/2024.acl-long.677</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>12549</start>
<end>12561</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Analyzing LLM Behavior in Dialogue Summarization: Unveiling Circumstantial Hallucination Trends
%A Ramprasad, Sanjana
%A Ferracane, Elisa
%A Lipton, Zachary
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F ramprasad-etal-2024-analyzing
%X Recent advancements in large language models (LLMs) have significantly advanced the capabilities of summarization systems.However, they continue to face a persistent challenge: hallucination. While prior work has extensively examined LLMs in news domains, evaluation of dialogue summarization has primarily focused on BART-based models, resulting in a notable gap in understanding LLM effectiveness.Our work seeks to address this gap by benchmarking LLMs for dialogue summarization faithfulness using human annotations,focusing on identifying and categorizing span-level inconsistencies.Specifically, we evaluate two prominent LLMs: GPT-4 and Alpaca-13B.Our evaluation reveals that LLMs often generate plausible, but not fully supported inferences based on conversation contextual cues, a trait absent in older models. As a result, we propose a refined taxonomy of errors, introducing a novel category termed “Contextual Inference” to address this aspect of LLM behavior. Using our taxonomy, we compare the behavioral differences between LLMs and older fine-tuned models. Additionally, we systematically assess the efficacy of automatic error detection methods on LLM summaries and find that they struggle to detect these nuanced errors effectively. To address this, we introduce two prompt-based approaches for fine-grained error detection. Our methods outperform existing metrics, particularly in identifying the novel “Contextual Inference” error type.
%R 10.18653/v1/2024.acl-long.677
%U https://aclanthology.org/2024.acl-long.677
%U https://doi.org/10.18653/v1/2024.acl-long.677
%P 12549-12561
Markdown (Informal)
[Analyzing LLM Behavior in Dialogue Summarization: Unveiling Circumstantial Hallucination Trends](https://aclanthology.org/2024.acl-long.677) (Ramprasad et al., ACL 2024)
ACL