@inproceedings{tang-etal-2024-towards,
title = "Towards Benchmarking Situational Awareness of Large Language Models:Comprehensive Benchmark, Evaluation and Analysis",
author = "Tang, Guo and
Chu, Zheng and
Zheng, Wenxiang and
Liu, Ming and
Qin, Bing",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.464",
pages = "7904--7928",
abstract = "Situational awareness refers to the capacity to perceive and comprehend the present context and anticipate forthcoming events, which plays a critical role in aiding decision-making, anticipating potential issues, and adapting to dynamic circumstances. Nevertheless, the situational awareness capabilities of large language models have not yet been comprehensively assessed. To address this, we propose SA-Bench, a comprehensive benchmark that covers three tiers of situational awareness capabilities, covering environment perception, situation comprehension and future projection. SA-Bench provides a comprehensive evaluation to explore the situational awareness capabilities of LLMs. We conduct extensive experiments on advanced LLMs, including GPT-4, LLaMA3, Qwen1.5, among others. Our experimental results indicate that even SOTA LLMs still exhibit substantial capability gaps compared to humans. In addition, we thoroughly analysis and examine the challenges encountered by LLMs across various tasks, as well as emphasize the deficiencies they confront. We hope SA-Bench will foster research within the field of situational awareness.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tang-etal-2024-towards">
<titleInfo>
<title>Towards Benchmarking Situational Awareness of Large Language Models: Comprehensive Benchmark, Evaluation and Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guo</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenxiang</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bing</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Situational awareness refers to the capacity to perceive and comprehend the present context and anticipate forthcoming events, which plays a critical role in aiding decision-making, anticipating potential issues, and adapting to dynamic circumstances. Nevertheless, the situational awareness capabilities of large language models have not yet been comprehensively assessed. To address this, we propose SA-Bench, a comprehensive benchmark that covers three tiers of situational awareness capabilities: environment perception, situation comprehension, and future projection. SA-Bench provides a comprehensive evaluation to explore the situational awareness capabilities of LLMs. We conduct extensive experiments on advanced LLMs, including GPT-4, LLaMA3, and Qwen1.5, among others. Our experimental results indicate that even SOTA LLMs still exhibit substantial capability gaps compared to humans. In addition, we thoroughly analyze and examine the challenges encountered by LLMs across various tasks and emphasize the deficiencies they confront. We hope SA-Bench will foster research within the field of situational awareness.</abstract>
<identifier type="citekey">tang-etal-2024-towards</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.464</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>7904</start>
<end>7928</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Benchmarking Situational Awareness of Large Language Models: Comprehensive Benchmark, Evaluation and Analysis
%A Tang, Guo
%A Chu, Zheng
%A Zheng, Wenxiang
%A Liu, Ming
%A Qin, Bing
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F tang-etal-2024-towards
%X Situational awareness refers to the capacity to perceive and comprehend the present context and anticipate forthcoming events, which plays a critical role in aiding decision-making, anticipating potential issues, and adapting to dynamic circumstances. Nevertheless, the situational awareness capabilities of large language models have not yet been comprehensively assessed. To address this, we propose SA-Bench, a comprehensive benchmark that covers three tiers of situational awareness capabilities: environment perception, situation comprehension, and future projection. SA-Bench provides a comprehensive evaluation to explore the situational awareness capabilities of LLMs. We conduct extensive experiments on advanced LLMs, including GPT-4, LLaMA3, and Qwen1.5, among others. Our experimental results indicate that even SOTA LLMs still exhibit substantial capability gaps compared to humans. In addition, we thoroughly analyze and examine the challenges encountered by LLMs across various tasks and emphasize the deficiencies they confront. We hope SA-Bench will foster research within the field of situational awareness.
%U https://aclanthology.org/2024.findings-emnlp.464
%P 7904-7928
Markdown (Informal)
[Towards Benchmarking Situational Awareness of Large Language Models: Comprehensive Benchmark, Evaluation and Analysis](https://aclanthology.org/2024.findings-emnlp.464) (Tang et al., Findings 2024)
ACL
Guo Tang, Zheng Chu, Wenxiang Zheng, Ming Liu, and Bing Qin. 2024. Towards Benchmarking Situational Awareness of Large Language Models: Comprehensive Benchmark, Evaluation and Analysis. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 7904–7928, Miami, Florida, USA. Association for Computational Linguistics.