@inproceedings{li-etal-2025-consistency,
title = "On the Consistency of Commonsense in Large Language Models",
author = "Li, Guozheng and
Wang, Peng and
Ke, Wenjun and
Xu, Zijie and
Liu, Jiajun and
Shang, Ziyu",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.834/",
doi = "10.18653/v1/2025.findings-acl.834",
pages = "16205--16225",
ISBN = "979-8-89176-256-5",
abstract = "Commonsense, humans' implicit understanding of everyday situations, is crucial for large language models (LLMs). Existing commonsense evaluations for LLMs primarily focus on downstream knowledge tasks, failing to probe whether LLMs truly understand and utilize knowledge or merely memorize it. They also rely heavily on human annotation and lack automated large-scale data generation. To address this, we propose to automatically construct a large benchmark named CoCo (Consistency of Commonsense) comprising 39K samples derived from commonsense knowledge graphs (CSKGs), paired with symbolic questions and ground-truth answers, which systematically assesses LLMs' knowledge memorization, comprehension, and application and examines the consistency between these tasks. To enhance our evaluation, we also propose novel metrics and prompting strategies. Experimental results on multiple LLMs reveal that CoCo presents significant challenges, and our detailed analysis provides deeper insights into the strengths and limitations of LLMs' commonsense abilities."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2025-consistency">
<titleInfo>
<title>On the Consistency of Commonsense in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guozheng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenjun</namePart>
<namePart type="family">Ke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zijie</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziyu</namePart>
<namePart type="family">Shang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Commonsense, humans’ implicit understanding of everyday situations, is crucial for large language models (LLMs). Existing commonsense evaluations for LLMs primarily focus on downstream knowledge tasks, failing to probe whether LLMs truly understand and utilize knowledge or merely memorize it. They also rely heavily on human annotation and lack automated large-scale data generation. To address this, we propose to automatically construct a large benchmark named CoCo (Consistency of Commonsense) comprising 39K samples derived from commonsense knowledge graphs (CSKGs), paired with symbolic questions and ground-truth answers, which systematically assesses LLMs’ knowledge memorization, comprehension, and application and examines the consistency between these tasks. To enhance our evaluation, we also propose novel metrics and prompting strategies. Experimental results on multiple LLMs reveal that CoCo presents significant challenges, and our detailed analysis provides deeper insights into the strengths and limitations of LLMs’ commonsense abilities.</abstract>
<identifier type="citekey">li-etal-2025-consistency</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.834</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.834/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>16205</start>
<end>16225</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Consistency of Commonsense in Large Language Models
%A Li, Guozheng
%A Wang, Peng
%A Ke, Wenjun
%A Xu, Zijie
%A Liu, Jiajun
%A Shang, Ziyu
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F li-etal-2025-consistency
%X Commonsense, humans’ implicit understanding of everyday situations, is crucial for large language models (LLMs). Existing commonsense evaluations for LLMs primarily focus on downstream knowledge tasks, failing to probe whether LLMs truly understand and utilize knowledge or merely memorize it. They also rely heavily on human annotation and lack automated large-scale data generation. To address this, we propose to automatically construct a large benchmark named CoCo (Consistency of Commonsense) comprising 39K samples derived from commonsense knowledge graphs (CSKGs), paired with symbolic questions and ground-truth answers, which systematically assesses LLMs’ knowledge memorization, comprehension, and application and examines the consistency between these tasks. To enhance our evaluation, we also propose novel metrics and prompting strategies. Experimental results on multiple LLMs reveal that CoCo presents significant challenges, and our detailed analysis provides deeper insights into the strengths and limitations of LLMs’ commonsense abilities.
%R 10.18653/v1/2025.findings-acl.834
%U https://aclanthology.org/2025.findings-acl.834/
%U https://doi.org/10.18653/v1/2025.findings-acl.834
%P 16205-16225
Markdown (Informal)
[On the Consistency of Commonsense in Large Language Models](https://aclanthology.org/2025.findings-acl.834/) (Li et al., Findings 2025)
ACL