% Conference paper record. Names are stored in "von Last, First" form so that
% the lowercase particle "van" is parsed as part of the third author's surname
% rather than as a trailing given name. Case-sensitive title words are
% protected with whole-word braces.
@inproceedings{han-etal-2026-crossing,
  title     = {{CrosSing}: Cross-Scale Reasoning Evaluation on {LLMs} against Humans},
  author    = {Han, Qi and
               Wu, Yifan and
               van Schijndel, Marten},
  editor    = {Voigt, Rob and
               Warstadt, Alex and
               Feldman, Naomi and
               Linzen, Tal},
  booktitle = {Proceedings of the Society for Computation in Linguistics 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.scil-main.36/},
  pages     = {379--407},
  isbn      = {979-8-89176-412-5},
  abstract  = {While many studies have shown LLMs perform well in various reasoning tasks, few have examined their capacity on semantic reasoning tasks. As LLMs reason with language, it is crucial to understand how well they grasp and use the underlying scalar relationships in language. In this study, we introduced a new dataset CrosSing (Cross-Scale reasoning), providing a human baseline against which to evaluate LLMs' ability to reason across lexical scales in gradable adjectives. We further probed how their understanding is influenced by overinformative contexts. We evaluated ten high-performing LLMs and found that some outperformed humans when no extra information was provided, but that LLM performance declined in certain overinformative contexts while human performance improved significantly. This contrast reveals a fundamental difference between recent LLMs and humans in understanding adjectives' scalar relationships and how such understanding behaves in overinformative contexts.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="han-etal-2026-crossing">
<titleInfo>
<title>CrosSing: Cross-Scale Reasoning Evaluation on LLMs against Humans</title>
</titleInfo>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marten</namePart>
<namePart type="family">van Schijndel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Society for Computation in Linguistics 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">Voigt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Warstadt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naomi</namePart>
<namePart type="family">Feldman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tal</namePart>
<namePart type="family">Linzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, CA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-412-5</identifier>
</relatedItem>
<abstract>While many studies have shown LLMs perform well in various reasoning tasks, few have examined their capacity on semantic reasoning tasks. As LLMs reason with language, it is crucial to understand how well they grasp and use the underlying scalar relationships in language. In this study, we introduced a new dataset CrosSing (Cross-Scale reasoning), providing a human baseline against which to evaluate LLMs’ ability to reason across lexical scales in gradable adjectives. We further probed how their understanding is influenced by overinformative contexts. We evaluated ten high-performing LLMs and found that some outperformed humans when no extra information was provided, but that LLM performance declined in certain overinformative contexts while human performance improved significantly. This contrast reveals a fundamental difference between recent LLMs and humans in understanding adjectives’ scalar relationships and how such understanding behaves in overinformative contexts.</abstract>
<identifier type="citekey">han-etal-2026-crossing</identifier>
<location>
<url>https://aclanthology.org/2026.scil-main.36/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>379</start>
<end>407</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CrosSing: Cross-Scale Reasoning Evaluation on LLMs against Humans
%A Han, Qi
%A Wu, Yifan
%A van Schijndel, Marten
%Y Voigt, Rob
%Y Warstadt, Alex
%Y Feldman, Naomi
%Y Linzen, Tal
%S Proceedings of the Society for Computation in Linguistics 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, CA
%@ 979-8-89176-412-5
%F han-etal-2026-crossing
%X While many studies have shown LLMs perform well in various reasoning tasks, few have examined their capacity on semantic reasoning tasks. As LLMs reason with language, it is crucial to understand how well they grasp and use the underlying scalar relationships in language. In this study, we introduced a new dataset CrosSing (Cross-Scale reasoning), providing a human baseline against which to evaluate LLMs’ ability to reason across lexical scales in gradable adjectives. We further probed how their understanding is influenced by overinformative contexts. We evaluated ten high-performing LLMs and found that some outperformed humans when no extra information was provided, but that LLM performance declined in certain overinformative contexts while human performance improved significantly. This contrast reveals a fundamental difference between recent LLMs and humans in understanding adjectives’ scalar relationships and how such understanding behaves in overinformative contexts.
%U https://aclanthology.org/2026.scil-main.36/
%P 379-407
Markdown (Informal)
[CrosSing: Cross-Scale Reasoning Evaluation on LLMs against Humans](https://aclanthology.org/2026.scil-main.36/) (Han et al., SCiL 2026)
ACL