@inproceedings{huang-etal-2024-lateval,
title = "{L}at{E}val: An Interactive {LLM}s Evaluation Benchmark with Incomplete Information from Lateral Thinking Puzzles",
author = "Huang, Shulin and
Ma, Shirong and
Li, Yinghui and
Huang, Mengzuo and
Zou, Wuhe and
Zhang, Weidong and
Zheng, Haitao",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.889",
pages = "10186--10197",
abstract = "With the evolution of LLMs, they are endowed with impressive logical reasoning, or vertical thinking capabilities. But can they think out of the box? Do they possess proficient lateral thinking abilities? Following the setup of Lateral Thinking Puzzles, we propose a novel evaluation benchmark, LatEval, which assesses the model{'}s lateral thinking within an interactive framework. In our benchmark, we challenge LLMs with 2 aspects: (1) posing high-quality questions that break out of conventional norms but are beneficial for puzzle-solving. (2) integrating existing information to gradually deduce the truth through reasoning. We observe that it is hard for most LLMs to accomplish lateral thinking during interactions. Even the most powerful LLM, GPT-4, faces challenges in achieving satisfactory performance, and for most open-source models, simply completing this task is quite difficult. This evaluation benchmark provides LLMs with a highly challenging and differentiating task that is crucial to an effective AI assistant. Our dataset and source codes are available at https://github.com/THUKElab/LatEval.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2024-lateval">
    <titleInfo>
        <title>LatEval: An Interactive LLMs Evaluation Benchmark with Incomplete Information from Lateral Thinking Puzzles</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Shulin</namePart>
        <namePart type="family">Huang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Shirong</namePart>
        <namePart type="family">Ma</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Yinghui</namePart>
        <namePart type="family">Li</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Mengzuo</namePart>
        <namePart type="family">Huang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Wuhe</namePart>
        <namePart type="family">Zou</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Weidong</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Haitao</namePart>
        <namePart type="family">Zheng</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Nicoletta</namePart>
            <namePart type="family">Calzolari</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Min-Yen</namePart>
            <namePart type="family">Kan</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Veronique</namePart>
            <namePart type="family">Hoste</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Alessandro</namePart>
            <namePart type="family">Lenci</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Sakriani</namePart>
            <namePart type="family">Sakti</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Nianwen</namePart>
            <namePart type="family">Xue</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>ELRA and ICCL</publisher>
            <place>
                <placeTerm type="text">Torino, Italia</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>With the evolution of LLMs, they are endowed with impressive logical reasoning, or vertical thinking capabilities. But can they think out of the box? Do they possess proficient lateral thinking abilities? Following the setup of Lateral Thinking Puzzles, we propose a novel evaluation benchmark, LatEval, which assesses the model’s lateral thinking within an interactive framework. In our benchmark, we challenge LLMs with 2 aspects: (1) posing high-quality questions that break out of conventional norms but are beneficial for puzzle-solving. (2) integrating existing information to gradually deduce the truth through reasoning. We observe that it is hard for most LLMs to accomplish lateral thinking during interactions. Even the most powerful LLM, GPT-4, faces challenges in achieving satisfactory performance, and for most open-source models, simply completing this task is quite difficult. This evaluation benchmark provides LLMs with a highly challenging and differentiating task that is crucial to an effective AI assistant. Our dataset and source codes are available at https://github.com/THUKElab/LatEval.</abstract>
    <identifier type="citekey">huang-etal-2024-lateval</identifier>
    <location>
        <url>https://aclanthology.org/2024.lrec-main.889</url>
    </location>
    <part>
        <date>2024-05</date>
        <extent unit="page">
            <start>10186</start>
            <end>10197</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LatEval: An Interactive LLMs Evaluation Benchmark with Incomplete Information from Lateral Thinking Puzzles
%A Huang, Shulin
%A Ma, Shirong
%A Li, Yinghui
%A Huang, Mengzuo
%A Zou, Wuhe
%A Zhang, Weidong
%A Zheng, Haitao
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F huang-etal-2024-lateval
%X With the evolution of LLMs, they are endowed with impressive logical reasoning, or vertical thinking capabilities. But can they think out of the box? Do they possess proficient lateral thinking abilities? Following the setup of Lateral Thinking Puzzles, we propose a novel evaluation benchmark, LatEval, which assesses the model’s lateral thinking within an interactive framework. In our benchmark, we challenge LLMs with 2 aspects: (1) posing high-quality questions that break out of conventional norms but are beneficial for puzzle-solving. (2) integrating existing information to gradually deduce the truth through reasoning. We observe that it is hard for most LLMs to accomplish lateral thinking during interactions. Even the most powerful LLM, GPT-4, faces challenges in achieving satisfactory performance, and for most open-source models, simply completing this task is quite difficult. This evaluation benchmark provides LLMs with a highly challenging and differentiating task that is crucial to an effective AI assistant. Our dataset and source codes are available at https://github.com/THUKElab/LatEval.
%U https://aclanthology.org/2024.lrec-main.889
%P 10186-10197
Markdown (Informal)
[LatEval: An Interactive LLMs Evaluation Benchmark with Incomplete Information from Lateral Thinking Puzzles](https://aclanthology.org/2024.lrec-main.889) (Huang et al., LREC-COLING 2024)
ACL
Shulin Huang, Shirong Ma, Yinghui Li, Mengzuo Huang, Wuhe Zou, Weidong Zhang, and Haitao Zheng. 2024. LatEval: An Interactive LLMs Evaluation Benchmark with Incomplete Information from Lateral Thinking Puzzles. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 10186–10197, Torino, Italia. ELRA and ICCL.
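
The abstract describes an interactive setup in which the evaluated LLM first poses questions about an incomplete story and then integrates the answers to deduce the hidden truth. The sketch below is a minimal, hypothetical illustration of such an evaluation loop, not the authors' implementation: the `Puzzle` fields, the `solver_ask`/`host_answer`/`solver_guess` callables, and the yes/no/irrelevant host protocol are assumptions for illustration only; the actual benchmark code is at https://github.com/THUKElab/LatEval.

```python
from dataclasses import dataclass
from typing import Callable, List

# A lateral thinking puzzle pairs a public "surface" story with a hidden truth.
@dataclass
class Puzzle:
    surface: str  # the incomplete story shown to the solver
    truth: str    # the hidden explanation the solver must deduce

def run_interactive_round(
    puzzle: Puzzle,
    solver_ask: Callable[[str, List[str]], str],    # LLM under test: proposes the next question
    host_answer: Callable[[Puzzle, str], str],      # host/judge: replies "yes", "no", or "irrelevant"
    solver_guess: Callable[[str, List[str]], str],  # LLM under test: states its deduced truth
    max_turns: int = 10,
) -> dict:
    """Simulate one puzzle: the solver asks questions, then guesses the hidden truth."""
    transcript: List[str] = []
    for _ in range(max_turns):
        question = solver_ask(puzzle.surface, transcript)
        reply = host_answer(puzzle, question)
        transcript.append(f"Q: {question} / A: {reply}")
    final_guess = solver_guess(puzzle.surface, transcript)
    return {"transcript": transcript, "guess": final_guess, "truth": puzzle.truth}

# Usage with trivial stand-in callables; a real run would call LLM APIs instead.
if __name__ == "__main__":
    puzzle = Puzzle(
        surface="A man orders albatross soup at a restaurant, tastes it, and leaves in tears.",
        truth="He realizes what he ate years ago at sea was not albatross after all.",
    )
    result = run_interactive_round(
        puzzle,
        solver_ask=lambda surface, history: "Is the man's reaction related to a past event?",
        host_answer=lambda p, q: "yes",
        solver_guess=lambda surface, history: "The soup reveals something about his past.",
        max_turns=2,
    )
    print(result["guess"])
```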