@inproceedings{zhang-he-2024-large,
title = "Large Language Models Can Not Perform Well in Understanding and Manipulating Natural Language at Both Character and Word Levels?",
author = "Zhang, Yidan and
He, Zhenan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.691",
pages = "11826--11842",
abstract = "Despite their promising performance across various tasks, recent studies reveal that Large language models (LLMs) still exhibit significant deficiencies in handling several word-level and character-level tasks, e.g., word unscrambling and sentence editing, indicating urgent needs for substantial improvements in basic language understanding and manipulation. To address these challenges, it is crucial to develop large-scale benchmarks that can comprehensively assess the performance of LLMs in basic language tasks. In this paper, we introduce a bilingual benchmark, CWUM, to investigate the capabilities and limitations of LLMs in understanding and manipulating natural language at both character and word levels. CWUM consists of 15 simple text editing tasks, e.g., letter counting, word reversing, Chinese character inserting, etc. We conduct extensive experiments on eight advanced LLMs, including base models and instruction-tuned (chat) variants. The experimental results highlight significant failures of existing LLMs on CWUM tasks that humans can solve perfectly with 100{\%} accuracy. On English tasks of CWUM, the average accuracy of GPT-4, LLaMA-3-70B, and Qwen-72B is 66.64{\%}, 39.32{\%}, and 33.16{\%}, respectively, which lags far behind human performance. Instruction-tuning the base model does not lead to a distinct performance improvement, as the average accuracy of LLaMA-3-70B-Instruct on English tasks is only 1.44{\%} higher than that of the base LLaMA-3-70B. Ultimately, we show that supervised fine-tuning (SFT) can enhance model performance on CWUM without compromising its ability to generalize across general tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-he-2024-large">
<titleInfo>
<title>Large Language Models Can Not Perform Well in Understanding and Manipulating Natural Language at Both Character and Word Levels?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yidan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite their promising performance across various tasks, recent studies reveal that Large language models (LLMs) still exhibit significant deficiencies in handling several word-level and character-level tasks, e.g., word unscrambling and sentence editing, indicating urgent needs for substantial improvements in basic language understanding and manipulation. To address these challenges, it is crucial to develop large-scale benchmarks that can comprehensively assess the performance of LLMs in basic language tasks. In this paper, we introduce a bilingual benchmark, CWUM, to investigate the capabilities and limitations of LLMs in understanding and manipulating natural language at both character and word levels. CWUM consists of 15 simple text editing tasks, e.g., letter counting, word reversing, Chinese character inserting, etc. We conduct extensive experiments on eight advanced LLMs, including base models and instruction-tuned (chat) variants. The experimental results highlight significant failures of existing LLMs on CWUM tasks that humans can solve perfectly with 100% accuracy. On English tasks of CWUM, the average accuracy of GPT-4, LLaMA-3-70B, and Qwen-72B is 66.64%, 39.32%, and 33.16%, respectively, which lags far behind human performance. Instruction-tuning the base model does not lead to a distinct performance improvement, as the average accuracy of LLaMA-3-70B-Instruct on English tasks is only 1.44% higher than that of the base LLaMA-3-70B. Ultimately, we show that supervised fine-tuning (SFT) can enhance model performance on CWUM without compromising its ability to generalize across general tasks.</abstract>
<identifier type="citekey">zhang-he-2024-large</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.691</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>11826</start>
<end>11842</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Large Language Models Can Not Perform Well in Understanding and Manipulating Natural Language at Both Character and Word Levels?
%A Zhang, Yidan
%A He, Zhenan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F zhang-he-2024-large
%X Despite their promising performance across various tasks, recent studies reveal that Large language models (LLMs) still exhibit significant deficiencies in handling several word-level and character-level tasks, e.g., word unscrambling and sentence editing, indicating urgent needs for substantial improvements in basic language understanding and manipulation. To address these challenges, it is crucial to develop large-scale benchmarks that can comprehensively assess the performance of LLMs in basic language tasks. In this paper, we introduce a bilingual benchmark, CWUM, to investigate the capabilities and limitations of LLMs in understanding and manipulating natural language at both character and word levels. CWUM consists of 15 simple text editing tasks, e.g., letter counting, word reversing, Chinese character inserting, etc. We conduct extensive experiments on eight advanced LLMs, including base models and instruction-tuned (chat) variants. The experimental results highlight significant failures of existing LLMs on CWUM tasks that humans can solve perfectly with 100% accuracy. On English tasks of CWUM, the average accuracy of GPT-4, LLaMA-3-70B, and Qwen-72B is 66.64%, 39.32%, and 33.16%, respectively, which lags far behind human performance. Instruction-tuning the base model does not lead to a distinct performance improvement, as the average accuracy of LLaMA-3-70B-Instruct on English tasks is only 1.44% higher than that of the base LLaMA-3-70B. Ultimately, we show that supervised fine-tuning (SFT) can enhance model performance on CWUM without compromising its ability to generalize across general tasks.
%U https://aclanthology.org/2024.findings-emnlp.691
%P 11826-11842
Markdown (Informal)
[Large Language Models Can Not Perform Well in Understanding and Manipulating Natural Language at Both Character and Word Levels?](https://aclanthology.org/2024.findings-emnlp.691) (Zhang & He, Findings 2024)
ACL