@inproceedings{li-etal-2024-towards-real,
title = "Towards Real-World Writing Assistance: A {C}hinese Character Checking Benchmark with Faked and Misspelled Characters",
author = "Li, Yinghui and
Xu, Zishan and
Chen, Shaoshen and
Huang, Haojing and
Li, Yangning and
Ma, Shirong and
Jiang, Yong and
Li, Zhongli and
Zhou, Qingyu and
Zheng, Hai-Tao and
Shen, Ying",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.acl-long.469",
doi = "10.18653/v1/2024.acl-long.469",
pages = "8656--8668",
abstract = "Writing assistance aims to improve the correctness and quality of input texts, with character checking being crucial in detecting and correcting wrong characters. In the real world where handwriting occupies the vast majority, characters that humans get wrong include faked characters (i.e., untrue characters created due to writing errors) and misspelled characters (i.e., true characters used incorrectly due to spelling errors). However, existing datasets and related studies only focus on misspelled characters that can be represented by computer text encoding systems, thereby ignoring faked characters which are more common and difficult. To break through this dilemma, we present $\textbf{Visual-C}$$^3$, a human-annotated $\textbf{Visual}$ $\textbf{C}$hinese $\textbf{C}$haracter $\textbf{C}$hecking dataset with faked and misspelled Chinese characters. To the best of our knowledge, Visual-C$^3$ is the first real-world visual and the largest human-crafted dataset for the Chinese character checking scenario. Additionally, we also propose and evaluate novel baseline methods on Visual-C$^3$. Extensive empirical results and analyses show that Visual-C$^3$ is high-quality yet challenging. As the first study focusing on Chinese faked characters, the dataset and the baseline methods are publicly available at https://github.com/THUKElab/Visual-C3.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2024-towards-real">
<titleInfo>
<title>Towards Real-World Writing Assistance: A Chinese Character Checking Benchmark with Faked and Misspelled Characters</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yinghui</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zishan</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shaoshen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haojing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangning</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shirong</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yong</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongli</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qingyu</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hai-Tao</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ying</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Writing assistance aims to improve the correctness and quality of input texts, with character checking being crucial in detecting and correcting wrong characters. In the real world where handwriting occupies the vast majority, characters that humans get wrong include faked characters (i.e., untrue characters created due to writing errors) and misspelled characters (i.e., true characters used incorrectly due to spelling errors). However, existing datasets and related studies only focus on misspelled characters that can be represented by computer text encoding systems, thereby ignoring faked characters which are more common and difficult. To break through this dilemma, we present Visual-C³, a human-annotated Visual Chinese Character Checking dataset with faked and misspelled Chinese characters. To the best of our knowledge, Visual-C³ is the first real-world visual and the largest human-crafted dataset for the Chinese character checking scenario. Additionally, we also propose and evaluate novel baseline methods on Visual-C³. Extensive empirical results and analyses show that Visual-C³ is high-quality yet challenging. As the first study focusing on Chinese faked characters, the dataset and the baseline methods are publicly available at https://github.com/THUKElab/Visual-C3.</abstract>
<identifier type="citekey">li-etal-2024-towards-real</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.469</identifier>
<location>
<url>https://aclanthology.org/2024.acl-long.469</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>8656</start>
<end>8668</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Real-World Writing Assistance: A Chinese Character Checking Benchmark with Faked and Misspelled Characters
%A Li, Yinghui
%A Xu, Zishan
%A Chen, Shaoshen
%A Huang, Haojing
%A Li, Yangning
%A Ma, Shirong
%A Jiang, Yong
%A Li, Zhongli
%A Zhou, Qingyu
%A Zheng, Hai-Tao
%A Shen, Ying
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F li-etal-2024-towards-real
%X Writing assistance aims to improve the correctness and quality of input texts, with character checking being crucial in detecting and correcting wrong characters. In the real world where handwriting occupies the vast majority, characters that humans get wrong include faked characters (i.e., untrue characters created due to writing errors) and misspelled characters (i.e., true characters used incorrectly due to spelling errors). However, existing datasets and related studies only focus on misspelled characters that can be represented by computer text encoding systems, thereby ignoring faked characters which are more common and difficult. To break through this dilemma, we present Visual-C³, a human-annotated Visual Chinese Character Checking dataset with faked and misspelled Chinese characters. To the best of our knowledge, Visual-C³ is the first real-world visual and the largest human-crafted dataset for the Chinese character checking scenario. Additionally, we also propose and evaluate novel baseline methods on Visual-C³. Extensive empirical results and analyses show that Visual-C³ is high-quality yet challenging. As the first study focusing on Chinese faked characters, the dataset and the baseline methods are publicly available at https://github.com/THUKElab/Visual-C3.
%R 10.18653/v1/2024.acl-long.469
%U https://aclanthology.org/2024.acl-long.469
%U https://doi.org/10.18653/v1/2024.acl-long.469
%P 8656-8668
Markdown (Informal)
[Towards Real-World Writing Assistance: A Chinese Character Checking Benchmark with Faked and Misspelled Characters](https://aclanthology.org/2024.acl-long.469) (Li et al., ACL 2024)
ACL
- Yinghui Li, Zishan Xu, Shaoshen Chen, Haojing Huang, Yangning Li, Shirong Ma, Yong Jiang, Zhongli Li, Qingyu Zhou, Hai-Tao Zheng, and Ying Shen. 2024. Towards Real-World Writing Assistance: A Chinese Character Checking Benchmark with Faked and Misspelled Characters. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 8656–8668, Bangkok, Thailand. Association for Computational Linguistics.