@inproceedings{liang-leung-2021-improving,
title = "Improving Model Generalization: A {C}hinese Named Entity Recognition Case Study",
author = "Liang, Guanqing and
Leung, Cane Wing-Ki",
editor = "Zong, Chengqing and
Xia, Fei and
Li, Wenjie and
Navigli, Roberto",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-short.125",
doi = "10.18653/v1/2021.acl-short.125",
pages = "992--997",
abstract = "Generalization is an important ability that helps to ensure that a machine learning model can perform well on unseen data. In this paper, we study the effect of data bias on model generalization, using Chinese Named Entity Recognition (NER) as a case study. Specifically, we analyzed five benchmarking datasets for Chinese NER, and observed the following two types of data bias that can compromise model generalization ability. Firstly, the test sets of all the five datasets contain a significant proportion of entities that have been seen in the training sets. Such test data would therefore not be able to reflect the true generalization ability of a model. Secondly, all datasets are dominated by a few fat-head entities, i.e., entities appearing with particularly high frequency. As a result, a model might be able to produce high prediction accuracy simply by keyword memorization without leveraging context knowledge. To address these data biases, we first refine each test set by excluding seen entities from it, so as to better evaluate a model{'}s generalization ability. Then, we propose a simple yet effective entity resampling method to make entities within the same category distributed equally, encouraging a model to leverage both name and context knowledge in the training process. Experimental results demonstrate that the proposed entity resampling method significantly improves a model{'}s ability in detecting unseen entities, especially for company, organization and position categories.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liang-leung-2021-improving">
<titleInfo>
<title>Improving Model Generalization: A Chinese Named Entity Recognition Case Study</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guanqing</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cane</namePart>
<namePart type="given">Wing-Ki</namePart>
<namePart type="family">Leung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenjie</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Navigli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Generalization is an important ability that helps to ensure that a machine learning model can perform well on unseen data. In this paper, we study the effect of data bias on model generalization, using Chinese Named Entity Recognition (NER) as a case study. Specifically, we analyzed five benchmarking datasets for Chinese NER, and observed the following two types of data bias that can compromise model generalization ability. Firstly, the test sets of all the five datasets contain a significant proportion of entities that have been seen in the training sets. Such test data would therefore not be able to reflect the true generalization ability of a model. Secondly, all datasets are dominated by a few fat-head entities, i.e., entities appearing with particularly high frequency. As a result, a model might be able to produce high prediction accuracy simply by keyword memorization without leveraging context knowledge. To address these data biases, we first refine each test set by excluding seen entities from it, so as to better evaluate a model’s generalization ability. Then, we propose a simple yet effective entity resampling method to make entities within the same category distributed equally, encouraging a model to leverage both name and context knowledge in the training process. Experimental results demonstrate that the proposed entity resampling method significantly improves a model’s ability in detecting unseen entities, especially for company, organization and position categories.</abstract>
<identifier type="citekey">liang-leung-2021-improving</identifier>
<identifier type="doi">10.18653/v1/2021.acl-short.125</identifier>
<location>
<url>https://aclanthology.org/2021.acl-short.125</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>992</start>
<end>997</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Model Generalization: A Chinese Named Entity Recognition Case Study
%A Liang, Guanqing
%A Leung, Cane Wing-Ki
%Y Zong, Chengqing
%Y Xia, Fei
%Y Li, Wenjie
%Y Navigli, Roberto
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F liang-leung-2021-improving
%X Generalization is an important ability that helps to ensure that a machine learning model can perform well on unseen data. In this paper, we study the effect of data bias on model generalization, using Chinese Named Entity Recognition (NER) as a case study. Specifically, we analyzed five benchmarking datasets for Chinese NER, and observed the following two types of data bias that can compromise model generalization ability. Firstly, the test sets of all the five datasets contain a significant proportion of entities that have been seen in the training sets. Such test data would therefore not be able to reflect the true generalization ability of a model. Secondly, all datasets are dominated by a few fat-head entities, i.e., entities appearing with particularly high frequency. As a result, a model might be able to produce high prediction accuracy simply by keyword memorization without leveraging context knowledge. To address these data biases, we first refine each test set by excluding seen entities from it, so as to better evaluate a model’s generalization ability. Then, we propose a simple yet effective entity resampling method to make entities within the same category distributed equally, encouraging a model to leverage both name and context knowledge in the training process. Experimental results demonstrate that the proposed entity resampling method significantly improves a model’s ability in detecting unseen entities, especially for company, organization and position categories.
%R 10.18653/v1/2021.acl-short.125
%U https://aclanthology.org/2021.acl-short.125
%U https://doi.org/10.18653/v1/2021.acl-short.125
%P 992-997
Markdown (Informal)
[Improving Model Generalization: A Chinese Named Entity Recognition Case Study](https://aclanthology.org/2021.acl-short.125) (Liang & Leung, ACL-IJCNLP 2021)
ACL