@inproceedings{shim-etal-2022-exploratory,
title = "An exploratory data analysis: the performance differences of a medical code prediction system on different demographic groups",
author = "Shim, Heereen and
Lowet, Dietwig and
Luca, Stijn and
Vanrumste, Bart",
editor = "Naumann, Tristan and
Bethard, Steven and
Roberts, Kirk and
Rumshisky, Anna",
booktitle = "Proceedings of the 4th Clinical Natural Language Processing Workshop",
month = jul,
year = "2022",
address = "Seattle, WA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.clinicalnlp-1.10",
doi = "10.18653/v1/2022.clinicalnlp-1.10",
pages = "93--102",
abstract = "Recent studies show that neural natural language processing models for medical code prediction suffer from a label imbalance issue. This study aims to investigate further imbalance in a medical code prediction dataset in terms of demographic variables and analyse performance differences in demographic groups. We use sample-based metrics to correctly evaluate the performance in terms of the data subject. Also, a simple label distance metric is proposed to quantify the difference in the label distribution between a group and the entire data. Our analysis results reveal that the model performs differently towards different demographic groups: significant differences between age groups and between insurance types are observed. Interestingly, we found a weak positive correlation between the number of training data of the group and the performance of the group. However, a strong negative correlation between the label distance of the group and the performance of the group is observed. This result suggests that the model tends to perform poorly in the group whose label distribution is different from the global label distribution of the training data set. Further analysis of the model performance is required to identify the cause of these differences and to improve the model building.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shim-etal-2022-exploratory">
<titleInfo>
<title>An exploratory data analysis: the performance differences of a medical code prediction system on different demographic groups</title>
</titleInfo>
<name type="personal">
<namePart type="given">Heereen</namePart>
<namePart type="family">Shim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dietwig</namePart>
<namePart type="family">Lowet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stijn</namePart>
<namePart type="family">Luca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bart</namePart>
<namePart type="family">Vanrumste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Clinical Natural Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tristan</namePart>
<namePart type="family">Naumann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rumshisky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, WA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent studies show that neural natural language processing models for medical code prediction suffer from a label imbalance issue. This study aims to investigate further imbalance in a medical code prediction dataset in terms of demographic variables and analyse performance differences in demographic groups. We use sample-based metrics to correctly evaluate the performance in terms of the data subject. Also, a simple label distance metric is proposed to quantify the difference in the label distribution between a group and the entire data. Our analysis results reveal that the model performs differently towards different demographic groups: significant differences between age groups and between insurance types are observed. Interestingly, we found a weak positive correlation between the number of training data of the group and the performance of the group. However, a strong negative correlation between the label distance of the group and the performance of the group is observed. This result suggests that the model tends to perform poorly in the group whose label distribution is different from the global label distribution of the training data set. Further analysis of the model performance is required to identify the cause of these differences and to improve the model building.</abstract>
<identifier type="citekey">shim-etal-2022-exploratory</identifier>
<identifier type="doi">10.18653/v1/2022.clinicalnlp-1.10</identifier>
<location>
<url>https://aclanthology.org/2022.clinicalnlp-1.10</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>93</start>
<end>102</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An exploratory data analysis: the performance differences of a medical code prediction system on different demographic groups
%A Shim, Heereen
%A Lowet, Dietwig
%A Luca, Stijn
%A Vanrumste, Bart
%Y Naumann, Tristan
%Y Bethard, Steven
%Y Roberts, Kirk
%Y Rumshisky, Anna
%S Proceedings of the 4th Clinical Natural Language Processing Workshop
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, WA
%F shim-etal-2022-exploratory
%X Recent studies show that neural natural language processing models for medical code prediction suffer from a label imbalance issue. This study aims to investigate further imbalance in a medical code prediction dataset in terms of demographic variables and analyse performance differences in demographic groups. We use sample-based metrics to correctly evaluate the performance in terms of the data subject. Also, a simple label distance metric is proposed to quantify the difference in the label distribution between a group and the entire data. Our analysis results reveal that the model performs differently towards different demographic groups: significant differences between age groups and between insurance types are observed. Interestingly, we found a weak positive correlation between the number of training data of the group and the performance of the group. However, a strong negative correlation between the label distance of the group and the performance of the group is observed. This result suggests that the model tends to perform poorly in the group whose label distribution is different from the global label distribution of the training data set. Further analysis of the model performance is required to identify the cause of these differences and to improve the model building.
%R 10.18653/v1/2022.clinicalnlp-1.10
%U https://aclanthology.org/2022.clinicalnlp-1.10
%U https://doi.org/10.18653/v1/2022.clinicalnlp-1.10
%P 93-102
Markdown (Informal)
[An exploratory data analysis: the performance differences of a medical code prediction system on different demographic groups](https://aclanthology.org/2022.clinicalnlp-1.10) (Shim et al., ClinicalNLP 2022)
ACL