BibTeX

@inproceedings{savkov-etal-2022-consultation,
title = "Consultation Checklists: Standardising the Human Evaluation of Medical Note Generation",
author = "Savkov, Aleksandar and
Moramarco, Francesco and
Papadopoulos Korfiatis, Alex and
Perera, Mark and
Belz, Anya and
Reiter, Ehud",
editor = "Li, Yunyao and
Lazaridou, Angeliki",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = dec,
year = "2022",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-industry.10",
doi = "10.18653/v1/2022.emnlp-industry.10",
pages = "111--120",
abstract = "Evaluating automatically generated text is generally hard due to the inherently subjective nature of many aspects of the output quality. This difficulty is compounded in automatic consultation note generation by differing opinions between medical experts both about which patient statements should be included in generated notes and about their respective importance in arriving at a diagnosis. Previous real-world evaluations of note-generation systems saw substantial disagreement between expert evaluators. In this paper we propose a protocol that aims to increase objectivity by grounding evaluations in Consultation Checklists, which are created in a preliminary step and then used as a common point of reference during quality assessment. We observed good levels of inter-annotator agreement in a first evaluation study using the protocol; further, using Consultation Checklists produced in the study as reference for automatic metrics such as ROUGE or BERTScore improves their correlation with human judgements compared to using the original human note.",
}
MODS XML

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="savkov-etal-2022-consultation">
    <titleInfo>
      <title>Consultation Checklists: Standardising the Human Evaluation of Medical Note Generation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Aleksandar</namePart>
      <namePart type="family">Savkov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Francesco</namePart>
      <namePart type="family">Moramarco</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alex</namePart>
      <namePart type="family">Papadopoulos Korfiatis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mark</namePart>
      <namePart type="family">Perera</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anya</namePart>
      <namePart type="family">Belz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ehud</namePart>
      <namePart type="family">Reiter</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Angeliki</namePart>
        <namePart type="family">Lazaridou</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Evaluating automatically generated text is generally hard due to the inherently subjective nature of many aspects of the output quality. This difficulty is compounded in automatic consultation note generation by differing opinions between medical experts both about which patient statements should be included in generated notes and about their respective importance in arriving at a diagnosis. Previous real-world evaluations of note-generation systems saw substantial disagreement between expert evaluators. In this paper we propose a protocol that aims to increase objectivity by grounding evaluations in Consultation Checklists, which are created in a preliminary step and then used as a common point of reference during quality assessment. We observed good levels of inter-annotator agreement in a first evaluation study using the protocol; further, using Consultation Checklists produced in the study as reference for automatic metrics such as ROUGE or BERTScore improves their correlation with human judgements compared to using the original human note.</abstract>
    <identifier type="citekey">savkov-etal-2022-consultation</identifier>
    <identifier type="doi">10.18653/v1/2022.emnlp-industry.10</identifier>
    <location>
      <url>https://aclanthology.org/2022.emnlp-industry.10</url>
    </location>
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>111</start>
        <end>120</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote

%0 Conference Proceedings
%T Consultation Checklists: Standardising the Human Evaluation of Medical Note Generation
%A Savkov, Aleksandar
%A Moramarco, Francesco
%A Papadopoulos Korfiatis, Alex
%A Perera, Mark
%A Belz, Anya
%A Reiter, Ehud
%Y Li, Yunyao
%Y Lazaridou, Angeliki
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F savkov-etal-2022-consultation
%X Evaluating automatically generated text is generally hard due to the inherently subjective nature of many aspects of the output quality. This difficulty is compounded in automatic consultation note generation by differing opinions between medical experts both about which patient statements should be included in generated notes and about their respective importance in arriving at a diagnosis. Previous real-world evaluations of note-generation systems saw substantial disagreement between expert evaluators. In this paper we propose a protocol that aims to increase objectivity by grounding evaluations in Consultation Checklists, which are created in a preliminary step and then used as a common point of reference during quality assessment. We observed good levels of inter-annotator agreement in a first evaluation study using the protocol; further, using Consultation Checklists produced in the study as reference for automatic metrics such as ROUGE or BERTScore improves their correlation with human judgements compared to using the original human note.
%R 10.18653/v1/2022.emnlp-industry.10
%U https://aclanthology.org/2022.emnlp-industry.10
%U https://doi.org/10.18653/v1/2022.emnlp-industry.10
%P 111-120
Markdown (Informal)
[Consultation Checklists: Standardising the Human Evaluation of Medical Note Generation](https://aclanthology.org/2022.emnlp-industry.10) (Savkov et al., EMNLP 2022)
ACL

Aleksandar Savkov, Francesco Moramarco, Alex Papadopoulos Korfiatis, Mark Perera, Anya Belz, and Ehud Reiter. 2022. Consultation Checklists: Standardising the Human Evaluation of Medical Note Generation. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 111–120, Abu Dhabi, UAE. Association for Computational Linguistics.
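
Note on the abstract's last claim: the reference-swapping idea, scoring a generated note against the Consultation Checklist items instead of the original human note, can be sketched in a few lines. The following is a minimal, hypothetical Python illustration using the open-source rouge-score package; the paper does not specify its tooling, and all notes and checklist items below are invented placeholders, not data from the study.

# Sketch: compare ROUGE scores for a generated note against two kinds of
# reference, (a) the original human note and (b) a checklist rendered as text.
# Requires: pip install rouge-score
from rouge_score import rouge_scorer

generated_note = "Patient reports a dry cough for three days. No fever."
human_note = "3-day history of dry cough. Apyrexial. Advised rest and fluids."
# A Consultation Checklist is a set of statements produced in a preliminary
# step; here it is concatenated into a single reference string.
checklist_items = [
    "Dry cough for three days.",
    "No fever reported.",
]
checklist_reference = " ".join(checklist_items)

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
for name, reference in [("human note", human_note),
                        ("checklist", checklist_reference)]:
    # RougeScorer.score takes (target, prediction) and returns a dict of
    # Score tuples with precision, recall, and F-measure per ROUGE variant.
    scores = scorer.score(reference, generated_note)
    print(name, {k: round(v.fmeasure, 3) for k, v in scores.items()})

The same swap applies to BERTScore: pass the concatenated checklist items as the reference text in place of the human note and compare the resulting correlations with human judgements.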