@inproceedings{belz-etal-2024-qcet-interactive,
    title = "{QCET}: An Interactive Taxonomy of Quality Criteria for Comparable and Repeatable Evaluation of {NLP} Systems",
    author = "Belz, Anya  and
      Mille, Simon  and
      Thomson, Craig  and
      Huidrom, Rudali",
    editor = "Mahamood, Saad  and
      Minh, Nguyen Le  and
      Ippolito, Daphne",
    booktitle = "Proceedings of the 17th International Natural Language Generation Conference: System Demonstrations",
    month = sep,
    year = "2024",
    address = "Tokyo, Japan",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.inlg-demos.4",
    pages = "9--12",
    abstract = "Four years on from two papers (Belz et al., 2020; Howcroft et al., 2020) that first called out the lack of standardisation and comparability in the quality criteria assessed in NLP system evaluations, researchers still use widely differing quality criteria names and definitions, meaning that it continues to be unclear when the same aspect of quality is being assessed in two evaluations. While normalised quality criteria were proposed at the time, the list was unwieldy and using it came with a steep learning curve. In this demo paper, our aim is to address these issues with an interactive taxonomy tool that enables quick perusal and selection of the quality criteria, and provides decision support and examples of use at each node.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="belz-etal-2024-qcet-interactive">
    <titleInfo>
        <title>QCET: An Interactive Taxonomy of Quality Criteria for Comparable and Repeatable Evaluation of NLP Systems</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Anya</namePart>
        <namePart type="family">Belz</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Simon</namePart>
        <namePart type="family">Mille</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Craig</namePart>
        <namePart type="family">Thomson</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Rudali</namePart>
        <namePart type="family">Huidrom</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2024-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 17th International Natural Language Generation Conference: System Demonstrations</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Saad</namePart>
            <namePart type="family">Mahamood</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Nguyen</namePart>
            <namePart type="given">Le</namePart>
            <namePart type="family">Minh</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Daphne</namePart>
            <namePart type="family">Ippolito</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Tokyo, Japan</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Four years on from two papers (Belz et al., 2020; Howcroft et al., 2020) that first called out the lack of standardisation and comparability in the quality criteria assessed in NLP system evaluations, researchers still use widely differing quality criteria names and definitions, meaning that it continues to be unclear when the same aspect of quality is being assessed in two evaluations. While normalised quality criteria were proposed at the time, the list was unwieldy and using it came with a steep learning curve. In this demo paper, our aim is to address these issues with an interactive taxonomy tool that enables quick perusal and selection of the quality criteria, and provides decision support and examples of use at each node.</abstract>
    <identifier type="citekey">belz-etal-2024-qcet-interactive</identifier>
    <location>
        <url>https://aclanthology.org/2024.inlg-demos.4</url>
    </location>
    <part>
        <date>2024-09</date>
        <extent unit="page">
            <start>9</start>
            <end>12</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T QCET: An Interactive Taxonomy of Quality Criteria for Comparable and Repeatable Evaluation of NLP Systems
%A Belz, Anya
%A Mille, Simon
%A Thomson, Craig
%A Huidrom, Rudali
%Y Mahamood, Saad
%Y Minh, Nguyen Le
%Y Ippolito, Daphne
%S Proceedings of the 17th International Natural Language Generation Conference: System Demonstrations
%D 2024
%8 September
%I Association for Computational Linguistics
%C Tokyo, Japan
%F belz-etal-2024-qcet-interactive
%X Four years on from two papers (Belz et al., 2020; Howcroft et al., 2020) that first called out the lack of standardisation and comparability in the quality criteria assessed in NLP system evaluations, researchers still use widely differing quality criteria names and definitions, meaning that it continues to be unclear when the same aspect of quality is being assessed in two evaluations. While normalised quality criteria were proposed at the time, the list was unwieldy and using it came with a steep learning curve. In this demo paper, our aim is to address these issues with an interactive taxonomy tool that enables quick perusal and selection of the quality criteria, and provides decision support and examples of use at each node.
%U https://aclanthology.org/2024.inlg-demos.4
%P 9-12
Markdown (Informal)
[QCET: An Interactive Taxonomy of Quality Criteria for Comparable and Repeatable Evaluation of NLP Systems](https://aclanthology.org/2024.inlg-demos.4) (Belz et al., INLG 2024)
ACL
Anya Belz, Simon Mille, Craig Thomson, and Rudali Huidrom. 2024. QCET: An Interactive Taxonomy of Quality Criteria for Comparable and Repeatable Evaluation of NLP Systems. In Proceedings of the 17th International Natural Language Generation Conference: System Demonstrations, pages 9–12, Tokyo, Japan. Association for Computational Linguistics.