@inproceedings{jiang-etal-2022-im2,
title = "{IM}2: an Interpretable and Multi-category Integrated Metric Framework for Automatic Dialogue Evaluation",
author = "Jiang, Zhihua and
Ye, Guanghui and
Rao, Dongning and
Wang, Di and
Miao, Xin",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.762",
doi = "10.18653/v1/2022.emnlp-main.762",
pages = "11091--11103",
abstract = "Evaluation metrics shine the light on the best models and thus strongly influence the research directions, such as the recently developed dialogue metrics USR, FED, and GRADE. However, most current metrics evaluate the dialogue data as isolated and static because they only focus on a single quality or several qualities. To mitigate the problem, this paper proposes an interpretable, multi-faceted, and controllable framework IM{\textasciicircum}2 (Interpretable and Multi-category Integrated Metric) to combine a large number of metrics which are good at measuring different qualities. The IM{\textasciicircum}2 framework first divides current popular dialogue qualities into different categories and then applies or proposes dialogue metrics to measure the qualities within each category and finally generates an overall IM{\textasciicircum}2 score. An initial version of IM{\textasciicircum}2 was submitted to the AAAI 2022 Track5.1@DSTC10 challenge and took the 2{\textasciicircum}nd place on both of the development and test leaderboard. After the competition, we develop more metrics and improve the performance of our model. We compare IM{\textasciicircum}2 with other 13 current dialogue metrics and experimental results show that IM{\textasciicircum}2 correlates more strongly with human judgments than any of them on each evaluated dataset.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jiang-etal-2022-im2">
<titleInfo>
<title>IM2: an Interpretable and Multi-category Integrated Metric Framework for Automatic Dialogue Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhihua</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guanghui</namePart>
<namePart type="family">Ye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongning</namePart>
<namePart type="family">Rao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Di</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Miao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluation metrics shine the light on the best models and thus strongly influence the research directions, such as the recently developed dialogue metrics USR, FED, and GRADE. However, most current metrics evaluate the dialogue data as isolated and static because they only focus on a single quality or several qualities. To mitigate the problem, this paper proposes an interpretable, multi-faceted, and controllable framework IM^2 (Interpretable and Multi-category Integrated Metric) to combine a large number of metrics which are good at measuring different qualities. The IM^2 framework first divides current popular dialogue qualities into different categories and then applies or proposes dialogue metrics to measure the qualities within each category and finally generates an overall IM^2 score. An initial version of IM^2 was submitted to the AAAI 2022 Track5.1@DSTC10 challenge and took the 2^nd place on both of the development and test leaderboard. After the competition, we develop more metrics and improve the performance of our model. We compare IM^2 with other 13 current dialogue metrics and experimental results show that IM^2 correlates more strongly with human judgments than any of them on each evaluated dataset.</abstract>
<identifier type="citekey">jiang-etal-2022-im2</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.762</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.762</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>11091</start>
<end>11103</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T IM2: an Interpretable and Multi-category Integrated Metric Framework for Automatic Dialogue Evaluation
%A Jiang, Zhihua
%A Ye, Guanghui
%A Rao, Dongning
%A Wang, Di
%A Miao, Xin
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F jiang-etal-2022-im2
%X Evaluation metrics shine the light on the best models and thus strongly influence the research directions, such as the recently developed dialogue metrics USR, FED, and GRADE. However, most current metrics evaluate the dialogue data as isolated and static because they only focus on a single quality or several qualities. To mitigate the problem, this paper proposes an interpretable, multi-faceted, and controllable framework IM^2 (Interpretable and Multi-category Integrated Metric) to combine a large number of metrics which are good at measuring different qualities. The IM^2 framework first divides current popular dialogue qualities into different categories and then applies or proposes dialogue metrics to measure the qualities within each category and finally generates an overall IM^2 score. An initial version of IM^2 was submitted to the AAAI 2022 Track5.1@DSTC10 challenge and took the 2^nd place on both of the development and test leaderboard. After the competition, we develop more metrics and improve the performance of our model. We compare IM^2 with other 13 current dialogue metrics and experimental results show that IM^2 correlates more strongly with human judgments than any of them on each evaluated dataset.
%R 10.18653/v1/2022.emnlp-main.762
%U https://aclanthology.org/2022.emnlp-main.762
%U https://doi.org/10.18653/v1/2022.emnlp-main.762
%P 11091-11103
Markdown (Informal)
[IM2: an Interpretable and Multi-category Integrated Metric Framework for Automatic Dialogue Evaluation](https://aclanthology.org/2022.emnlp-main.762) (Jiang et al., EMNLP 2022)
ACL