@inproceedings{kim-2025-rubric,
title = "{RUBRIC}-{MQM} : Span-Level {LLM}-as-judge in Machine Translation For High-End Models",
author = "Kim, Ahrii",
editor = "Rehm, Georg and
Li, Yunyao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-industry.12/",
doi = "10.18653/v1/2025.acl-industry.12",
pages = "147--165",
ISBN = "979-8-89176-288-6",
abstract = "Referred to as $\textit{LLM-as-judge}$, a generative large language model (LLM) has demonstrated considerable efficacy as an evaluator in various tasks, including Machine Translation (LAJ-MT) by predicting scores or identifying error types for individual sentences. However, its dependability in practical application has yet to be demonstrated, as there is only an $\textit{approximated match}$ due to the task{'}s open-ended nature. To address this problem, we introduce a straightforward and novel meta-evaluation strategy $\textbf{PromptCUE}$ and evaluate cutting-edge LAJ-MT models such as GEMBA-MQM. We identify their fundamental deficits, including certain label biases and the inability to assess near-perfect translations.To improve reliability, we investigate more trustworthy and less biased models using multidimensional prompt engineering. Our findings indicate that the combination of span-level error quantification and a rubric-style prompt tailored to the characteristics of LLMs has efficiently addressed the majority of the challenges current LAJ-MT models face. Furthermore, it demonstrates a considerably enhanced alignment with human values. Accordingly, we present $\textbf{Rubric-MQM}$, the LAJ-MT for high-end models and an updated version of GEMBA-MQM."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kim-2025-rubric">
    <titleInfo>
      <title>RUBRIC-MQM : Span-Level LLM-as-judge in Machine Translation For High-End Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ahrii</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Georg</namePart>
        <namePart type="family">Rehm</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-288-6</identifier>
    </relatedItem>
    <abstract>Referred to as LLM-as-judge, a generative large language model (LLM) has demonstrated considerable efficacy as an evaluator in various tasks, including Machine Translation (LAJ-MT) by predicting scores or identifying error types for individual sentences. However, its dependability in practical application has yet to be demonstrated, as there is only an approximated match due to the task’s open-ended nature. To address this problem, we introduce a straightforward and novel meta-evaluation strategy PromptCUE and evaluate cutting-edge LAJ-MT models such as GEMBA-MQM. We identify their fundamental deficits, including certain label biases and the inability to assess near-perfect translations. To improve reliability, we investigate more trustworthy and less biased models using multidimensional prompt engineering. Our findings indicate that the combination of span-level error quantification and a rubric-style prompt tailored to the characteristics of LLMs has efficiently addressed the majority of the challenges current LAJ-MT models face. Furthermore, it demonstrates a considerably enhanced alignment with human values. Accordingly, we present Rubric-MQM, the LAJ-MT for high-end models and an updated version of GEMBA-MQM.</abstract>
    <identifier type="citekey">kim-2025-rubric</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-industry.12</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-industry.12/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>147</start>
        <end>165</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T RUBRIC-MQM : Span-Level LLM-as-judge in Machine Translation For High-End Models
%A Kim, Ahrii
%Y Rehm, Georg
%Y Li, Yunyao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-288-6
%F kim-2025-rubric
%X Referred to as LLM-as-judge, a generative large language model (LLM) has demonstrated considerable efficacy as an evaluator in various tasks, including Machine Translation (LAJ-MT) by predicting scores or identifying error types for individual sentences. However, its dependability in practical application has yet to be demonstrated, as there is only an approximated match due to the task’s open-ended nature. To address this problem, we introduce a straightforward and novel meta-evaluation strategy PromptCUE and evaluate cutting-edge LAJ-MT models such as GEMBA-MQM. We identify their fundamental deficits, including certain label biases and the inability to assess near-perfect translations. To improve reliability, we investigate more trustworthy and less biased models using multidimensional prompt engineering. Our findings indicate that the combination of span-level error quantification and a rubric-style prompt tailored to the characteristics of LLMs has efficiently addressed the majority of the challenges current LAJ-MT models face. Furthermore, it demonstrates a considerably enhanced alignment with human values. Accordingly, we present Rubric-MQM, the LAJ-MT for high-end models and an updated version of GEMBA-MQM.
%R 10.18653/v1/2025.acl-industry.12
%U https://aclanthology.org/2025.acl-industry.12/
%U https://doi.org/10.18653/v1/2025.acl-industry.12
%P 147-165
Markdown (Informal)
[RUBRIC-MQM : Span-Level LLM-as-judge in Machine Translation For High-End Models](https://aclanthology.org/2025.acl-industry.12/) (Kim, ACL 2025)
ACL
Ahrii Kim. 2025. RUBRIC-MQM : Span-Level LLM-as-judge in Machine Translation For High-End Models. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track), pages 147–165, Vienna, Austria. Association for Computational Linguistics.