% Hamon (LREC 2010): reliability of human judges in MT evaluation.
@inproceedings{hamon-2010-judge,
  author    = {Hamon, Olivier},
  title     = {Is my Judge a good One?},
  editor    = {Calzolari, Nicoletta and
               Choukri, Khalid and
               Maegaard, Bente and
               Mariani, Joseph and
               Odijk, Jan and
               Piperidis, Stelios and
               Rosner, Mike and
               Tapias, Daniel},
  booktitle = {Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)},
  year      = {2010},
  month     = may,
  address   = {Valletta, Malta},
  publisher = {European Language Resources Association (ELRA)},
  url       = {http://www.lrec-conf.org/proceedings/lrec2010/pdf/402_Paper.pdf},
  abstract  = {This paper aims at measuring the reliability of judges in MT evaluation. The scope is two evaluation campaigns from the CESTA project, during which human evaluations were carried out on fluency and adequacy criteria for English-to-French documents. Our objectives were threefold: observe both inter- and intra-judge agreements, and then study the influence of the evaluation design especially implemented for the need of the campaigns. Indeed, a web interface was especially developed to help with the human judgments and store the results, but some design changes were made between the first and the second campaign. Considering the low agreements observed, the judges' behaviour has been analysed in that specific context. We also asked several judges to repeat their own evaluations a few times after the first judgments done during the official evaluation campaigns. Even if judges did not seem to agree fully at first sight, a less strict comparison led to a strong agreement. Furthermore, the evolution of the design during the project seemed to have been a source for the difficulties that judges encountered to keep the same interpretation of quality.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="hamon-2010-judge">
    <titleInfo>
      <title>Is my Judge a good One?</title>
    </titleInfo>
    <!-- Author of the paper. -->
    <name type="personal">
      <namePart type="given">Olivier</namePart>
      <namePart type="family">Hamon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2010-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Choukri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bente</namePart>
        <namePart type="family">Maegaard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joseph</namePart>
        <namePart type="family">Mariani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jan</namePart>
        <namePart type="family">Odijk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stelios</namePart>
        <namePart type="family">Piperidis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mike</namePart>
        <namePart type="family">Rosner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Tapias</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association (ELRA)</publisher>
        <place>
          <placeTerm type="text">Valletta, Malta</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper aims at measuring the reliability of judges in MT evaluation. The scope is two evaluation campaigns from the CESTA project, during which human evaluations were carried out on fluency and adequacy criteria for English-to-French documents. Our objectives were threefold: observe both inter- and intra-judge agreements, and then study the influence of the evaluation design especially implemented for the need of the campaigns. Indeed, a web interface was especially developed to help with the human judgments and store the results, but some design changes were made between the first and the second campaign. Considering the low agreements observed, the judges’ behaviour has been analysed in that specific context. We also asked several judges to repeat their own evaluations a few times after the first judgments done during the official evaluation campaigns. Even if judges did not seem to agree fully at first sight, a less strict comparison led to a strong agreement. Furthermore, the evolution of the design during the project seemed to have been a source for the difficulties that judges encountered to keep the same interpretation of quality.</abstract>
    <identifier type="citekey">hamon-2010-judge</identifier>
    <location>
      <url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/402_Paper.pdf</url>
    </location>
    <part>
      <date>2010-05</date>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Is my Judge a good One?
%A Hamon, Olivier
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F hamon-2010-judge
%X This paper aims at measuring the reliability of judges in MT evaluation. The scope is two evaluation campaigns from the CESTA project, during which human evaluations were carried out on fluency and adequacy criteria for English-to-French documents. Our objectives were threefold: observe both inter- and intra-judge agreements, and then study the influence of the evaluation design especially implemented for the need of the campaigns. Indeed, a web interface was especially developed to help with the human judgments and store the results, but some design changes were made between the first and the second campaign. Considering the low agreements observed, the judges’ behaviour has been analysed in that specific context. We also asked several judges to repeat their own evaluations a few times after the first judgments done during the official evaluation campaigns. Even if judges did not seem to agree fully at first sight, a less strict comparison led to a strong agreement. Furthermore, the evolution of the design during the project seemed to have been a source for the difficulties that judges encountered to keep the same interpretation of quality.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/402_Paper.pdf
Markdown (Informal)
[Is my Judge a good One?](http://www.lrec-conf.org/proceedings/lrec2010/pdf/402_Paper.pdf) (Hamon, LREC 2010)
ACL
- Olivier Hamon. 2010. Is my Judge a good One? In Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10), Valletta, Malta. European Language Resources Association (ELRA).