@inproceedings{walker-copperman-2010-evaluating,
title = "Evaluating Complex Semantic Artifacts",
author = "Walker, Christopher R and
Copperman, Hannah",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/441_Paper.pdf",
abstract = "Evaluating complex Natural Language Processing (NLP) systems can prove extremely difficult. In many cases, the best one can do is to evaluate these systems indirectly, by looking at the impact they have on the performance of the downstream use case. For complex end-to-end systems, these metrics are not always enlightening, especially from the perspective of NLP failure analysis, as component interaction can obscure issues specific to the NLP technology. We present an evaluation program for complex NLP systems designed to produce meaningful aggregate accuracy metrics with sufficient granularity to support active development by NLP specialists. Our goals were threefold: to produce reliable metrics, to produce useful metrics and to produce actionable data. Our use case is a graph-based Wikipedia search index. Since the evaluation of a complex graph structure is beyond the conceptual grasp of a single human judge, the problem needs to be broken down. Slices of complex data reflective of coherent Decision Points provide a good framework for evaluation using human judges (Medero et al., 2006). For NL semantics, there really is no substitute. Leveraging Decision Points allows complex semantic artifacts to be tracked with judge-driven evaluations that are accurate, timely and actionable.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="walker-copperman-2010-evaluating">
<titleInfo>
<title>Evaluating Complex Semantic Artifacts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Walker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannah</namePart>
<namePart type="family">Copperman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluating complex Natural Language Processing (NLP) systems can prove extremely difficult. In many cases, the best one can do is to evaluate these systems indirectly, by looking at the impact they have on the performance of the downstream use case. For complex end-to-end systems, these metrics are not always enlightening, especially from the perspective of NLP failure analysis, as component interaction can obscure issues specific to the NLP technology. We present an evaluation program for complex NLP systems designed to produce meaningful aggregate accuracy metrics with sufficient granularity to support active development by NLP specialists. Our goals were threefold: to produce reliable metrics, to produce useful metrics and to produce actionable data. Our use case is a graph-based Wikipedia search index. Since the evaluation of a complex graph structure is beyond the conceptual grasp of a single human judge, the problem needs to be broken down. Slices of complex data reflective of coherent Decision Points provide a good framework for evaluation using human judges (Medero et al., 2006). For NL semantics, there really is no substitute. Leveraging Decision Points allows complex semantic artifacts to be tracked with judge-driven evaluations that are accurate, timely and actionable.</abstract>
<identifier type="citekey">walker-copperman-2010-evaluating</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/441_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Complex Semantic Artifacts
%A Walker, Christopher R.
%A Copperman, Hannah
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F walker-copperman-2010-evaluating
%X Evaluating complex Natural Language Processing (NLP) systems can prove extremely difficult. In many cases, the best one can do is to evaluate these systems indirectly, by looking at the impact they have on the performance of the downstream use case. For complex end-to-end systems, these metrics are not always enlightening, especially from the perspective of NLP failure analysis, as component interaction can obscure issues specific to the NLP technology. We present an evaluation program for complex NLP systems designed to produce meaningful aggregate accuracy metrics with sufficient granularity to support active development by NLP specialists. Our goals were threefold: to produce reliable metrics, to produce useful metrics and to produce actionable data. Our use case is a graph-based Wikipedia search index. Since the evaluation of a complex graph structure is beyond the conceptual grasp of a single human judge, the problem needs to be broken down. Slices of complex data reflective of coherent Decision Points provide a good framework for evaluation using human judges (Medero et al., 2006). For NL semantics, there really is no substitute. Leveraging Decision Points allows complex semantic artifacts to be tracked with judge-driven evaluations that are accurate, timely and actionable.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/441_Paper.pdf
Markdown (Informal)
[Evaluating Complex Semantic Artifacts](http://www.lrec-conf.org/proceedings/lrec2010/pdf/441_Paper.pdf) (Walker & Copperman, LREC 2010)
ACL
- Christopher R Walker and Hannah Copperman. 2010. Evaluating Complex Semantic Artifacts. In Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10), Valletta, Malta. European Language Resources Association (ELRA).