@inproceedings{tate-etal-2003-task,
title = "Task-based {MT} evaluation: tackling software, experimental design, {\&} statistical models.",
author = "Tate, Calandra and
Lee, Sooyon and
Voss, Clare R.",
booktitle = "Workshop on Systemizing MT Evaluation",
month = sep # " 23-27",
year = "2003",
address = "New Orleans, USA",
url = "https://aclanthology.org/2003.mtsummit-eval.6",
abstract = "Even with recent, renewed attention to MT evaluation{---}due in part to n-gram-based metrics (Papineni et al., 2001; Doddington, 2002) and the extensive, online catalogue of MT metrics on the ISLE project (Hovy et al., 2001, 2003), few reports involving task-based metrics have surfaced. This paper presents our work on three parts of task-based MT evaluation: (i) software to track and record users' task performance via a browser, run from a desktop computer or remotely over the web, (ii) factorial experimental design with replicate observations to compare the MT engines, based on the accuracy of users' task responses, and (iii) the use of chi-squared and generalized linear models (GLMs) to permit finer-grained data analyses. We report on the experimental results of a six-way document categorization task, used for the evaluation of three Korean-English MT engines. The statistical models of the probabilities of correct responses yield an ordering of the MT engines, with one engine having a statistically significant lead over the other two. Future research will involve testing user performance on linguistically more complex tasks, as well as extending our initial GLMs with the documents' Bleu scores as variables, to test the scores as independent predictors of task results.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tate-etal-2003-task">
<titleInfo>
<title>Task-based MT evaluation: tackling software, experimental design, & statistical models.</title>
</titleInfo>
<name type="personal">
<namePart type="given">Calandra</namePart>
<namePart type="family">Tate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sooyon</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clare</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Voss</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2003-sep 23-27</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Workshop on Systemizing MT Evaluation</title>
</titleInfo>
<originInfo>
<place>
<placeTerm type="text">New Orleans, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Even with recent, renewed attention to MT evaluation—due in part to n-gram-based metrics (Papineni et al., 2001; Doddington, 2002) and the extensive, online catalogue of MT metrics on the ISLE project (Hovy et al., 2001, 2003), few reports involving task-based metrics have surfaced. This paper presents our work on three parts of task-based MT evaluation: (i) software to track and record users’ task performance via a browser, run from a desktop computer or remotely over the web, (ii) factorial experimental design with replicate observations to compare the MT engines, based on the accuracy of users’ task responses, and (iii) the use of chi-squared and generalized linear models (GLMs) to permit finer-grained data analyses. We report on the experimental results of a six-way document categorization task, used for the evaluation of three Korean-English MT engines. The statistical models of the probabilities of correct responses yield an ordering of the MT engines, with one engine having a statistically significant lead over the other two. Future research will involve testing user performance on linguistically more complex tasks, as well as extending our initial GLMs with the documents’ Bleu scores as variables, to test the scores as independent predictors of task results.</abstract>
<identifier type="citekey">tate-etal-2003-task</identifier>
<location>
<url>https://aclanthology.org/2003.mtsummit-eval.6</url>
</location>
<part>
<date>2003-sep 23-27</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Task-based MT evaluation: tackling software, experimental design, & statistical models.
%A Tate, Calandra
%A Lee, Sooyon
%A Voss, Clare R.
%S Workshop on Systemizing MT Evaluation
%D 2003
%8 sep 23 27
%C New Orleans, USA
%F tate-etal-2003-task
%X Even with recent, renewed attention to MT evaluation—due in part to n-gram-based metrics (Papineni et al., 2001; Doddington, 2002) and the extensive, online catalogue of MT metrics on the ISLE project (Hovy et al., 2001, 2003), few reports involving task-based metrics have surfaced. This paper presents our work on three parts of task-based MT evaluation: (i) software to track and record users’ task performance via a browser, run from a desktop computer or remotely over the web, (ii) factorial experimental design with replicate observations to compare the MT engines, based on the accuracy of users’ task responses, and (iii) the use of chi-squared and generalized linear models (GLMs) to permit finer-grained data analyses. We report on the experimental results of a six-way document categorization task, used for the evaluation of three Korean-English MT engines. The statistical models of the probabilities of correct responses yield an ordering of the MT engines, with one engine having a statistically significant lead over the other two. Future research will involve testing user performance on linguistically more complex tasks, as well as extending our initial GLMs with the documents’ Bleu scores as variables, to test the scores as independent predictors of task results.
%U https://aclanthology.org/2003.mtsummit-eval.6
Markdown (Informal)
[Task-based MT evaluation: tackling software, experimental design, & statistical models.](https://aclanthology.org/2003.mtsummit-eval.6) (Tate et al., MTSummit 2003)
ACL