% Conference paper (LREC 2008): ground truth dataset for matching Romanized person names.
% Proper nouns/acronyms in title and booktitle are brace-protected as whole words so
% sentence-casing styles keep their capitalisation.
@inproceedings{arehart-miller-2008-ground,
  title     = {A Ground Truth Dataset for Matching Culturally Diverse {Romanized} Person Names},
  author    = {Arehart, Mark and
               Miller, Keith J.},
  editor    = {Calzolari, Nicoletta and
               Choukri, Khalid and
               Maegaard, Bente and
               Mariani, Joseph and
               Odijk, Jan and
               Piperidis, Stelios and
               Tapias, Daniel},
  booktitle = {Proceedings of the Sixth International Conference on Language Resources and Evaluation ({LREC}'08)},
  month     = may,
  year      = {2008},
  address   = {Marrakech, Morocco},
  publisher = {European Language Resources Association (ELRA)},
  url       = {http://www.lrec-conf.org/proceedings/lrec2008/pdf/291_paper.pdf},
  abstract  = {This paper describes the development of a ground truth dataset of culturally diverse Romanized names in which approximately 70,000 names are matched against a subset of 700. We ran the subset as queries against the complete list using several matchers, created adjudication pools, adjudicated the results, and compiled two versions of ground truth based on different sets of adjudication guidelines and methods for resolving adjudicator conflicts. The name list, drawn from publicly available sources, was manually seeded with over 1500 name variants. These names include transliteration variation, database fielding errors, segmentation differences, incomplete names, titles, initials, abbreviations, nicknames, typos, OCR errors, and truncated data. These diverse types of matches, along with the coincidental name similarities already in the list, make possible a comprehensive evaluation of name matching systems. We have used the dataset to evaluate several open source and commercial algorithms and provide some of those results.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper identified by citekey arehart-miller-2008-ground. -->
  <mods ID="arehart-miller-2008-ground">
    <!-- Paper title -->
    <titleInfo>
      <title>A Ground Truth Dataset for Matching Culturally Diverse Romanized Person Names</title>
    </titleInfo>
    <!-- Authors, in order -->
    <name type="personal">
      <namePart type="given">Mark</namePart>
      <namePart type="family">Arehart</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Keith</namePart>
      <namePart type="given">J</namePart>
      <namePart type="family">Miller</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Issue date of the paper (year-month) -->
    <originInfo>
      <dateIssued>2008-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC’08)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Choukri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bente</namePart>
        <namePart type="family">Maegaard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joseph</namePart>
        <namePart type="family">Mariani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jan</namePart>
        <namePart type="family">Odijk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stelios</namePart>
        <namePart type="family">Piperidis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Tapias</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association (ELRA)</publisher>
        <place>
          <placeTerm type="text">Marrakech, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <!-- Abstract text is kept on a single line so its character content is unchanged -->
    <abstract>This paper describes the development of a ground truth dataset of culturally diverse Romanized names in which approximately 70,000 names are matched against a subset of 700. We ran the subset as queries against the complete list using several matchers, created adjudication pools, adjudicated the results, and compiled two versions of ground truth based on different sets of adjudication guidelines and methods for resolving adjudicator conflicts. The name list, drawn from publicly available sources, was manually seeded with over 1500 name variants. These names include transliteration variation, database fielding errors, segmentation differences, incomplete names, titles, initials, abbreviations, nicknames, typos, OCR errors, and truncated data. These diverse types of matches, along with the coincidental name similarities already in the list, make possible a comprehensive evaluation of name matching systems. We have used the dataset to evaluate several open source and commercial algorithms and provide some of those results.</abstract>
    <!-- Identifier and location of the full text -->
    <identifier type="citekey">arehart-miller-2008-ground</identifier>
    <location>
      <url>http://www.lrec-conf.org/proceedings/lrec2008/pdf/291_paper.pdf</url>
    </location>
    <part>
      <date>2008-05</date>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A Ground Truth Dataset for Matching Culturally Diverse Romanized Person Names
%A Arehart, Mark
%A Miller, Keith J.
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Tapias, Daniel
%S Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC’08)
%D 2008
%8 May
%I European Language Resources Association (ELRA)
%C Marrakech, Morocco
%F arehart-miller-2008-ground
%X This paper describes the development of a ground truth dataset of culturally diverse Romanized names in which approximately 70,000 names are matched against a subset of 700. We ran the subset as queries against the complete list using several matchers, created adjudication pools, adjudicated the results, and compiled two versions of ground truth based on different sets of adjudication guidelines and methods for resolving adjudicator conflicts. The name list, drawn from publicly available sources, was manually seeded with over 1500 name variants. These names include transliteration variation, database fielding errors, segmentation differences, incomplete names, titles, initials, abbreviations, nicknames, typos, OCR errors, and truncated data. These diverse types of matches, along with the coincidental name similarities already in the list, make possible a comprehensive evaluation of name matching systems. We have used the dataset to evaluate several open source and commercial algorithms and provide some of those results.
%U http://www.lrec-conf.org/proceedings/lrec2008/pdf/291_paper.pdf
Markdown (Informal)
[A Ground Truth Dataset for Matching Culturally Diverse Romanized Person Names](http://www.lrec-conf.org/proceedings/lrec2008/pdf/291_paper.pdf) (Arehart & Miller, LREC 2008)
ACL