@inproceedings{yamashita-etal-2018-comparison,
title = "A Comparison of Entity Matching Methods between {E}nglish and {J}apanese Katakana",
author = "Yamashita, Michiharu and
Awashima, Hideki and
Oiwa, Hidekazu",
editor = "Kuebler, Sandra and
Nicolai, Garrett",
booktitle = "Proceedings of the Fifteenth Workshop on Computational Research in Phonetics, Phonology, and Morphology",
month = oct,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-5809",
doi = "10.18653/v1/W18-5809",
pages = "84--92",
abstract = "Japanese Katakana is one component of the Japanese writing system and is used to express English terms, loanwords, and onomatopoeia in Japanese characters based on the phonemes. The main purpose of this research is to find the best entity matching methods between English and Katakana. We built two research questions to clarify which types of entity matching systems works better than others. The first question is what transliteration should be used for conversion. We need to transliterate English or Katakana terms into the same form in order to compute the string similarity. We consider five conversions that transliterate English to Katakana directly, Katakana to English directly, English to Katakana via phoneme, Katakana to English via phoneme, and both English and Katakana to phoneme. The second question is what should be used for the similarity measure at entity matching. To investigate the problem, we choose six methods, which are Overlap Coefficient, Cosine, Jaccard, Jaro-Winkler, Levenshtein, and the similarity of the phoneme probability predicted by RNN. Our results show that 1) matching using phonemes and conversion of Katakana to English works better than other methods, and 2) the similarity of phonemes outperforms other methods while other similarity score is changed depending on data and models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yamashita-etal-2018-comparison">
<titleInfo>
<title>A Comparison of Entity Matching Methods between English and Japanese Katakana</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michiharu</namePart>
<namePart type="family">Yamashita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hideki</namePart>
<namePart type="family">Awashima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hidekazu</namePart>
<namePart type="family">Oiwa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifteenth Workshop on Computational Research in Phonetics, Phonology, and Morphology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sandra</namePart>
<namePart type="family">Kuebler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Garrett</namePart>
<namePart type="family">Nicolai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Brussels, Belgium</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Japanese Katakana is one component of the Japanese writing system and is used to express English terms, loanwords, and onomatopoeia in Japanese characters based on the phonemes. The main purpose of this research is to find the best entity matching methods between English and Katakana. We built two research questions to clarify which types of entity matching systems works better than others. The first question is what transliteration should be used for conversion. We need to transliterate English or Katakana terms into the same form in order to compute the string similarity. We consider five conversions that transliterate English to Katakana directly, Katakana to English directly, English to Katakana via phoneme, Katakana to English via phoneme, and both English and Katakana to phoneme. The second question is what should be used for the similarity measure at entity matching. To investigate the problem, we choose six methods, which are Overlap Coefficient, Cosine, Jaccard, Jaro-Winkler, Levenshtein, and the similarity of the phoneme probability predicted by RNN. Our results show that 1) matching using phonemes and conversion of Katakana to English works better than other methods, and 2) the similarity of phonemes outperforms other methods while other similarity score is changed depending on data and models.</abstract>
<identifier type="citekey">yamashita-etal-2018-comparison</identifier>
<identifier type="doi">10.18653/v1/W18-5809</identifier>
<location>
<url>https://aclanthology.org/W18-5809</url>
</location>
<part>
<date>2018-10</date>
<extent unit="page">
<start>84</start>
<end>92</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Comparison of Entity Matching Methods between English and Japanese Katakana
%A Yamashita, Michiharu
%A Awashima, Hideki
%A Oiwa, Hidekazu
%Y Kuebler, Sandra
%Y Nicolai, Garrett
%S Proceedings of the Fifteenth Workshop on Computational Research in Phonetics, Phonology, and Morphology
%D 2018
%8 October
%I Association for Computational Linguistics
%C Brussels, Belgium
%F yamashita-etal-2018-comparison
%X Japanese Katakana is one component of the Japanese writing system and is used to express English terms, loanwords, and onomatopoeia in Japanese characters based on the phonemes. The main purpose of this research is to find the best entity matching methods between English and Katakana. We built two research questions to clarify which types of entity matching systems works better than others. The first question is what transliteration should be used for conversion. We need to transliterate English or Katakana terms into the same form in order to compute the string similarity. We consider five conversions that transliterate English to Katakana directly, Katakana to English directly, English to Katakana via phoneme, Katakana to English via phoneme, and both English and Katakana to phoneme. The second question is what should be used for the similarity measure at entity matching. To investigate the problem, we choose six methods, which are Overlap Coefficient, Cosine, Jaccard, Jaro-Winkler, Levenshtein, and the similarity of the phoneme probability predicted by RNN. Our results show that 1) matching using phonemes and conversion of Katakana to English works better than other methods, and 2) the similarity of phonemes outperforms other methods while other similarity score is changed depending on data and models.
%R 10.18653/v1/W18-5809
%U https://aclanthology.org/W18-5809
%U https://doi.org/10.18653/v1/W18-5809
%P 84-92
Markdown (Informal)
[A Comparison of Entity Matching Methods between English and Japanese Katakana](https://aclanthology.org/W18-5809) (Yamashita et al., EMNLP 2018)
ACL