@comment{SIGMORPHON 2024 workshop paper; ACL Anthology ID 2024.sigmorphon-1.9.
  Proper nouns and acronyms are brace-protected as whole words so that
  sentence-casing bibliography styles keep their capitalisation.}
@inproceedings{matogawa-etal-2024-japanese,
    title     = {{Japanese} Rule-based Grapheme-to-phoneme Conversion System and Multilingual Named Entity Dataset with {International} {Phonetic} {Alphabet}},
    author    = {Matogawa, Yuhi and
                 Sakai, Yusuke and
                 Watanabe, Taro and
                 Taguchi, Chihiro},
    editor    = {Nicolai, Garrett and
                 Chodroff, Eleanor and
                 Mailhot, Frederic and
                 {\c{C}}{\"o}ltekin, {\c{C}}a{\u{g}}r{\i}},
    booktitle = {Proceedings of the 21st {SIGMORPHON} workshop on Computational Research in Phonetics, Phonology, and Morphology},
    month     = jun,
    year      = {2024},
    address   = {Mexico City, Mexico},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2024.sigmorphon-1.9},
    doi       = {10.18653/v1/2024.sigmorphon-1.9},
    pages     = {77--86},
    abstract  = {In Japanese, loanwords are primarily written in Katakana, a syllabic writing system, based on their pronunciation. However, the transliterated loanwords often exhibit spelling variations, such as the word {``}Hepburn{''} being written as {``}ヘボン (hebon){''}, {``}ヘプバーン (hepubaan){''}, {``}ヘップバーン (heppubaan){''}. These orthographical variants pose a bottleneck in multilingual Named Entity Recognition (NER), because named entities (NEs) do not have one-to-one matches. In this study, we introduce a rule-based grapheme-to-phoneme (G2P) system for Japanese based on literature in linguistics and a large-scale multilingual NE dataset with annotations of the International Phonetic Alphabet (IPA), focusing on IPA to address the Katakana spelling variations in loanwords. These rules and dataset are expected to be beneficial for tasks such as NE aggregation, G2P system, construction of cross-lingual language models, and entity linking. We hope our work advances research on Japanese NER with multilingual loanwords by solving the spelling ambiguities.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS v3 record for the paper with citekey matogawa-etal-2024-japanese. -->
<mods ID="matogawa-etal-2024-japanese">
<titleInfo>
<title>Japanese Rule-based Grapheme-to-phoneme Conversion System and Multilingual Named Entity Dataset with International Phonetic Alphabet</title>
</titleInfo>
<!-- Paper authors, one <name> element per person, each with the "author" role. -->
<name type="personal">
<namePart type="given">Yuhi</namePart>
<namePart type="family">Matogawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Sakai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taro</namePart>
<namePart type="family">Watanabe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chihiro</namePart>
<namePart type="family">Taguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper itself (year and month only). -->
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing the paper, with its editors, publisher and place. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st SIGMORPHON workshop on Computational Research in Phonetics, Phonology, and Morphology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Garrett</namePart>
<namePart type="family">Nicolai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eleanor</namePart>
<namePart type="family">Chodroff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Mailhot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Çağrı</namePart>
<namePart type="family">Çöltekin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In Japanese, loanwords are primarily written in Katakana, a syllabic writing system, based on their pronunciation. However, the transliterated loanwords often exhibit spelling variations, such as the word “Hepburn” being written as “ヘボン (hebon)”, “ヘプバーン (hepubaan)”, “ヘップバーン (heppubaan)”. These orthographical variants pose a bottleneck in multilingual Named Entity Recognition (NER), because named entities (NEs) do not have one-to-one matches. In this study, we introduce a rule-based grapheme-to-phoneme (G2P) system for Japanese based on literature in linguistics and a large-scale multilingual NE dataset with annotations of the International Phonetic Alphabet (IPA), focusing on IPA to address the Katakana spelling variations in loanwords. These rules and dataset are expected to be beneficial for tasks such as NE aggregation, G2P system, construction of cross-lingual language models, and entity linking. We hope our work advances research on Japanese NER with multilingual loanwords by solving the spelling ambiguities.</abstract>
<!-- Identifiers: the citation key and the bare DOI (no resolver prefix), followed by the landing-page URL. -->
<identifier type="citekey">matogawa-etal-2024-japanese</identifier>
<identifier type="doi">10.18653/v1/2024.sigmorphon-1.9</identifier>
<location>
<url>https://aclanthology.org/2024.sigmorphon-1.9</url>
</location>
<!-- Position within the host volume: date and page extent (first and last page). -->
<part>
<date>2024-06</date>
<extent unit="page">
<start>77</start>
<end>86</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Japanese Rule-based Grapheme-to-phoneme Conversion System and Multilingual Named Entity Dataset with International Phonetic Alphabet
%A Matogawa, Yuhi
%A Sakai, Yusuke
%A Watanabe, Taro
%A Taguchi, Chihiro
%Y Nicolai, Garrett
%Y Chodroff, Eleanor
%Y Mailhot, Frederic
%Y Çöltekin, Çağrı
%S Proceedings of the 21st SIGMORPHON workshop on Computational Research in Phonetics, Phonology, and Morphology
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F matogawa-etal-2024-japanese
%X In Japanese, loanwords are primarily written in Katakana, a syllabic writing system, based on their pronunciation. However, the transliterated loanwords often exhibit spelling variations, such as the word “Hepburn” being written as “ヘボン (hebon)”, “ヘプバーン (hepubaan)”, “ヘップバーン (heppubaan)”. These orthographical variants pose a bottleneck in multilingual Named Entity Recognition (NER), because named entities (NEs) do not have one-to-one matches. In this study, we introduce a rule-based grapheme-to-phoneme (G2P) system for Japanese based on literature in linguistics and a large-scale multilingual NE dataset with annotations of the International Phonetic Alphabet (IPA), focusing on IPA to address the Katakana spelling variations in loanwords. These rules and dataset are expected to be beneficial for tasks such as NE aggregation, G2P system, construction of cross-lingual language models, and entity linking. We hope our work advances research on Japanese NER with multilingual loanwords by solving the spelling ambiguities.
%R 10.18653/v1/2024.sigmorphon-1.9
%U https://aclanthology.org/2024.sigmorphon-1.9
%U https://doi.org/10.18653/v1/2024.sigmorphon-1.9
%P 77-86
Markdown (Informal)
[Japanese Rule-based Grapheme-to-phoneme Conversion System and Multilingual Named Entity Dataset with International Phonetic Alphabet](https://aclanthology.org/2024.sigmorphon-1.9) (Matogawa et al., SIGMORPHON 2024)
ACL