@inproceedings{song-etal-2022-bertseg,
title = "{BERTS}eg: {BERT} Based Unsupervised Subword Segmentation for Neural Machine Translation",
author = "Song, Haiyue and
Dabre, Raj and
Mao, Zhuoyuan and
Chu, Chenhui and
Kurohashi, Sadao",
editor = "He, Yulan and
Ji, Heng and
Li, Sujian and
Liu, Yang and
Chang, Chua-Hui",
booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = nov,
year = "2022",
address = "Online only",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.aacl-short.12",
pages = "85--94",
abstract = "Existing subword segmenters are either 1) frequency-based without semantics information or 2) neural-based but trained on parallel corpora. To address this, we present BERTSeg, an unsupervised neural subword segmenter for neural machine translation, which utilizes the contextualized semantic embeddings of words from characterBERT and maximizes the generation probability of subword segmentations. Furthermore, we propose a generation probability-based regularization method that enables BERTSeg to produce multiple segmentations for one word to improve the robustness of neural machine translation. Experimental results show that BERTSeg with regularization achieves up to 8 BLEU points improvement in 9 translation directions on ALT, IWSLT15 Vi-{\textgreater}En, WMT16 Ro-{\textgreater}En, and WMT15 Fi-{\textgreater}En datasets compared with BPE. In addition, BERTSeg is efficient, needing up to 5 minutes for training.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="song-etal-2022-bertseg">
<titleInfo>
<title>BERTSeg: BERT Based Unsupervised Subword Segmentation for Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haiyue</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raj</namePart>
<namePart type="family">Dabre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuoyuan</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenhui</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chua-Hui</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online only</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Existing subword segmenters are either 1) frequency-based without semantics information or 2) neural-based but trained on parallel corpora. To address this, we present BERTSeg, an unsupervised neural subword segmenter for neural machine translation, which utilizes the contextualized semantic embeddings of words from characterBERT and maximizes the generation probability of subword segmentations. Furthermore, we propose a generation probability-based regularization method that enables BERTSeg to produce multiple segmentations for one word to improve the robustness of neural machine translation. Experimental results show that BERTSeg with regularization achieves up to 8 BLEU points improvement in 9 translation directions on ALT, IWSLT15 Vi-\textgreaterEn, WMT16 Ro-\textgreaterEn, and WMT15 Fi-\textgreaterEn datasets compared with BPE. In addition, BERTSeg is efficient, needing up to 5 minutes for training.</abstract>
<identifier type="citekey">song-etal-2022-bertseg</identifier>
<location>
<url>https://aclanthology.org/2022.aacl-short.12</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>85</start>
<end>94</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BERTSeg: BERT Based Unsupervised Subword Segmentation for Neural Machine Translation
%A Song, Haiyue
%A Dabre, Raj
%A Mao, Zhuoyuan
%A Chu, Chenhui
%A Kurohashi, Sadao
%Y He, Yulan
%Y Ji, Heng
%Y Li, Sujian
%Y Liu, Yang
%Y Chang, Chua-Hui
%S Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)
%D 2022
%8 November
%I Association for Computational Linguistics
%C Online only
%F song-etal-2022-bertseg
%X Existing subword segmenters are either 1) frequency-based without semantics information or 2) neural-based but trained on parallel corpora. To address this, we present BERTSeg, an unsupervised neural subword segmenter for neural machine translation, which utilizes the contextualized semantic embeddings of words from characterBERT and maximizes the generation probability of subword segmentations. Furthermore, we propose a generation probability-based regularization method that enables BERTSeg to produce multiple segmentations for one word to improve the robustness of neural machine translation. Experimental results show that BERTSeg with regularization achieves up to 8 BLEU points improvement in 9 translation directions on ALT, IWSLT15 Vi-\textgreaterEn, WMT16 Ro-\textgreaterEn, and WMT15 Fi-\textgreaterEn datasets compared with BPE. In addition, BERTSeg is efficient, needing up to 5 minutes for training.
%U https://aclanthology.org/2022.aacl-short.12
%P 85-94
Markdown (Informal)
[BERTSeg: BERT Based Unsupervised Subword Segmentation for Neural Machine Translation](https://aclanthology.org/2022.aacl-short.12) (Song et al., AACL-IJCNLP 2022)
ACL
- Haiyue Song, Raj Dabre, Zhuoyuan Mao, Chenhui Chu, and Sadao Kurohashi. 2022. BERTSeg: BERT Based Unsupervised Subword Segmentation for Neural Machine Translation. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), pages 85–94, Online only. Association for Computational Linguistics.