@inproceedings{ngo-ho-yvon-2021-optimizing,
title = "Optimizing Word Alignments with Better Subword Tokenization",
author = "Ngo Ho, Anh Khoa and
Yvon, Fran{\c{c}}ois",
editor = "Duh, Kevin and
Guzm{\'a}n, Francisco",
booktitle = "Proceedings of Machine Translation Summit XVIII: Research Track",
month = aug,
year = "2021",
address = "Virtual",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2021.mtsummit-research.21",
pages = "256--269",
abstract = "Word alignments identify translational correspondences between words in a parallel sentence pair and are used, for example, to train statistical machine translation, learn bilingual dictionaries or to perform quality estimation. Subword tokenization has become a standard preprocessing step for a large number of applications, notably for state-of-the-art open vocabulary machine translation systems. In this paper, we thoroughly study how this preprocessing step interacts with the word alignment task and propose several tokenization strategies to obtain well-segmented parallel corpora. Using these new techniques, we were able to improve baseline word-based alignment models for six language pairs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ngo-ho-yvon-2021-optimizing">
<titleInfo>
<title>Optimizing Word Alignments with Better Subword Tokenization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anh</namePart>
<namePart type="given">Khoa</namePart>
<namePart type="family">Ngo Ho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">François</namePart>
<namePart type="family">Yvon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit XVIII: Research Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francisco</namePart>
<namePart type="family">Guzmán</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word alignments identify translational correspondences between words in a parallel sentence pair and are used, for example, to train statistical machine translation, learn bilingual dictionaries or to perform quality estimation. Subword tokenization has become a standard preprocessing step for a large number of applications, notably for state-of-the-art open vocabulary machine translation systems. In this paper, we thoroughly study how this preprocessing step interacts with the word alignment task and propose several tokenization strategies to obtain well-segmented parallel corpora. Using these new techniques, we were able to improve baseline word-based alignment models for six language pairs.</abstract>
<identifier type="citekey">ngo-ho-yvon-2021-optimizing</identifier>
<location>
<url>https://aclanthology.org/2021.mtsummit-research.21</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>256</start>
<end>269</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Optimizing Word Alignments with Better Subword Tokenization
%A Ngo Ho, Anh Khoa
%A Yvon, François
%Y Duh, Kevin
%Y Guzmán, Francisco
%S Proceedings of Machine Translation Summit XVIII: Research Track
%D 2021
%8 August
%I Association for Machine Translation in the Americas
%C Virtual
%F ngo-ho-yvon-2021-optimizing
%X Word alignments identify translational correspondences between words in a parallel sentence pair and are used, for example, to train statistical machine translation, learn bilingual dictionaries or to perform quality estimation. Subword tokenization has become a standard preprocessing step for a large number of applications, notably for state-of-the-art open vocabulary machine translation systems. In this paper, we thoroughly study how this preprocessing step interacts with the word alignment task and propose several tokenization strategies to obtain well-segmented parallel corpora. Using these new techniques, we were able to improve baseline word-based alignment models for six language pairs.
%U https://aclanthology.org/2021.mtsummit-research.21
%P 256-269
Markdown (Informal)
[Optimizing Word Alignments with Better Subword Tokenization](https://aclanthology.org/2021.mtsummit-research.21) (Ngo Ho & Yvon, MTSummit 2021)
ACL