@inproceedings{fernandez-adlaon-2022-exploring,
title = "Exploring Word Alignment towards an Efficient Sentence Aligner for {F}ilipino and {C}ebuano Languages",
author = "Fernandez, Jenn Leana and
Adlaon, Kristine Mae M.",
booktitle = "Proceedings of the Fifth Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2022)",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.loresmt-1.13",
pages = "99--106",
abstract = "Building a robust machine translation (MT) system requires a large amount of parallel corpus which is an expensive resource for low-resourced languages. The two major languages being spoken in the Philippines which are Filipino and Cebuano have an abundance in monolingual data that this study took advantage of attempting to find the best way to automatically generate parallel corpus out from monolingual corpora through the use of bitext alignment. Byte-pair encoding was applied in an attempt to optimize the alignment of the source and target texts. Results have shown that alignment was best achieved without segmenting the tokens. Itermax alignment score is best for short-length sentences and match or argmax alignment score are best for long-length sentences.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fernandez-adlaon-2022-exploring">
<titleInfo>
<title>Exploring Word Alignment towards an Efficient Sentence Aligner for Filipino and Cebuano Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jenn</namePart>
<namePart type="given">Leana</namePart>
<namePart type="family">Fernandez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kristine</namePart>
<namePart type="given">Mae</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Adlaon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2022)</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Building a robust machine translation (MT) system requires a large amount of parallel corpus which is an expensive resource for low-resourced languages. The two major languages being spoken in the Philippines which are Filipino and Cebuano have an abundance in monolingual data that this study took advantage of attempting to find the best way to automatically generate parallel corpus out from monolingual corpora through the use of bitext alignment. Byte-pair encoding was applied in an attempt to optimize the alignment of the source and target texts. Results have shown that alignment was best achieved without segmenting the tokens. Itermax alignment score is best for short-length sentences and match or argmax alignment score are best for long-length sentences.</abstract>
<identifier type="citekey">fernandez-adlaon-2022-exploring</identifier>
<location>
<url>https://aclanthology.org/2022.loresmt-1.13</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>99</start>
<end>106</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring Word Alignment towards an Efficient Sentence Aligner for Filipino and Cebuano Languages
%A Fernandez, Jenn Leana
%A Adlaon, Kristine Mae M.
%S Proceedings of the Fifth Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2022)
%D 2022
%8 October
%I Association for Computational Linguistics
%C Gyeongju, Republic of Korea
%F fernandez-adlaon-2022-exploring
%X Building a robust machine translation (MT) system requires a large amount of parallel corpus which is an expensive resource for low-resourced languages. The two major languages being spoken in the Philippines which are Filipino and Cebuano have an abundance in monolingual data that this study took advantage of attempting to find the best way to automatically generate parallel corpus out from monolingual corpora through the use of bitext alignment. Byte-pair encoding was applied in an attempt to optimize the alignment of the source and target texts. Results have shown that alignment was best achieved without segmenting the tokens. Itermax alignment score is best for short-length sentences and match or argmax alignment score are best for long-length sentences.
%U https://aclanthology.org/2022.loresmt-1.13
%P 99-106
Markdown (Informal)
[Exploring Word Alignment towards an Efficient Sentence Aligner for Filipino and Cebuano Languages](https://aclanthology.org/2022.loresmt-1.13) (Fernandez & Adlaon, LoResMT 2022)
ACL