@inproceedings{al-khdour-etal-2022-unlocking,
title = "Unlocking the value of bilingual translated documents with Deep Learning Segmentation and Alignment for {A}rabic",
author = "Al-Khdour, Nour and
Jonsson, Rebecca and
Jaikat, Ruba W and
Nasir, Abdallah and
Alisis, Sara and
Qardan, Sara and
Shawahneh, Eyas",
editor = "Campbell, Janice and
Larocca, Stephen and
Marciano, Jay and
Savenkov, Konstantin and
Yanishevsky, Alex",
booktitle = "Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)",
month = sep,
year = "2022",
address = "Orlando, USA",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2022.amta-upg.24",
pages = "341--359",
abstract = "To unlock the value of high-quality bilingual translated documents we need parallel data. With sentence-aligned translation pairs, we can fuel our neural machine translation, customize MT or create translation memories for our clients. To automate this process, automatic segmentation and alignment are required. Despite Arabic being the fifth biggest language in the world, language technology for Arabic is many times way behind other languages. We will show how we struggled to find a proper sentence segmentation for Arabic and instead explored different frameworks, from statistical to deep learning, to end up fine-tuning our own Arabic DL segmentation model. We will highlight our learnings and challenges with segmenting and aligning Arabic and English bilingual data. Finally, we will show the impact on our proprietary NMT engine as we started to unlock the value and could leverage data that had been translated offline, outside CAT tools, as well as comparable corpora, to feed our NMT.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="al-khdour-etal-2022-unlocking">
<titleInfo>
<title>Unlocking the value of bilingual translated documents with Deep Learning Segmentation and Alignment for Arabic</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nour</namePart>
<namePart type="family">Al-Khdour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rebecca</namePart>
<namePart type="family">Jonsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruba</namePart>
<namePart type="given">W</namePart>
<namePart type="family">Jaikat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdallah</namePart>
<namePart type="family">Nasir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Alisis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Qardan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eyas</namePart>
<namePart type="family">Shawahneh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Janice</namePart>
<namePart type="family">Campbell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stephen</namePart>
<namePart type="family">Larocca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jay</namePart>
<namePart type="family">Marciano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Konstantin</namePart>
<namePart type="family">Savenkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Yanishevsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Orlando, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>To unlock the value of high-quality bilingual translated documents we need parallel data. With sentence-aligned translation pairs, we can fuel our neural machine translation, customize MT or create translation memories for our clients. To automate this process, automatic segmentation and alignment are required. Despite Arabic being the fifth biggest language in the world, language technology for Arabic is many times way behind other languages. We will show how we struggled to find a proper sentence segmentation for Arabic and instead explored different frameworks, from statistical to deep learning, to end up fine-tuning our own Arabic DL segmentation model. We will highlight our learnings and challenges with segmenting and aligning Arabic and English bilingual data. Finally, we will show the impact on our proprietary NMT engine as we started to unlock the value and could leverage data that had been translated offline, outside CAT tools, as well as comparable corpora, to feed our NMT.</abstract>
<identifier type="citekey">al-khdour-etal-2022-unlocking</identifier>
<location>
<url>https://aclanthology.org/2022.amta-upg.24</url>
</location>
<part>
<date>2022-09</date>
<extent unit="page">
<start>341</start>
<end>359</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unlocking the value of bilingual translated documents with Deep Learning Segmentation and Alignment for Arabic
%A Al-Khdour, Nour
%A Jonsson, Rebecca
%A Jaikat, Ruba W.
%A Nasir, Abdallah
%A Alisis, Sara
%A Qardan, Sara
%A Shawahneh, Eyas
%Y Campbell, Janice
%Y Larocca, Stephen
%Y Marciano, Jay
%Y Savenkov, Konstantin
%Y Yanishevsky, Alex
%S Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)
%D 2022
%8 September
%I Association for Machine Translation in the Americas
%C Orlando, USA
%F al-khdour-etal-2022-unlocking
%X To unlock the value of high-quality bilingual translated documents we need parallel data. With sentence-aligned translation pairs, we can fuel our neural machine translation, customize MT or create translation memories for our clients. To automate this process, automatic segmentation and alignment are required. Despite Arabic being the fifth biggest language in the world, language technology for Arabic is many times way behind other languages. We will show how we struggled to find a proper sentence segmentation for Arabic and instead explored different frameworks, from statistical to deep learning, to end up fine-tuning our own Arabic DL segmentation model. We will highlight our learnings and challenges with segmenting and aligning Arabic and English bilingual data. Finally, we will show the impact on our proprietary NMT engine as we started to unlock the value and could leverage data that had been translated offline, outside CAT tools, as well as comparable corpora, to feed our NMT.
%U https://aclanthology.org/2022.amta-upg.24
%P 341-359
Markdown (Informal)
[Unlocking the value of bilingual translated documents with Deep Learning Segmentation and Alignment for Arabic](https://aclanthology.org/2022.amta-upg.24) (Al-Khdour et al., AMTA 2022)
ACL