@inproceedings{araabi-etal-2024-entropy,
title = "Entropy{--} and Distance-Regularized Attention Improves Low-Resource Neural Machine Translation",
author = "Araabi, Ali and
Niculae, Vlad and
Monz, Christof",
editor = "Knowles, Rebecca and
Eriguchi, Akiko and
Goel, Shivali",
booktitle = "Proceedings of the 16th Conference of the Association for Machine Translation in the Americas (Volume 1: Research Track)",
month = sep,
year = "2024",
address = "Chicago, USA",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2024.amta-research.13",
pages = "140--153",
abstract = "Transformer-based models in Neural Machine Translation (NMT) rely heavily on multi-head attention for capturing dependencies within and across source and target sequences. In Transformers, attention mechanisms dynamically determine which parts of the sentence to focus on in the encoder and decoder through self-attention and cross-attention. Our experiments show that high-resource NMT systems often exhibit a specific peaked attention distribution, indicating a focus on key elements. However, in low-resource NMT, attention tends to be dispersed throughout the sentence, lacking the focus demonstrated by high-resource models. To tackle this issue, we present EaDRA (Entropy{--} and Distance-Regularized Attention), which introduces an inductive bias to prioritize essential elements and guide the attention mechanism accordingly. Extensive experiments using EaDRA on diverse low-resource language pairs demonstrate significant improvements in translation quality, while incurring negligible computational cost.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="araabi-etal-2024-entropy">
<titleInfo>
<title>Entropy– and Distance-Regularized Attention Improves Low-Resource Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Araabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vlad</namePart>
<namePart type="family">Niculae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the Association for Machine Translation in the Americas (Volume 1: Research Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rebecca</namePart>
<namePart type="family">Knowles</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akiko</namePart>
<namePart type="family">Eriguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shivali</namePart>
<namePart type="family">Goel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Chicago, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Transformer-based models in Neural Machine Translation (NMT) rely heavily on multi-head attention for capturing dependencies within and across source and target sequences. In Transformers, attention mechanisms dynamically determine which parts of the sentence to focus on in the encoder and decoder through self-attention and cross-attention. Our experiments show that high-resource NMT systems often exhibit a specific peaked attention distribution, indicating a focus on key elements. However, in low-resource NMT, attention tends to be dispersed throughout the sentence, lacking the focus demonstrated by high-resource models. To tackle this issue, we present EaDRA (Entropy– and Distance-Regularized Attention), which introduces an inductive bias to prioritize essential elements and guide the attention mechanism accordingly. Extensive experiments using EaDRA on diverse low-resource language pairs demonstrate significant improvements in translation quality, while incurring negligible computational cost.</abstract>
<identifier type="citekey">araabi-etal-2024-entropy</identifier>
<location>
<url>https://aclanthology.org/2024.amta-research.13</url>
</location>
<part>
<date>2024-09</date>
<extent unit="page">
<start>140</start>
<end>153</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Entropy– and Distance-Regularized Attention Improves Low-Resource Neural Machine Translation
%A Araabi, Ali
%A Niculae, Vlad
%A Monz, Christof
%Y Knowles, Rebecca
%Y Eriguchi, Akiko
%Y Goel, Shivali
%S Proceedings of the 16th Conference of the Association for Machine Translation in the Americas (Volume 1: Research Track)
%D 2024
%8 September
%I Association for Machine Translation in the Americas
%C Chicago, USA
%F araabi-etal-2024-entropy
%X Transformer-based models in Neural Machine Translation (NMT) rely heavily on multi-head attention for capturing dependencies within and across source and target sequences. In Transformers, attention mechanisms dynamically determine which parts of the sentence to focus on in the encoder and decoder through self-attention and cross-attention. Our experiments show that high-resource NMT systems often exhibit a specific peaked attention distribution, indicating a focus on key elements. However, in low-resource NMT, attention tends to be dispersed throughout the sentence, lacking the focus demonstrated by high-resource models. To tackle this issue, we present EaDRA (Entropy– and Distance-Regularized Attention), which introduces an inductive bias to prioritize essential elements and guide the attention mechanism accordingly. Extensive experiments using EaDRA on diverse low-resource language pairs demonstrate significant improvements in translation quality, while incurring negligible computational cost.
%U https://aclanthology.org/2024.amta-research.13
%P 140-153
Markdown (Informal)
[Entropy– and Distance-Regularized Attention Improves Low-Resource Neural Machine Translation](https://aclanthology.org/2024.amta-research.13) (Araabi et al., AMTA 2024)
ACL