% Conference paper entry (PROPOR 2026, Vol. 1); the url field points to the
% ACL Anthology page for the paper.
% Proper nouns and the acronym in title/booktitle are brace-protected as whole
% words so that sentence-casing bibliography styles keep their capitals.
@inproceedings{montesinos-etal-2026-improving,
  title     = {Improving Machine Translation of Idioms: A {Spanish}--{Galician} Parallel Dataset and Synthetic Augmentation Approach},
  author    = {Montesinos, L{\'u}a Santamar{\'i}a and
               Buj{\'a}n, Sa{\'u}l and
               Bardanca, Daniel and
               Gamallo, Pablo},
  editor    = {Souza, Marlo and
               de-Dios-Flores, Iria and
               Santos, Diana and
               Freitas, Larissa and
               Souza, Jackson Wilke da Cruz and
               Ribeiro, Eug{\'e}nio},
  booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1},
  month     = apr,
  year      = {2026},
  address   = {Salvador, Brazil},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.propor-1.99/},
  pages     = {980--987},
  isbn      = {979-8-89176-387-6},
  abstract  = {Idiomatic expressions are a well-known challenge for neural machine translation, including both traditional sequence-to-sequence models and large language models (LLMs). This paper presents a systematic approach to improve idiom translation between Spanish and Galician. First, we build a high-quality parallel dataset of idioms manually aligned across both languages. Then, we automatically extend this dataset into a large synthetic parallel corpus using LLMs, following a strategy that prioritizes the most frequent idioms observed in authentic corpora. This augmented dataset is used to retrain a seq2seq translation model. We evaluate the resulting system and compare it both to the baseline model without idiom data and to state-of-the-art LLM-based translators such as SalamandraTA. Results show that the translation of idioms improves significantly after the training, alongside a slight boost in the model{'}s overall performance.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="montesinos-etal-2026-improving">
<titleInfo>
<title>Improving Machine Translation of Idioms: A Spanish–Galician Parallel Dataset and Synthetic Augmentation Approach</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lúa</namePart>
<namePart type="given">Santamaría</namePart>
<namePart type="family">Montesinos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saúl</namePart>
<namePart type="family">Buján</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Bardanca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pablo</namePart>
<namePart type="family">Gamallo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>Idiomatic expressions are a well-known challenge for neural machine translation, including both traditional sequence-to-sequence models and large language models (LLMs). This paper presents a systematic approach to improve idiom translation between Spanish and Galician. First, we build a high-quality parallel dataset of idioms manually aligned across both languages. Then, we automatically extend this dataset into a large synthetic parallel corpus using LLMs, following a strategy that prioritizes the most frequent idioms observed in authentic corpora. This augmented dataset is used to retrain a seq2seq translation model. We evaluate the resulting system and compare it both to the baseline model without idiom data and to state-of-the-art LLM-based translators such as SalamandraTA. Results show that the translation of idioms improves significantly after the training, alongside a slight boost in the model’s overall performance.</abstract>
<identifier type="citekey">montesinos-etal-2026-improving</identifier>
<location>
<url>https://aclanthology.org/2026.propor-1.99/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>980</start>
<end>987</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Machine Translation of Idioms: A Spanish–Galician Parallel Dataset and Synthetic Augmentation Approach
%A Montesinos, Lúa Santamaría
%A Buján, Saúl
%A Bardanca, Daniel
%A Gamallo, Pablo
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F montesinos-etal-2026-improving
%X Idiomatic expressions are a well-known challenge for neural machine translation, including both traditional sequence-to-sequence models and large language models (LLMs). This paper presents a systematic approach to improve idiom translation between Spanish and Galician. First, we build a high-quality parallel dataset of idioms manually aligned across both languages. Then, we automatically extend this dataset into a large synthetic parallel corpus using LLMs, following a strategy that prioritizes the most frequent idioms observed in authentic corpora. This augmented dataset is used to retrain a seq2seq translation model. We evaluate the resulting system and compare it both to the baseline model without idiom data and to state-of-the-art LLM-based translators such as SalamandraTA. Results show that the translation of idioms improves significantly after the training, alongside a slight boost in the model’s overall performance.
%U https://aclanthology.org/2026.propor-1.99/
%P 980-987
Markdown (Informal)
[Improving Machine Translation of Idioms: A Spanish–Galician Parallel Dataset and Synthetic Augmentation Approach](https://aclanthology.org/2026.propor-1.99/) (Montesinos et al., PROPOR 2026)
ACL