@inproceedings{lin-etal-2025-construction,
title = "Construction-Based Reduction of Translationese for Low-Resource Languages: A Pilot Study on {B}avarian",
author = {Lin, Peiqin and
Thaler, Marion and
Goschala, Daniela and
Kargaran, Amir Hossein and
Liu, Yihong and
Martins, Andr{\'e} F. T. and
Sch{\"u}tze, Hinrich},
editor = "Hahn, Michael and
Rani, Priya and
Kumar, Ritesh and
Shcherbakov, Andreas and
Sorokin, Alexey and
Serikov, Oleg and
Cotterell, Ryan and
Vylomova, Ekaterina",
booktitle = "Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sigtyp-1.13/",
doi = "10.18653/v1/2025.sigtyp-1.13",
pages = "114--121",
ISBN = "979-8-89176-281-7",
abstract = "When translating into a low-resource language, a language model can have a tendency to produce translations that are close to the source (e.g., word-by-word translations) due to a lack of rich low-resource training data in pretraining. Thus, the output often is translationese that differs considerably from what native speakers would produce naturally. To remedy this, we synthetically create a training set in which the frequency of a construction unique to the low-resource language is artificially inflated. For the case of Bavarian, we show that, after training, the language model has learned the unique construction and that native speakers judge its output as more natural. Our pilot study suggests that construction-based mitigation of translationese is a promising approach. Code and artifacts are available at \url{https://github.com/cisnlp/BayernGPT}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lin-etal-2025-construction">
<titleInfo>
<title>Construction-Based Reduction of Translationese for Low-Resource Languages: A Pilot Study on Bavarian</title>
</titleInfo>
<name type="personal">
<namePart type="given">Peiqin</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marion</namePart>
<namePart type="family">Thaler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniela</namePart>
<namePart type="family">Goschala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="given">Hossein</namePart>
<namePart type="family">Kargaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yihong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="given">F</namePart>
<namePart type="given">T</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hinrich</namePart>
<namePart type="family">Schütze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Hahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Priya</namePart>
<namePart type="family">Rani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ritesh</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Shcherbakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexey</namePart>
<namePart type="family">Sorokin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Serikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-281-7</identifier>
</relatedItem>
<abstract>When translating into a low-resource language, a language model can have a tendency to produce translations that are close to the source (e.g., word-by-word translations) due to a lack of rich low-resource training data in pretraining. Thus, the output often is translationese that differs considerably from what native speakers would produce naturally. To remedy this, we synthetically create a training set in which the frequency of a construction unique to the low-resource language is artificially inflated. For the case of Bavarian, we show that, after training, the language model has learned the unique construction and that native speakers judge its output as more natural. Our pilot study suggests that construction-based mitigation of translationese is a promising approach. Code and artifacts are available at https://github.com/cisnlp/BayernGPT.</abstract>
<identifier type="citekey">lin-etal-2025-construction</identifier>
<identifier type="doi">10.18653/v1/2025.sigtyp-1.13</identifier>
<location>
<url>https://aclanthology.org/2025.sigtyp-1.13/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>114</start>
<end>121</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Construction-Based Reduction of Translationese for Low-Resource Languages: A Pilot Study on Bavarian
%A Lin, Peiqin
%A Thaler, Marion
%A Goschala, Daniela
%A Kargaran, Amir Hossein
%A Liu, Yihong
%A Martins, André F. T.
%A Schütze, Hinrich
%Y Hahn, Michael
%Y Rani, Priya
%Y Kumar, Ritesh
%Y Shcherbakov, Andreas
%Y Sorokin, Alexey
%Y Serikov, Oleg
%Y Cotterell, Ryan
%Y Vylomova, Ekaterina
%S Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-281-7
%F lin-etal-2025-construction
%X When translating into a low-resource language, a language model can have a tendency to produce translations that are close to the source (e.g., word-by-word translations) due to a lack of rich low-resource training data in pretraining. Thus, the output often is translationese that differs considerably from what native speakers would produce naturally. To remedy this, we synthetically create a training set in which the frequency of a construction unique to the low-resource language is artificially inflated. For the case of Bavarian, we show that, after training, the language model has learned the unique construction and that native speakers judge its output as more natural. Our pilot study suggests that construction-based mitigation of translationese is a promising approach. Code and artifacts are available at https://github.com/cisnlp/BayernGPT.
%R 10.18653/v1/2025.sigtyp-1.13
%U https://aclanthology.org/2025.sigtyp-1.13/
%U https://doi.org/10.18653/v1/2025.sigtyp-1.13
%P 114-121
Markdown (Informal)
[Construction-Based Reduction of Translationese for Low-Resource Languages: A Pilot Study on Bavarian](https://aclanthology.org/2025.sigtyp-1.13/) (Lin et al., SIGTYP 2025)
ACL