@inproceedings{lucas-etal-2024-grammar,
title = "Grammar-based Data Augmentation for Low-Resource Languages: The Case of {G}uarani-{S}panish Neural Machine Translation",
author = {Lucas, Agust{\'\i}n and
Balad{\'o}n, Alexis and
Pardi{\~n}as, Victoria and
Ag{\"u}ero-Torales, Marvin and
G{\'o}ngora, Santiago and
Chiruzzo, Luis},
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.354",
doi = "10.18653/v1/2024.naacl-long.354",
pages = "6385--6397",
abstract = "One of the main problems low-resource languages face in NLP can be pictured as a vicious circle: data is needed to build and test tools, but the available text is scarce and there are not powerful tools to collect it.In order to break this circle for Guarani, we explore if text automatically generated from a grammar can work as a Data Augmentation technique to boost the performance of Guarani-Spanish Machine Translation (MT) systems.After building a grammar-based system that generates Spanish text and syntactically transfers it to Guarani, we perform several experiments by pretraining models using this synthetic text.We find that the MT systems that are pretrained with synthetic text perform better, even outperforming previous baselines.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lucas-etal-2024-grammar">
<titleInfo>
<title>Grammar-based Data Augmentation for Low-Resource Languages: The Case of Guarani-Spanish Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Agustín</namePart>
<namePart type="family">Lucas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Baladón</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victoria</namePart>
<namePart type="family">Pardiñas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marvin</namePart>
<namePart type="family">Agüero-Torales</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Santiago</namePart>
<namePart type="family">Góngora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>One of the main problems low-resource languages face in NLP can be pictured as a vicious circle: data is needed to build and test tools, but the available text is scarce and there are not powerful tools to collect it.In order to break this circle for Guarani, we explore if text automatically generated from a grammar can work as a Data Augmentation technique to boost the performance of Guarani-Spanish Machine Translation (MT) systems.After building a grammar-based system that generates Spanish text and syntactically transfers it to Guarani, we perform several experiments by pretraining models using this synthetic text.We find that the MT systems that are pretrained with synthetic text perform better, even outperforming previous baselines.</abstract>
<identifier type="citekey">lucas-etal-2024-grammar</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.354</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.354</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>6385</start>
<end>6397</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Grammar-based Data Augmentation for Low-Resource Languages: The Case of Guarani-Spanish Neural Machine Translation
%A Lucas, Agustín
%A Baladón, Alexis
%A Pardiñas, Victoria
%A Agüero-Torales, Marvin
%A Góngora, Santiago
%A Chiruzzo, Luis
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F lucas-etal-2024-grammar
%X One of the main problems low-resource languages face in NLP can be pictured as a vicious circle: data is needed to build and test tools, but the available text is scarce and there are not powerful tools to collect it.In order to break this circle for Guarani, we explore if text automatically generated from a grammar can work as a Data Augmentation technique to boost the performance of Guarani-Spanish Machine Translation (MT) systems.After building a grammar-based system that generates Spanish text and syntactically transfers it to Guarani, we perform several experiments by pretraining models using this synthetic text.We find that the MT systems that are pretrained with synthetic text perform better, even outperforming previous baselines.
%R 10.18653/v1/2024.naacl-long.354
%U https://aclanthology.org/2024.naacl-long.354
%U https://doi.org/10.18653/v1/2024.naacl-long.354
%P 6385-6397
Markdown (Informal)
[Grammar-based Data Augmentation for Low-Resource Languages: The Case of Guarani-Spanish Neural Machine Translation](https://aclanthology.org/2024.naacl-long.354) (Lucas et al., NAACL 2024)
ACL