% Conference paper from the CLiC-it 2024 proceedings (record URL: aclanthology.org).
% Abstract text has had two fused-word typos repaired ("for tense", "semantic processing").
@inproceedings{amin-etal-2024-data,
  title     = {Data Augmentation for Low-Resource {Italian} {NLP}: Enhancing Semantic Processing with {DRS}},
  author    = {Amin, Muhammad Saad and
               Anselma, Luca and
               Mazzei, Alessandro},
  editor    = {Dell'Orletta, Felice and
               Lenci, Alessandro and
               Montemagni, Simonetta and
               Sprugnoli, Rachele},
  booktitle = {Proceedings of the 10th {Italian} Conference on Computational Linguistics ({CLiC-it} 2024)},
  month     = dec,
  year      = {2024},
  address   = {Pisa, Italy},
  publisher = {CEUR Workshop Proceedings},
  url       = {https://aclanthology.org/2024.clicit-1.5/},
  pages     = {29--38},
  isbn      = {979-12-210-7060-6},
  abstract  = {Discourse Representation Structure (DRS), a formal meaning representation, has shown promising results in semantic parsing and natural language generation tasks for high-resource languages like English. This paper investigates enhancing the application of DRS to low-resource Italian Natural Language Processing (NLP), in both semantic parsing (Text-to-DRS) and natural language generation (DRS-to-Text). To address the scarcity of annotated corpora for Italian DRS, we propose a novel data augmentation technique that involves the use of external linguistic resources including: (i) WordNet for common nouns, adjectives, adverbs, and verbs; (ii) LLM-generated named entities for proper nouns; and (iii) rule-based algorithms for tense augmentation. This approach not only increases the quantity of training data but also introduces linguistic diversity, which is crucial for improving model performance and robustness. Using this augmented dataset, we developed neural semantic parser and generator models that demonstrated enhanced generalization ability compared to models trained on non-augmented data. We evaluated the effect of semantic data augmentation using two state-of-the-art transformer-based neural sequence-to-sequence models, i.e., byT5 and IT5. Our implementation shows promising results for Italian semantic processing. Data augmentation significantly increased the performance of semantic parsing from 76.10 to 90.56 (+14.46{\%}) F1-SMATCH score and generation with 37.79 to 57.48 (+19.69{\%}) BLEU, 30.83 to 40.95 (+10.12{\%}) METEOR, 81.66 to 90.97 (+9.31{\%}) COMET, 54.84 to 70.88 (+16.04{\%}) chrF, and 88.86 to 92.97 (+4.11{\%}) BERT scores. These results demonstrate the effectiveness of our novel augmentation approach in enhancing semantic processing capabilities for low-resource languages like Italian.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="amin-etal-2024-data">
<titleInfo>
<title>Data Augmentation for Low-Resource Italian NLP: Enhancing Semantic Processing with DRS</title>
</titleInfo>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="given">Saad</namePart>
<namePart type="family">Amin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luca</namePart>
<namePart type="family">Anselma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Mazzei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>Discourse Representation Structure (DRS), a formal meaning representation, has shown promising results in semantic parsing and natural language generation tasks for high-resource languages like English. This paper investigates enhancing the application of DRS to low-resource Italian Natural Language Processing (NLP), in both semantic parsing (Text-to-DRS) and natural language generation (DRS-to-Text). To address the scarcity of annotated corpora for Italian DRS, we propose a novel data augmentation technique that involves the use of external linguistic resources including: (i) WordNet for common nouns, adjectives, adverbs, and verbs; (ii) LLM-generated named entities for proper nouns; and (iii) rule-based algorithms for tense augmentation. This approach not only increases the quantity of training data but also introduces linguistic diversity, which is crucial for improving model performance and robustness. Using this augmented dataset, we developed neural semantic parser and generator models that demonstrated enhanced generalization ability compared to models trained on non-augmented data. We evaluated the effect of semantic data augmentation using two state-of-the-art transformer-based neural sequence-to-sequence models, i.e., byT5 and IT5. Our implementation shows promising results for Italian semantic processing. Data augmentation significantly increased the performance of semantic parsing from 76.10 to 90.56 (+14.46%) F1-SMATCH score and generation with 37.79 to 57.48 (+19.69%) BLEU, 30.83 to 40.95 (+10.12%) METEOR, 81.66 to 90.97 (+9.31%) COMET, 54.84 to 70.88 (+16.04%) chrF, and 88.86 to 92.97 (+4.11%) BERT scores. These results demonstrate the effectiveness of our novel augmentation approach in enhancing semantic processing capabilities for low-resource languages like Italian.</abstract>
<identifier type="citekey">amin-etal-2024-data</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.5/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>29</start>
<end>38</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Data Augmentation for Low-Resource Italian NLP: Enhancing Semantic Processing with DRS
%A Amin, Muhammad Saad
%A Anselma, Luca
%A Mazzei, Alessandro
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F amin-etal-2024-data
%X Discourse Representation Structure (DRS), a formal meaning representation, has shown promising results in semantic parsing and natural language generation tasks for high-resource languages like English. This paper investigates enhancing the application of DRS to low-resource Italian Natural Language Processing (NLP), in both semantic parsing (Text-to-DRS) and natural language generation (DRS-to-Text). To address the scarcity of annotated corpora for Italian DRS, we propose a novel data augmentation technique that involves the use of external linguistic resources including: (i) WordNet for common nouns, adjectives, adverbs, and verbs; (ii) LLM-generated named entities for proper nouns; and (iii) rule-based algorithms for tense augmentation. This approach not only increases the quantity of training data but also introduces linguistic diversity, which is crucial for improving model performance and robustness. Using this augmented dataset, we developed neural semantic parser and generator models that demonstrated enhanced generalization ability compared to models trained on non-augmented data. We evaluated the effect of semantic data augmentation using two state-of-the-art transformer-based neural sequence-to-sequence models, i.e., byT5 and IT5. Our implementation shows promising results for Italian semantic processing. Data augmentation significantly increased the performance of semantic parsing from 76.10 to 90.56 (+14.46%) F1-SMATCH score and generation with 37.79 to 57.48 (+19.69%) BLEU, 30.83 to 40.95 (+10.12%) METEOR, 81.66 to 90.97 (+9.31%) COMET, 54.84 to 70.88 (+16.04%) chrF, and 88.86 to 92.97 (+4.11%) BERT scores. These results demonstrate the effectiveness of our novel augmentation approach in enhancing semantic processing capabilities for low-resource languages like Italian.
%U https://aclanthology.org/2024.clicit-1.5/
%P 29-38
Markdown (Informal)
[Data Augmentation for Low-Resource Italian NLP: Enhancing Semantic Processing with DRS](https://aclanthology.org/2024.clicit-1.5/) (Amin et al., CLiC-it 2024)
ACL