BibTeX
@inproceedings{samir-silfverberg-2022-one,
title = "One Wug, Two Wug+s: Transformer Inflection Models Hallucinate Affixes",
author = "Samir, Farhan and
Silfverberg, Miikka",
editor = "Moeller, Sarah and
Anastasopoulos, Antonios and
Arppe, Antti and
Chaudhary, Aditi and
Harrigan, Atticus and
Holden, Josh and
Lachler, Jordan and
Palmer, Alexis and
Rijhwani, Shruti and
Schwartz, Lane",
booktitle = "Proceedings of the Fifth Workshop on the Use of Computational Methods in the Study of Endangered Languages",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.computel-1.5",
doi = "10.18653/v1/2022.computel-1.5",
pages = "31--40",
abstract = "Data augmentation strategies are increasingly important in NLP pipelines for low-resourced and endangered languages, and in neural morphological inflection, augmentation by so-called data hallucination is a popular technique. This paper presents a detailed analysis of inflection models trained with and without data hallucination for the low-resourced Canadian Indigenous language Gitksan. Our analysis reveals evidence for a concatenative inductive bias in augmented models{---}in contrast to models trained without hallucination, they strongly prefer affixing inflection patterns over suppletive ones. We find that preference for affixation in general improves inflection performance in {``}wug test{''}-like settings, where the model is asked to inflect lexemes missing from the training set. However, data hallucination dramatically reduces prediction accuracy for reduplicative forms due to a misanalysis of reduplication as affixation. While the overall impact of data hallucination for unseen lexemes remains positive, our findings call for greater qualitative analysis and more varied evaluation conditions in testing automatic inflection systems. Our results indicate that further innovations in data augmentation for computational morphology are desirable.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="samir-silfverberg-2022-one">
<titleInfo>
<title>One Wug, Two Wug+s: Transformer Inflection Models Hallucinate Affixes</title>
</titleInfo>
<name type="personal">
<namePart type="given">Farhan</namePart>
<namePart type="family">Samir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miikka</namePart>
<namePart type="family">Silfverberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on the Use of Computational Methods in the Study of Endangered Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Moeller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antti</namePart>
<namePart type="family">Arppe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditi</namePart>
<namePart type="family">Chaudhary</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atticus</namePart>
<namePart type="family">Harrigan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josh</namePart>
<namePart type="family">Holden</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Lachler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Palmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lane</namePart>
<namePart type="family">Schwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Data augmentation strategies are increasingly important in NLP pipelines for low-resourced and endangered languages, and in neural morphological inflection, augmentation by so-called data hallucination is a popular technique. This paper presents a detailed analysis of inflection models trained with and without data hallucination for the low-resourced Canadian Indigenous language Gitksan. Our analysis reveals evidence for a concatenative inductive bias in augmented models—in contrast to models trained without hallucination, they strongly prefer affixing inflection patterns over suppletive ones. We find that preference for affixation in general improves inflection performance in “wug test”-like settings, where the model is asked to inflect lexemes missing from the training set. However, data hallucination dramatically reduces prediction accuracy for reduplicative forms due to a misanalysis of reduplication as affixation. While the overall impact of data hallucination for unseen lexemes remains positive, our findings call for greater qualitative analysis and more varied evaluation conditions in testing automatic inflection systems. Our results indicate that further innovations in data augmentation for computational morphology are desirable.</abstract>
<identifier type="citekey">samir-silfverberg-2022-one</identifier>
<identifier type="doi">10.18653/v1/2022.computel-1.5</identifier>
<location>
<url>https://aclanthology.org/2022.computel-1.5</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>31</start>
<end>40</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T One Wug, Two Wug+s: Transformer Inflection Models Hallucinate Affixes
%A Samir, Farhan
%A Silfverberg, Miikka
%Y Moeller, Sarah
%Y Anastasopoulos, Antonios
%Y Arppe, Antti
%Y Chaudhary, Aditi
%Y Harrigan, Atticus
%Y Holden, Josh
%Y Lachler, Jordan
%Y Palmer, Alexis
%Y Rijhwani, Shruti
%Y Schwartz, Lane
%S Proceedings of the Fifth Workshop on the Use of Computational Methods in the Study of Endangered Languages
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F samir-silfverberg-2022-one
%X Data augmentation strategies are increasingly important in NLP pipelines for low-resourced and endangered languages, and in neural morphological inflection, augmentation by so-called data hallucination is a popular technique. This paper presents a detailed analysis of inflection models trained with and without data hallucination for the low-resourced Canadian Indigenous language Gitksan. Our analysis reveals evidence for a concatenative inductive bias in augmented models—in contrast to models trained without hallucination, they strongly prefer affixing inflection patterns over suppletive ones. We find that preference for affixation in general improves inflection performance in “wug test”-like settings, where the model is asked to inflect lexemes missing from the training set. However, data hallucination dramatically reduces prediction accuracy for reduplicative forms due to a misanalysis of reduplication as affixation. While the overall impact of data hallucination for unseen lexemes remains positive, our findings call for greater qualitative analysis and more varied evaluation conditions in testing automatic inflection systems. Our results indicate that further innovations in data augmentation for computational morphology are desirable.
%R 10.18653/v1/2022.computel-1.5
%U https://aclanthology.org/2022.computel-1.5
%U https://doi.org/10.18653/v1/2022.computel-1.5
%P 31-40
Markdown (Informal)
[One Wug, Two Wug+s: Transformer Inflection Models Hallucinate Affixes](https://aclanthology.org/2022.computel-1.5) (Samir & Silfverberg, ComputEL 2022)
ACL
Farhan Samir and Miikka Silfverberg. 2022. One Wug, Two Wug+s: Transformer Inflection Models Hallucinate Affixes. In Proceedings of the Fifth Workshop on the Use of Computational Methods in the Study of Endangered Languages, pages 31–40, Dublin, Ireland. Association for Computational Linguistics.
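
For readers unfamiliar with the augmentation technique this paper analyzes, the sketch below illustrates the general idea behind data hallucination as commonly formulated in the morphological inflection literature: replace the material shared between a lemma and an inflected form with a random string, so the model is trained on many fake stems carrying real affixes. This is an illustrative sketch only, not the authors' implementation; all function names are hypothetical.

```python
import random
import string


def longest_common_substring(a: str, b: str) -> str:
    """Longest contiguous substring shared by a and b (naive O(n^2 m); fine for word forms)."""
    best = ""
    for i in range(len(a)):
        for j in range(len(b)):
            k = 0
            while i + k < len(a) and j + k < len(b) and a[i + k] == b[j + k]:
                k += 1
            if k > len(best):
                best = a[i:i + k]
    return best


def hallucinate(lemma: str, form: str, alphabet: str = string.ascii_lowercase) -> tuple[str, str]:
    """Swap the shared stem of a (lemma, form) pair for a random string,
    leaving the affix material untouched."""
    stem = longest_common_substring(lemma, form)
    if len(stem) < 3:  # too little shared material to treat as a stem; keep the pair as-is
        return lemma, form
    fake = "".join(random.choice(alphabet) for _ in stem)
    return lemma.replace(stem, fake, 1), form.replace(stem, fake, 1)


if __name__ == "__main__":
    random.seed(0)
    print(hallucinate("walk", "walked"))  # e.g. ('mynb', 'mynbed'): the suffix survives, the stem does not
```

The sketch also hints at the failure mode the abstract reports: a reduplicated form copies stem material, so replacing the stem with a random string destroys the copy relation and pushes the model toward analyzing reduplication as ordinary affixation.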