@inproceedings{elsner-liu-2025-prompt,
title = "Prompt and circumstance'':'' A word-by-word {LLM} prompting approach to interlinear glossing for low-resource languages",
author = "Elsner, Micha and
Liu, David",
editor = {Nicolai, Garrett and
Chodroff, Eleanor and
Mailhot, Frederic and
{\c{C}}{\"o}ltekin, {\c{C}}a{\u{g}}r{\i}},
booktitle = "Proceedings of the 22nd SIGMORPHON workshop on Computational Morphology, Phonology, and Phonetics",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sigmorphon-main.1/",
doi = "10.18653/v1/2025.sigmorphon-main.1",
pages = "1--14",
ISBN = "979-8-89176-231-2",
abstract = "This paper presents VeLePa, an inflected verbal lexicon of Central Pame (pbs, cent2154), an Otomanguean language from Mexico. This resource contains 12528 words in phonological form representing the complete inflectional paradigms of 216 verbs, supplemented with use frequencies. Computer-operable (CLDF) inflected lexicons of non-WEIRD underresourced languages are urgently needed to expand digital capacities in this languages (e.g. in NLP). VeLePa contributes to this, and does so with data from a language which is morphologically extraordinary, with unusually high levels of irregularity and multiple conjugations at various loci within the word'':'' prefixes, stems, tone, and suffixes constitute different albeit interrelated subsystems of inflection. Partly automated creation of interlinear glossed text (IGT) has the potential to assist in linguistic documentation. We argue that LLMs can make this process more accessible to linguists because of their capacity to follow natural-language instructions. We investigate the effectiveness of a retrieval-based LLM prompting approach to glossing, applied to the seven languages from the SIGMORPHON 2023 shared task. Our system beats the BERTbased shared task baseline for every language in the morpheme-level score category, and we show that a simple 3-best oracle has higher word-level scores than the challenge winner (a tuned sequence model) in five languages. In a case study on Tsez, we ask the LLM to automatically create and follow linguistic instructions, reducing errors on a confusing grammatical feature. Our results thus demonstrate the potential contributions which LLMs can make in interactive systems for glossing, both in making suggestions to human annotators and following directions."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="elsner-liu-2025-prompt">
<titleInfo>
<title>“Prompt and circumstance”: A word-by-word LLM prompting approach to interlinear glossing for low-resource languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Micha</namePart>
<namePart type="family">Elsner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd SIGMORPHON workshop on Computational Morphology, Phonology, and Phonetics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Garrett</namePart>
<namePart type="family">Nicolai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eleanor</namePart>
<namePart type="family">Chodroff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Mailhot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Çağrı</namePart>
<namePart type="family">Çöltekin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-231-2</identifier>
</relatedItem>
<abstract>Partly automated creation of interlinear glossed text (IGT) has the potential to assist in linguistic documentation. We argue that LLMs can make this process more accessible to linguists because of their capacity to follow natural-language instructions. We investigate the effectiveness of a retrieval-based LLM prompting approach to glossing, applied to the seven languages from the SIGMORPHON 2023 shared task. Our system beats the BERT-based shared task baseline for every language in the morpheme-level score category, and we show that a simple 3-best oracle has higher word-level scores than the challenge winner (a tuned sequence model) in five languages. In a case study on Tsez, we ask the LLM to automatically create and follow linguistic instructions, reducing errors on a confusing grammatical feature. Our results thus demonstrate the potential contributions which LLMs can make in interactive systems for glossing, both in making suggestions to human annotators and following directions.</abstract>
<identifier type="citekey">elsner-liu-2025-prompt</identifier>
<identifier type="doi">10.18653/v1/2025.sigmorphon-main.1</identifier>
<location>
<url>https://aclanthology.org/2025.sigmorphon-main.1/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>1</start>
<end>14</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T “Prompt and circumstance”: A word-by-word LLM prompting approach to interlinear glossing for low-resource languages
%A Elsner, Micha
%A Liu, David
%Y Nicolai, Garrett
%Y Chodroff, Eleanor
%Y Mailhot, Frederic
%Y Çöltekin, Çağrı
%S Proceedings of the 22nd SIGMORPHON workshop on Computational Morphology, Phonology, and Phonetics
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico, USA
%@ 979-8-89176-231-2
%F elsner-liu-2025-prompt
%X Partly automated creation of interlinear glossed text (IGT) has the potential to assist in linguistic documentation. We argue that LLMs can make this process more accessible to linguists because of their capacity to follow natural-language instructions. We investigate the effectiveness of a retrieval-based LLM prompting approach to glossing, applied to the seven languages from the SIGMORPHON 2023 shared task. Our system beats the BERT-based shared task baseline for every language in the morpheme-level score category, and we show that a simple 3-best oracle has higher word-level scores than the challenge winner (a tuned sequence model) in five languages. In a case study on Tsez, we ask the LLM to automatically create and follow linguistic instructions, reducing errors on a confusing grammatical feature. Our results thus demonstrate the potential contributions which LLMs can make in interactive systems for glossing, both in making suggestions to human annotators and following directions.
%R 10.18653/v1/2025.sigmorphon-main.1
%U https://aclanthology.org/2025.sigmorphon-main.1/
%U https://doi.org/10.18653/v1/2025.sigmorphon-main.1
%P 1-14
Markdown (Informal)
[“Prompt and circumstance”: A word-by-word LLM prompting approach to interlinear glossing for low-resource languages](https://aclanthology.org/2025.sigmorphon-main.1/) (Elsner & Liu, SIGMORPHON 2025)
ACL
Micha Elsner and David Liu. 2025. [“Prompt and circumstance”: A word-by-word LLM prompting approach to interlinear glossing for low-resource languages](https://aclanthology.org/2025.sigmorphon-main.1/). In Proceedings of the 22nd SIGMORPHON workshop on Computational Morphology, Phonology, and Phonetics, pages 1–14, Albuquerque, New Mexico, USA. Association for Computational Linguistics.