@inproceedings{devadiga-chopra-2026-making,
title = "Making Large Language Models Speak {T}ulu: Structured Prompting for an Extremely Low-Resource Language",
author = "Devadiga, Prathamesh and
Chopra, Paras",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Plum, Alistair and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({L}o{R}es{LM} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.loreslm-1.5/",
pages = "50--61",
ISBN = "979-8-89176-377-7",
abstract = "Can large language models converse in languages virtually absent from their training data? We investigate this question through a case study on Tulu, a Dravidian language with over two million speakers but minimal digital presence. Rather than fine-tuning, we examine whether structured prompt engineering alone can elicit basic conversational ability under extreme data scarcity. Our framework combines explicit grammar documentation, negative constraints to suppress high-probability tokens from related languages, romanization standardization, and quality-controlled synthetic data generation via self-play. Evaluated on a manually curated held-out set across three LLMs (Gemini 2.0 Flash, GPT-4o, and Llama 3.1 70B) and validated by native speakers, our approach reduces vocabulary contamination from 80{\%} to 5{\%} while achieving 85{\%} grammatical accuracy. Cross-model analysis shows that negative constraints provide consistent improvements (12{--}18 percentage points), while the effectiveness of grammar documentation varies by model architecture (8{--}22 points). These results demonstrate that structured in-context learning can meaningfully extend LLM capabilities to extremely low-resource languages without parameter updates."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="devadiga-chopra-2026-making">
<titleInfo>
<title>Making Large Language Models Speak Tulu: Structured Prompting for an Extremely Low-Resource Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Prathamesh</namePart>
<namePart type="family">Devadiga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paras</namePart>
<namePart type="family">Chopra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hansi</namePart>
<namePart type="family">Hettiarachchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alistair</namePart>
<namePart type="family">Plum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Gaber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damith</namePart>
<namePart type="family">Premasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fiona</namePart>
<namePart type="given">Anting</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lasitha</namePart>
<namePart type="family">Uyangodage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-377-7</identifier>
</relatedItem>
<abstract>Can large language models converse in languages virtually absent from their training data? We investigate this question through a case study on Tulu, a Dravidian language with over two million speakers but minimal digital presence. Rather than fine-tuning, we examine whether structured prompt engineering alone can elicit basic conversational ability under extreme data scarcity. Our framework combines explicit grammar documentation, negative constraints to suppress high-probability tokens from related languages, romanization standardization, and quality-controlled synthetic data generation via self-play. Evaluated on a manually curated held-out set across three LLMs (Gemini 2.0 Flash, GPT-4o, and Llama 3.1 70B) and validated by native speakers, our approach reduces vocabulary contamination from 80% to 5% while achieving 85% grammatical accuracy. Cross-model analysis shows that negative constraints provide consistent improvements (12–18 percentage points), while the effectiveness of grammar documentation varies by model architecture (8–22 points). These results demonstrate that structured in-context learning can meaningfully extend LLM capabilities to extremely low-resource languages without parameter updates.</abstract>
<identifier type="citekey">devadiga-chopra-2026-making</identifier>
<location>
<url>https://aclanthology.org/2026.loreslm-1.5/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>50</start>
<end>61</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Making Large Language Models Speak Tulu: Structured Prompting for an Extremely Low-Resource Language
%A Devadiga, Prathamesh
%A Chopra, Paras
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F devadiga-chopra-2026-making
%X Can large language models converse in languages virtually absent from their training data? We investigate this question through a case study on Tulu, a Dravidian language with over two million speakers but minimal digital presence. Rather than fine-tuning, we examine whether structured prompt engineering alone can elicit basic conversational ability under extreme data scarcity. Our framework combines explicit grammar documentation, negative constraints to suppress high-probability tokens from related languages, romanization standardization, and quality-controlled synthetic data generation via self-play. Evaluated on a manually curated held-out set across three LLMs (Gemini 2.0 Flash, GPT-4o, and Llama 3.1 70B) and validated by native speakers, our approach reduces vocabulary contamination from 80% to 5% while achieving 85% grammatical accuracy. Cross-model analysis shows that negative constraints provide consistent improvements (12–18 percentage points), while the effectiveness of grammar documentation varies by model architecture (8–22 points). These results demonstrate that structured in-context learning can meaningfully extend LLM capabilities to extremely low-resource languages without parameter updates.
%U https://aclanthology.org/2026.loreslm-1.5/
%P 50-61
Markdown (Informal)
[Making Large Language Models Speak Tulu: Structured Prompting for an Extremely Low-Resource Language](https://aclanthology.org/2026.loreslm-1.5/) (Devadiga & Chopra, LoResLM 2026)
ACL