% Workshop paper record (AmericasNLP 2026); the canonical landing page is the `url` field.
@inproceedings{coleman-etal-2026-ran,
  title     = {{RAN}: Resource Abundance Notation for Languages in {NLP}},
  author    = {Coleman, Jared and
               Coleman, Tain{\~a} and
               Krishnamachari, Bhaskar},
  editor    = {Mager, Manuel and
               Ebrahimi, Abteen and
               Bui, Minh Duc and
               Pugh, Robert and
               Oncevay, Arturo and
               Chiruzzo, Luis and
               Solano, Rolando Coto and
               Rijhwani, Shruti and
               Von Der Wense, Katharina},
  booktitle = {Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {Americas} ({AmericasNLP})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.americasnlp-6.15/},
  pages     = {168--172},
  isbn      = {979-8-89176-415-6},
  abstract  = {The term ``low-resource'' is used pervasively in NLP but communicates almost nothing precise. We propose RAN (Resource Abundance Notation), a compact, multi-dimensional notation for quantifying a language{'}s NLP resource profile. A RAN score is written as S/M/L{\_}1-B{\_}1/L{\_}2-B{\_}2/..., where S = floor(log10(speakers)), M = floor(log10(monolingual sentences)), and each L{\_}i-B{\_}i pair records a bilingual partner and floor(log10(parallel sentences)). Values derive from canonical sources: Wikidata for speakers, OSCAR 23.01 for monolingual corpora, and (where available) OPUS for parallel corpora. We score 20 typologically diverse languages and correlate each profile against published benchmarks for three tasks: machine translation (MT, via NLLB-200 chrF++), named entity recognition (NER, via XTREME XLM-R WikiANN F1), and part-of-speech tagging (POS, via XTREME XLM-R UD accuracy). The RAN components carry complementary information: a linear model using all three explains 52{\%} of MT variance, 76{\%} of NER variance, and 72{\%} of POS variance. Among single predictors, B{\_}max (the largest bilingual corpus, regardless of partner) is strongest for the cross-lingual transfer tasks (NER, POS), while M and B{\_}en are strongest for MT. RAN is designed first as a communication tool, not a predictive model.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey coleman-etal-2026-ran: paper metadata, with the host proceedings described in <relatedItem type="host">. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="coleman-etal-2026-ran">
    <titleInfo>
      <title>RAN: Resource Abundance Notation for Languages in NLP</title>
    </titleInfo>
    <!-- Paper authors, in byline order -->
    <name type="personal">
      <namePart type="given">Jared</namePart>
      <namePart type="family">Coleman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tainã</namePart>
      <namePart type="family">Coleman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bhaskar</namePart>
      <namePart type="family">Krishnamachari</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Sixth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Manuel</namePart>
        <namePart type="family">Mager</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Abteen</namePart>
        <namePart type="family">Ebrahimi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Minh</namePart>
        <namePart type="given">Duc</namePart>
        <namePart type="family">Bui</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Robert</namePart>
        <namePart type="family">Pugh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Arturo</namePart>
        <namePart type="family">Oncevay</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rolando</namePart>
        <namePart type="given">Coto</namePart>
        <namePart type="family">Solano</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Shruti</namePart>
        <namePart type="family">Rijhwani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Katharina</namePart>
        <namePart type="family">Von Der Wense</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-415-6</identifier>
    </relatedItem>
    <abstract>The term “low-resource” is used pervasively in NLP but communicates almost nothing precise. We propose RAN (Resource Abundance Notation), a compact, multi-dimensional notation for quantifying a language’s NLP resource profile. A RAN score is written as S/M/L_1-B_1/L_2-B_2/..., where S = floor(log10(speakers)), M = floor(log10(monolingual sentences)), and each L_i-B_i pair records a bilingual partner and floor(log10(parallel sentences)). Values derive from canonical sources: Wikidata for speakers, OSCAR 23.01 for monolingual corpora, and (where available) OPUS for parallel corpora. We score 20 typologically diverse languages and correlate each profile against published benchmarks for three tasks: machine translation (MT, via NLLB-200 chrF++), named entity recognition (NER, via XTREME XLM-R WikiANN F1), and part-of-speech tagging (POS, via XTREME XLM-R UD accuracy). The RAN components carry complementary information: a linear model using all three explains 52% of MT variance, 76% of NER variance, and 72% of POS variance. Among single predictors, B_max (the largest bilingual corpus, regardless of partner) is strongest for the cross-lingual transfer tasks (NER, POS), while M and B_en are strongest for MT. RAN is designed first as a communication tool, not a predictive model.</abstract>
    <identifier type="citekey">coleman-etal-2026-ran</identifier>
    <location>
      <url>https://aclanthology.org/2026.americasnlp-6.15/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>168</start>
        <end>172</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T RAN: Resource Abundance Notation for Languages in NLP
%A Coleman, Jared
%A Coleman, Tainã
%A Krishnmachari, Bhaskar
%Y Mager, Manuel
%Y Ebrahimi, Abteen
%Y Bui, Minh Duc
%Y Pugh, Robert
%Y Oncevay, Arturo
%Y Chiruzzo, Luis
%Y Solano, Rolando Coto
%Y Rijhwani, Shruti
%Y Von Der Wense, Katharina
%S Proceedings of the Sixth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-415-6
%F coleman-etal-2026-ran
%X The term “low-resource” is used pervasively in NLP but communicates almost nothing precise. We propose RAN (Resource Abundance Notation), a compact, multi-dimensional notation for quantifying a language’s NLP resource profile. A RAN score is written as S/M/L_1-B_1/L_2-B_2/..., where S = floor(log10(speakers)), M = floor(log10(monolingual sentences)), and each L_i-B_i pair records a bilingual partner and floor(log10(parallel sentences)). Values derive from canonical sources: Wikidata for speakers, OSCAR 23.01 for monolingual corpora, and (where available) OPUS for parallel corpora. We score 20 typologically diverse languages and correlate each profile against published benchmarks for three tasks: machine translation (MT, via NLLB-200 chrF++), named entity recognition (NER, via XTREME XLM-R WikiANN F1), and part-of-speech tagging (POS, via XTREME XLM-R UD accuracy). The RAN components carry complementary information: a linear model using all three explains 52% of MT variance, 76% of NER variance, and 72% of POS variance. Among single predictors, B_max (the largest bilingual corpus, regardless of partner) is strongest for the cross-lingual transfer tasks (NER, POS), while M and B_en are strongest for MT. RAN is designed first as a communication tool, not a predictive model.
%U https://aclanthology.org/2026.americasnlp-6.15/
%P 168-172
Markdown (Informal)
[RAN: Resource Abundance Notation for Languages in NLP](https://aclanthology.org/2026.americasnlp-6.15/) (Coleman et al., AmericasNLP 2026)
ACL
- Jared Coleman, Tainã Coleman, and Bhaskar Krishnmachari. 2026. RAN: Resource Abundance Notation for Languages in NLP. In Proceedings of the Sixth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP), pages 168–172, San Diego, California, USA. Association for Computational Linguistics.