@inproceedings{galyukshev-alimova-2026-processing,
title = "Processing Inconsistency Predicts Language Competence: {LLM} Evaluation Without Answer Labels on {T}urkic Languages",
author = "Galyukshev, Ilya and
Alimova, Ilseyar",
editor = "T.Y.S.S., Santosh and
Rodriguez, Juan Diego and
de Gibert, Ona",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-srw.94/",
pages = "1074--1086",
ISBN = "979-8-89176-393-7",
abstract = "Most languages lack labeled evaluation benchmarks for large language models (LLMs). Creating such benchmarks requires native speakers, domain expertise, and answer annotation{---}resources unavailable for the vast majority of languages. We investigate whether a model{'}s internal processing signals{---}such as generation entropy and tokenizer statistics{---}correlate with its actual accuracy on a language, with the long-term goal of estimating language competence without labeled data. Our key observation is that for languages a model does not know, both tokenizer segmentation and generation entropy become highly variable across questions, whereas for known languages they remain consistent. We call this the *inconsistency hypothesis* and test it on 11 instruction-tuned LLMs (1B{--}70B parameters) across 14 language{--}script varieties (12 Turkic plus English and Russian controls). We extract over 25 processing features per model{--}language pair; individually, even the strongest correlate only moderately with accuracy (Pearson $|r|$ up to 0.55). Yet combining just three complementary features{---}a tokenizer coverage ratio, entropy variability, and the model{'}s English/Russian benchmark score{---}explains 75{\%} of accuracy variance in leave-one-language-out evaluation, nearly doubling the 44{\%} explained by a model-mean baseline. The variability of processing signals (standard deviation) consistently outperforms mean values as a predictor across all five model families, but only for greedy-pass measures; sampling-based measures show no such pattern."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="galyukshev-alimova-2026-processing">
<titleInfo>
<title>Processing Inconsistency Predicts Language Competence: LLM Evaluation Without Answer Labels on Turkic Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ilya</namePart>
<namePart type="family">Galyukshev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilseyar</namePart>
<namePart type="family">Alimova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Santosh</namePart>
<namePart type="family">T.Y.S.S.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Diego</namePart>
<namePart type="family">Rodriguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ona</namePart>
<namePart type="family">de Gibert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-393-7</identifier>
</relatedItem>
<abstract>Most languages lack labeled evaluation benchmarks for large language models (LLMs). Creating such benchmarks requires native speakers, domain expertise, and answer annotation—resources unavailable for the vast majority of languages. We investigate whether a model’s internal processing signals—such as generation entropy and tokenizer statistics—correlate with its actual accuracy on a language, with the long-term goal of estimating language competence without labeled data. Our key observation is that for languages a model does not know, both tokenizer segmentation and generation entropy become highly variable across questions, whereas for known languages they remain consistent. We call this the *inconsistency hypothesis* and test it on 11 instruction-tuned LLMs (1B–70B parameters) across 14 language–script varieties (12 Turkic plus English and Russian controls). We extract over 25 processing features per model–language pair; individually, even the strongest correlate only moderately with accuracy (Pearson |r| up to 0.55). Yet combining just three complementary features—a tokenizer coverage ratio, entropy variability, and the model’s English/Russian benchmark score—explains 75% of accuracy variance in leave-one-language-out evaluation, nearly doubling the 44% explained by a model-mean baseline. The variability of processing signals (standard deviation) consistently outperforms mean values as a predictor across all five model families, but only for greedy-pass measures; sampling-based measures show no such pattern.</abstract>
<identifier type="citekey">galyukshev-alimova-2026-processing</identifier>
<location>
<url>https://aclanthology.org/2026.acl-srw.94/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1074</start>
<end>1086</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Processing Inconsistency Predicts Language Competence: LLM Evaluation Without Answer Labels on Turkic Languages
%A Galyukshev, Ilya
%A Alimova, Ilseyar
%Y T.Y.S.S., Santosh
%Y Rodriguez, Juan Diego
%Y de Gibert, Ona
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-393-7
%F galyukshev-alimova-2026-processing
%X Most languages lack labeled evaluation benchmarks for large language models (LLMs). Creating such benchmarks requires native speakers, domain expertise, and answer annotation—resources unavailable for the vast majority of languages. We investigate whether a model’s internal processing signals—such as generation entropy and tokenizer statistics—correlate with its actual accuracy on a language, with the long-term goal of estimating language competence without labeled data. Our key observation is that for languages a model does not know, both tokenizer segmentation and generation entropy become highly variable across questions, whereas for known languages they remain consistent. We call this the *inconsistency hypothesis* and test it on 11 instruction-tuned LLMs (1B–70B parameters) across 14 language–script varieties (12 Turkic plus English and Russian controls). We extract over 25 processing features per model–language pair; individually, even the strongest correlate only moderately with accuracy (Pearson |r| up to 0.55). Yet combining just three complementary features—a tokenizer coverage ratio, entropy variability, and the model’s English/Russian benchmark score—explains 75% of accuracy variance in leave-one-language-out evaluation, nearly doubling the 44% explained by a model-mean baseline. The variability of processing signals (standard deviation) consistently outperforms mean values as a predictor across all five model families, but only for greedy-pass measures; sampling-based measures show no such pattern.
%U https://aclanthology.org/2026.acl-srw.94/
%P 1074-1086
Markdown (Informal)
[Processing Inconsistency Predicts Language Competence: LLM Evaluation Without Answer Labels on Turkic Languages](https://aclanthology.org/2026.acl-srw.94/) (Galyukshev & Alimova, ACL 2026)
ACL