% BibTeX record for ACL Anthology paper 2026.findings-acl.1731.
% Text outside an @entry is ignored by BibTeX, so these lines are comments.
% The abstract uses LaTeX escapes (\% and -- for an en dash).
@inproceedings{ojewale-etal-2026-multi,
  title     = {Multi-lingual Functional Evaluation for Large Language Models},
  author    = {Ojewale, Victor and
               Raji, Inioluwa Deborah and
               Venkatasubramanian, Suresh},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1731/},
  pages     = {34672--34691},
  isbn      = {979-8-89176-395-1},
  abstract  = {Multi-lingual competence in large language models is often evaluated via static data benchmarks such as Belebele, M-MMLU and M-GSM. However, these evaluations often fail to provide an adequate understanding of the practical performance and robustness of models across multi-lingual settings. In response, we create multi-lingual functional benchmarks -- Cross-Lingual Grade School Math Symbolic (CL-GSM Symbolic) and Cross-Lingual Instruction-Following Eval (CL-IFEval) -- by translating existing functional benchmark templates from English to five additional languages that span the range of resources available for NLP: French, Spanish, Hindi, Arabic and Yoruba. Our results show that the gap between static and functional evaluations is highly uneven: across models, performance drops from M-GSM to CL-GSM Symbolic by 24{\%}, 17{\%}, and 18{\%} in English, French, and Spanish, while the drop from Belebele to CL-IFEval ranges from 15{\%} to 24{\%} across languages, and the drop from M-MMLU to CL-IFEval is much smaller (0.5{\%} to 3{\%}). Similarly, we find that model robustness across languages varies significantly, with certain languages (e.g., Arabic, English) being the most consistently well performing across evaluation iterations.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ojewale-etal-2026-multi">
<titleInfo>
<title>Multi-lingual Functional Evaluation for Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Victor</namePart>
<namePart type="family">Ojewale</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Inioluwa</namePart>
<namePart type="given">Deborah</namePart>
<namePart type="family">Raji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suresh</namePart>
<namePart type="family">Venkatasubramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Multi-lingual competence in large language models is often evaluated via static data benchmarks such as Belebele, M-MMLU and M-GSM. However, these evaluations often fail to provide an adequate understanding of the practical performance and robustness of models across multi-lingual settings. In response, we create multi-lingual functional benchmarks – Cross-Lingual Grade School Math Symbolic (CL-GSM Symbolic) and Cross-Lingual Instruction-Following Eval (CL-IFEval) – by translating existing functional benchmark templates from English to five additional languages that span the range of resources available for NLP: French, Spanish, Hindi, Arabic and Yoruba. Our results show that the gap between static and functional evaluations is highly uneven: across models, performance drops from M-GSM to CL-GSM Symbolic by 24%, 17%, and 18% in English, French, and Spanish, while the drop from Belebele to CL-IFEval ranges from 15% to 24% across languages, and the drop from M-MMLU to CL-IFEval is much smaller (0.5% to 3%). Similarly, we find that model robustness across languages varies significantly, with certain languages (e.g., Arabic, English) being the most consistently well performing across evaluation iterations.</abstract>
<identifier type="citekey">ojewale-etal-2026-multi</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1731/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>34672</start>
<end>34691</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-lingual Functional Evaluation for Large Language Models
%A Ojewale, Victor
%A Raji, Inioluwa Deborah
%A Venkatasubramanian, Suresh
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F ojewale-etal-2026-multi
%X Multi-lingual competence in large language models is often evaluated via static data benchmarks such as Belebele, M-MMLU and M-GSM. However, these evaluations often fail to provide an adequate understanding of the practical performance and robustness of models across multi-lingual settings. In response, we create multi-lingual functional benchmarks – Cross-Lingual Grade School Math Symbolic (CL-GSM Symbolic) and Cross-Lingual Instruction-Following Eval (CL-IFEval) – by translating existing functional benchmark templates from English to five additional languages that span the range of resources available for NLP: French, Spanish, Hindi, Arabic and Yoruba. Our results show that the gap between static and functional evaluations is highly uneven: across models, performance drops from M-GSM to CL-GSM Symbolic by 24%, 17%, and 18% in English, French, and Spanish, while the drop from Belebele to CL-IFEval ranges from 15% to 24% across languages, and the drop from M-MMLU to CL-IFEval is much smaller (0.5% to 3%). Similarly, we find that model robustness across languages varies significantly, with certain languages (e.g., Arabic, English) being the most consistently well performing across evaluation iterations.
%U https://aclanthology.org/2026.findings-acl.1731/
%P 34672-34691
Markdown (Informal)
[Multi-lingual Functional Evaluation for Large Language Models](https://aclanthology.org/2026.findings-acl.1731/) (Ojewale et al., Findings 2026)
ACL
- Victor Ojewale, Inioluwa Deborah Raji, and Suresh Venkatasubramanian. 2026. Multi-lingual Functional Evaluation for Large Language Models. In Findings of the Association for Computational Linguistics: ACL 2026, pages 34672–34691, San Diego, California, United States. Association for Computational Linguistics.