@inproceedings{garnham-shareghi-2026-language,
title = "Could language models win the International Linguistics Olympiad?",
author = "Garnham, Jamie and
Shareghi, Ehsan",
editor = "Bonial, Claire and
Berzak, Yevgeni",
booktitle = "Proceedings of the 30th Conference on Computational Natural Language Learning",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.conll-main.28/",
pages = "481--500",
ISBN = "979-8-89176-410-1",
abstract = "Linguistic puzzles, wherein the solver must deduce rules of an unfamiliar language purely in-context, represent a uniquely perplexing problem format even for state-of-the-art large language models. Yet by exploring various inference-time scaling methods, we demonstrate that language models' performance on these problems can be improved without the need for fine-tuning or providing supplementary linguistic context. To this end, this paper introduces the first domain-specific inference-time scaling framework for linguistic puzzles, which we use to improve the performance of three model families - R1 (Deepseek), Gemini 2.5 Flash (Google), and Llama 3.3 70B Instruct (Meta) - on a challenging Linguistics Olympiad-based benchmark by 4.9, 13.1, and 4.9 percentage points, respectively. Nonetheless, even when multiple optimisations are applied, we find that LLMs' linguistic puzzle performance remains well below comparable mathematical and commonsense benchmarks, and we speculate as to why linguistic reasoning continues to pose a distinctive challenge for even the most capable large language models."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="garnham-shareghi-2026-language">
<titleInfo>
<title>Could language models win the International Linguistics Olympiad?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jamie</namePart>
<namePart type="family">Garnham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Shareghi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 30th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claire</namePart>
<namePart type="family">Bonial</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yevgeni</namePart>
<namePart type="family">Berzak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-410-1</identifier>
</relatedItem>
<abstract>Linguistic puzzles, wherein the solver must deduce rules of an unfamiliar language purely in-context, represent a uniquely perplexing problem format even for state-of-the-art large language models. Yet by exploring various inference-time scaling methods, we demonstrate that language models’ performance on these problems can be improved without the need for fine-tuning or providing supplementary linguistic context. To this end, this paper introduces the first domain-specific inference-time scaling framework for linguistic puzzles, which we use to improve the performance of three model families - R1 (Deepseek), Gemini 2.5 Flash (Google), and Llama 3.3 70B Instruct (Meta) - on a challenging Linguistics Olympiad-based benchmark by 4.9, 13.1, and 4.9 percentage points, respectively. Nonetheless, even when multiple optimisations are applied, we find that LLMs’ linguistic puzzle performance remains well below comparable mathematical and commonsense benchmarks, and we speculate as to why linguistic reasoning continues to pose a distinctive challenge for even the most capable large language models.</abstract>
<identifier type="citekey">garnham-shareghi-2026-language</identifier>
<location>
<url>https://aclanthology.org/2026.conll-main.28/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>481</start>
<end>500</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Could language models win the International Linguistics Olympiad?
%A Garnham, Jamie
%A Shareghi, Ehsan
%Y Bonial, Claire
%Y Berzak, Yevgeni
%S Proceedings of the 30th Conference on Computational Natural Language Learning
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-410-1
%F garnham-shareghi-2026-language
%X Linguistic puzzles, wherein the solver must deduce rules of an unfamiliar language purely in-context, represent a uniquely perplexing problem format even for state-of-the-art large language models. Yet by exploring various inference-time scaling methods, we demonstrate that language models’ performance on these problems can be improved without the need for fine-tuning or providing supplementary linguistic context. To this end, this paper introduces the first domain-specific inference-time scaling framework for linguistic puzzles, which we use to improve the performance of three model families - R1 (Deepseek), Gemini 2.5 Flash (Google), and Llama 3.3 70B Instruct (Meta) - on a challenging Linguistics Olympiad-based benchmark by 4.9, 13.1, and 4.9 percentage points, respectively. Nonetheless, even when multiple optimisations are applied, we find that LLMs’ linguistic puzzle performance remains well below comparable mathematical and commonsense benchmarks, and we speculate as to why linguistic reasoning continues to pose a distinctive challenge for even the most capable large language models.
%U https://aclanthology.org/2026.conll-main.28/
%P 481-500
Markdown (Informal)
[Could language models win the International Linguistics Olympiad?](https://aclanthology.org/2026.conll-main.28/) (Garnham & Shareghi, CoNLL 2026)
ACL