@inproceedings{pamies-etal-2026-vinclat,
title = "Vinclat: Evaluating Reasoning, Cognition and Culture in One Game",
author = "P{\`a}mies, Marc and
Aula-Blasco, Javier and
Gonzalez-Agirre, Aitor and
Villegas, Marta",
editor = "Chen, Pinzhen and
Zouhar, Vil{\'e}m and
Hu, Hanxu and
Khanuja, Simran and
Zhu, Wenhao and
Haddow, Barry and
Birch, Alexandra and
Aji, Alham Fikri and
Sennrich, Rico and
Hooker, Sara",
booktitle = "Proceedings of the First Workshop on Multilingual Multicultural Evaluation",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.mme-main.4/",
pages = "49--66",
    isbn = "979-8-89176-368-5",
abstract = "This paper introduces Vinclat, a novel evaluation dataset for Catalan carefully designed to assess the reasoning capabilities and cultural knowledge of LLMs. It comprises 1,000 high-quality instances, meticulously crafted and reviewed by human annotators. Each instance presents a complex riddle that requires a two-step reasoning process involving inferential and abductive reasoning, along with other cognitive skills such as lexical retrieval, paraphrasing, flexibility in interpretation, pattern recognition, and associative thinking. Given four independent clues, models should infer intermediate concepts which, despite being seemingly unrelated, can be creatively connected to reach a final solution. The task targets a unique blend of capabilities, distinguishing it from existing NLP benchmarks. Our evaluation of state-of-the-art models reveals that these still fall significantly short of human-level reasoning, although scaling trends suggest that the performance gap may narrow over time. This indicates that Vinclat provides a robust and long-term challenge, resisting the rapid saturation that is commonly observed in many existing evaluation datasets."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pamies-etal-2026-vinclat">
<titleInfo>
<title>Vinclat: Evaluating Reasoning, Cognition and Culture in One Game</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Pàmies</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Javier</namePart>
<namePart type="family">Aula-Blasco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aitor</namePart>
<namePart type="family">Gonzalez-Agirre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="family">Villegas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Multilingual Multicultural Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vilém</namePart>
<namePart type="family">Zouhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanxu</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simran</namePart>
<namePart type="family">Khanuja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenhao</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Birch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rico</namePart>
<namePart type="family">Sennrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Hooker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-368-5</identifier>
</relatedItem>
<abstract>This paper introduces Vinclat, a novel evaluation dataset for Catalan carefully designed to assess the reasoning capabilities and cultural knowledge of LLMs. It comprises 1,000 high-quality instances, meticulously crafted and reviewed by human annotators. Each instance presents a complex riddle that requires a two-step reasoning process involving inferential and abductive reasoning, along with other cognitive skills such as lexical retrieval, paraphrasing, flexibility in interpretation, pattern recognition, and associative thinking. Given four independent clues, models should infer intermediate concepts which, despite being seemingly unrelated, can be creatively connected to reach a final solution. The task targets a unique blend of capabilities, distinguishing it from existing NLP benchmarks. Our evaluation of state-of-the-art models reveals that these still fall significantly short of human-level reasoning, although scaling trends suggest that the performance gap may narrow over time. This indicates that Vinclat provides a robust and long-term challenge, resisting the rapid saturation that is commonly observed in many existing evaluation datasets.</abstract>
<identifier type="citekey">pamies-etal-2026-vinclat</identifier>
<location>
<url>https://aclanthology.org/2026.mme-main.4/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>49</start>
<end>66</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Vinclat: Evaluating Reasoning, Cognition and Culture in One Game
%A Pàmies, Marc
%A Aula-Blasco, Javier
%A Gonzalez-Agirre, Aitor
%A Villegas, Marta
%Y Chen, Pinzhen
%Y Zouhar, Vilém
%Y Hu, Hanxu
%Y Khanuja, Simran
%Y Zhu, Wenhao
%Y Haddow, Barry
%Y Birch, Alexandra
%Y Aji, Alham Fikri
%Y Sennrich, Rico
%Y Hooker, Sara
%S Proceedings of the First Workshop on Multilingual Multicultural Evaluation
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-368-5
%F pamies-etal-2026-vinclat
%X This paper introduces Vinclat, a novel evaluation dataset for Catalan carefully designed to assess the reasoning capabilities and cultural knowledge of LLMs. It comprises 1,000 high-quality instances, meticulously crafted and reviewed by human annotators. Each instance presents a complex riddle that requires a two-step reasoning process involving inferential and abductive reasoning, along with other cognitive skills such as lexical retrieval, paraphrasing, flexibility in interpretation, pattern recognition, and associative thinking. Given four independent clues, models should infer intermediate concepts which, despite being seemingly unrelated, can be creatively connected to reach a final solution. The task targets a unique blend of capabilities, distinguishing it from existing NLP benchmarks. Our evaluation of state-of-the-art models reveals that these still fall significantly short of human-level reasoning, although scaling trends suggest that the performance gap may narrow over time. This indicates that Vinclat provides a robust and long-term challenge, resisting the rapid saturation that is commonly observed in many existing evaluation datasets.
%U https://aclanthology.org/2026.mme-main.4/
%P 49-66
Markdown (Informal)
[Vinclat: Evaluating Reasoning, Cognition and Culture in One Game](https://aclanthology.org/2026.mme-main.4/) (Pàmies et al., MME 2026)
ACL