@inproceedings{namgoong-etal-2026-auxiliary,
    title = "When Does Auxiliary Modality Matter in Solving Geometric Problems? A Comprehensive Study of Textual, Formal, and Visual Modalities",
    author = "Namgoong, Hyuk and
      Jung, Jeesu and
      Han, Yerim and
      Jung, Sangkeun",
    editor = "Demberg, Vera and
      Inui, Kentaro and
      Marquez, Llu{\'i}s",
    booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 2: Short Papers)",
    month = mar,
    year = "2026",
    address = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.eacl-short.4/",
    pages = "76--92",
    isbn = "979-8-89176-381-4",
    abstract = "Large Language Models (LLMs) face challenges in integrating linguistic and spatial reasoning, which limits their performance on geometry problems. While prior work has attempted to bridge this gap using diagram parsers with multimodal models, a systematic comparison of how various auxiliary modalities and their combinations affect performance has been lacking. To address this, we present a systematic study of four auxiliary modalities{---}formal diagram facts (CDL), natural language representations ($T_{CDL}$), diagram descriptions (DES), and image augmentations (IMG){---}on a range of open- and closed-source multimodal LLMs. Our analysis reveals a compelling dichotomy in the effectiveness of these modalities. While formal representations like CDL and $T_{CDL}$ offer a modest performance lift, diagram descriptions (DES) cause a dramatic split: they significantly boost the accuracy of open-source LLMs which often struggle with visual parsing, while often misleading more capable closed-source models and causing a performance drop. This highlights a critical trade-off between augmenting input with helpful information and introducing misleading noise, demonstrating that the efficacy of auxiliary modalities is heavily dependent on the inherent capabilities of the underlying model."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="namgoong-etal-2026-auxiliary">
<titleInfo>
<title>When Does Auxiliary Modality Matter in Solving Geometric Problems? A Comprehensive Study of Textual, Formal, and Visual Modalities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hyuk</namePart>
<namePart type="family">Namgoong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jeesu</namePart>
<namePart type="family">Jung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yerim</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sangkeun</namePart>
<namePart type="family">Jung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-381-4</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) face challenges in integrating linguistic and spatial reasoning, which limits their performance on geometry problems. While prior work has attempted to bridge this gap using diagram parsers with multimodal models, a systematic comparison of how various auxiliary modalities and their combinations affect performance has been lacking. To address this, we present a systematic study of four auxiliary modalities—formal diagram facts (CDL), natural language representations (T_CDL), diagram descriptions (DES), and image augmentations (IMG)—on a range of open- and closed-source multimodal LLMs. Our analysis reveals a compelling dichotomy in the effectiveness of these modalities. While formal representations like CDL and T_CDL offer a modest performance lift, diagram descriptions (DES) cause a dramatic split: they significantly boost the accuracy of open-source LLMs which often struggle with visual parsing, while often misleading more capable closed-source models and causing a performance drop. This highlights a critical trade-off between augmenting input with helpful information and introducing misleading noise, demonstrating that the efficacy of auxiliary modalities is heavily dependent on the inherent capabilities of the underlying model.</abstract>
<identifier type="citekey">namgoong-etal-2026-auxiliary</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-short.4/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>76</start>
<end>92</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Does Auxiliary Modality Matter in Solving Geometric Problems? A Comprehensive Study of Textual, Formal, and Visual Modalities
%A Namgoong, Hyuk
%A Jung, Jeesu
%A Han, Yerim
%A Jung, Sangkeun
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-381-4
%F namgoong-etal-2026-auxiliary
%X Large Language Models (LLMs) face challenges in integrating linguistic and spatial reasoning, which limits their performance on geometry problems. While prior work has attempted to bridge this gap using diagram parsers with multimodal models, a systematic comparison of how various auxiliary modalities and their combinations affect performance has been lacking. To address this, we present a systematic study of four auxiliary modalities—formal diagram facts (CDL), natural language representations (T_CDL), diagram descriptions (DES), and image augmentations (IMG)—on a range of open- and closed-source multimodal LLMs. Our analysis reveals a compelling dichotomy in the effectiveness of these modalities. While formal representations like CDL and T_CDL offer a modest performance lift, diagram descriptions (DES) cause a dramatic split: they significantly boost the accuracy of open-source LLMs which often struggle with visual parsing, while often misleading more capable closed-source models and causing a performance drop. This highlights a critical trade-off between augmenting input with helpful information and introducing misleading noise, demonstrating that the efficacy of auxiliary modalities is heavily dependent on the inherent capabilities of the underlying model.
%U https://aclanthology.org/2026.eacl-short.4/
%P 76-92
Markdown (Informal)
[When Does Auxiliary Modality Matter in Solving Geometric Problems? A Comprehensive Study of Textual, Formal, and Visual Modalities](https://aclanthology.org/2026.eacl-short.4/) (Namgoong et al., EACL 2026)
ACL