% BibTeX record for the ACL Anthology paper at
% https://aclanthology.org/2026.findings-acl.631/
% Acronyms in title/booktitle are brace-protected so sentence-casing styles keep them.
@inproceedings{wang-etal-2026-mimesis,
  title     = {From Mimesis to Metamorphosis: Evolving {VLM} Judges via In-Context Comparing and Knowledge Internalization},
  author    = {Wang, Juntuo and
               Qiao, Yuming and
               Yang, Yifan and
               Yuan, Lunxi and
               Luo, Liang and
               Meng, Dan},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.631/},
  pages     = {12957--12971},
  isbn      = {979-8-89176-395-1},
  abstract  = {Vision-language models (VLMs) are increasingly adopted as judges for subjective assessment, yet absolute scoring remains brittle due to inconsistent scales and inherent preference biases. To bridge this gap, we propose $\text{S}^2\text{AD}$ (Semantic-Anchored Scale-Agnostic Distillation), a novel easy-to-hard framework that operationalizes subjective assessment as comparative analysis, conceptualizing the judge{'}s evolution from mimesis to metamorphosis. In Stage 1 (Mimesis), we introduce Dynamic Soft Positioning (DSP) to train the judge to compare a query against retrieved reference images, establishing a relative evaluation space that ensures consistent ordering under heterogeneous scales. In Stage 2 (Metamorphosis), this comparative capability is internalized via Language Buttons{---}discrete semantic levels serving as a retrieval-free internal reference. Optimized with Group Relative Policy Optimization (GRPO), $\text{S}^2\text{AD}$ achieves efficient, scale-steerable inference that adapts to diverse grading standards. Our framework reaches state-of-the-art performance across multiple benchmarks, validating the effectiveness of internalized comparative priors for robust, rank-invariant, and scale-steerable evaluation. The code is available at: https://github.com/SpatialVision-Research/SSAD{\_}ACL2026{\_}Findings.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2026-mimesis">
<titleInfo>
<title>From Mimesis to Metamorphosis: Evolving VLM Judges via In-Context Comparing and Knowledge Internalization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Juntuo</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuming</namePart>
<namePart type="family">Qiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifan</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lunxi</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dan</namePart>
<namePart type="family">Meng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Vision-language models (VLMs) are increasingly adopted as judges for subjective assessment, yet absolute scoring remains brittle due to inconsistent scales and inherent preference biases. To bridge this gap, we propose S²AD (Semantic-Anchored Scale-Agnostic Distillation), a novel easy-to-hard framework that operationalizes subjective assessment as comparative analysis, conceptualizing the judge’s evolution from mimesis to metamorphosis. In Stage 1 (Mimesis), we introduce Dynamic Soft Positioning (DSP) to train the judge to compare a query against retrieved reference images, establishing a relative evaluation space that ensures consistent ordering under heterogeneous scales. In Stage 2 (Metamorphosis), this comparative capability is internalized via Language Buttons—discrete semantic levels serving as a retrieval-free internal reference. Optimized with Group Relative Policy Optimization (GRPO), S²AD achieves efficient, scale-steerable inference that adapts to diverse grading standards. Our framework reaches state-of-the-art performance across multiple benchmarks, validating the effectiveness of internalized comparative priors for robust, rank-invariant, and scale-steerable evaluation. The code is available at: https://github.com/SpatialVision-Research/SSAD_ACL2026_Findings.</abstract>
<identifier type="citekey">wang-etal-2026-mimesis</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.631/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>12957</start>
<end>12971</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From Mimesis to Metamorphosis: Evolving VLM Judges via In-Context Comparing and Knowledge Internalization
%A Wang, Juntuo
%A Qiao, Yuming
%A Yang, Yifan
%A Yuan, Lunxi
%A Luo, Liang
%A Meng, Dan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F wang-etal-2026-mimesis
%X Vision-language models (VLMs) are increasingly adopted as judges for subjective assessment, yet absolute scoring remains brittle due to inconsistent scales and inherent preference biases. To bridge this gap, we propose S²AD (Semantic-Anchored Scale-Agnostic Distillation), a novel easy-to-hard framework that operationalizes subjective assessment as comparative analysis, conceptualizing the judge’s evolution from mimesis to metamorphosis. In Stage 1 (Mimesis), we introduce Dynamic Soft Positioning (DSP) to train the judge to compare a query against retrieved reference images, establishing a relative evaluation space that ensures consistent ordering under heterogeneous scales. In Stage 2 (Metamorphosis), this comparative capability is internalized via Language Buttons—discrete semantic levels serving as a retrieval-free internal reference. Optimized with Group Relative Policy Optimization (GRPO), S²AD achieves efficient, scale-steerable inference that adapts to diverse grading standards. Our framework reaches state-of-the-art performance across multiple benchmarks, validating the effectiveness of internalized comparative priors for robust, rank-invariant, and scale-steerable evaluation. The code is available at: https://github.com/SpatialVision-Research/SSAD_ACL2026_Findings.
%U https://aclanthology.org/2026.findings-acl.631/
%P 12957-12971
Markdown (Informal)
[From Mimesis to Metamorphosis: Evolving VLM Judges via In-Context Comparing and Knowledge Internalization](https://aclanthology.org/2026.findings-acl.631/) (Wang et al., Findings 2026)
ACL