@comment{BibTeX record for the MENTOR paper in Findings of ACL 2026. String values use brace delimiters; month uses the predefined jul macro.}
@inproceedings{zhao-etal-2026-mentor,
  title     = {{MENTOR}: Efficient Autoregressive Image Generation with Balanced Multimodal Control},
  author    = {Zhao, Haozhe and
               Cai, Zefan and
               Si, Shuzheng and
               Chen, Liang and
               Gu, Jiuxiang and
               Xiao, Wen and
               Zhang, Minjia and
               Hu, Junjie},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1508/},
  pages     = {30167--30193},
  isbn      = {979-8-89176-395-1},
  abstract  = {Recent text-to-image models achieve impressive visual quality but still face challenges in precise controllability, balancing multimodal inputs, and high training cost for multimodal image generation. To address these limitations, we propose MENTOR, an autoregressive (AR) framework with a two-stage training paradigm for controllable multimodal image generation: (1) a multimodal alignment stage that establishes robust pixel and semantic-level alignment between inputs and generated tokens, followed by (2) a multimodal instruction tuning stage that balances the model{'}s integration of multimodal inputs and enhances generation controllability. Extensive experiments on DreamBench++ and DreamBench demonstrate that, despite modest model size and training resources, MENTOR achieves a strong balance between textual and visual guidance for controllable image generation, delivering competitive performance at significantly lower computational cost compared to leading baselines. Moreover, our approach attains superior image reconstruction fidelity, broad adaptability across different tasks, and training efficiency.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the MENTOR paper. The relatedItem of type "host" describes the
       containing proceedings volume (title, editors, publisher, place, ISBN). -->
  <mods ID="zhao-etal-2026-mentor">
    <titleInfo>
      <title>MENTOR: Efficient Autoregressive Image Generation with Balanced Multimodal Control</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Haozhe</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zefan</namePart>
      <namePart type="family">Cai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shuzheng</namePart>
      <namePart type="family">Si</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Liang</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiuxiang</namePart>
      <namePart type="family">Gu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wen</namePart>
      <namePart type="family">Xiao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Minjia</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Junjie</namePart>
      <namePart type="family">Hu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Containing volume and its editors -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Recent text-to-image models achieve impressive visual quality but still face challenges in precise controllability, balancing multimodal inputs, and high training cost for multimodal image generation. To address these limitations, we propose MENTOR, an autoregressive (AR) framework with a two-stage training paradigm for controllable multimodal image generation: (1) a multimodal alignment stage that establishes robust pixel and semantic-level alignment between inputs and generated tokens, followed by (2) a multimodal instruction tuning stage that balances the model’s integration of multimodal inputs and enhances generation controllability. Extensive experiments on DreamBench++ and DreamBench demonstrate that, despite modest model size and training resources, MENTOR achieves a strong balance between textual and visual guidance for controllable image generation, delivering competitive performance at significantly lower computational cost compared to leading baselines. Moreover, our approach attains superior image reconstruction fidelity, broad adaptability across different tasks, and training efficiency.</abstract>
    <identifier type="citekey">zhao-etal-2026-mentor</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1508/</url>
    </location>
    <!-- Page range of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>30167</start>
        <end>30193</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T MENTOR: Efficient Autoregressive Image Generation with Balanced Multimodal Control
%A Zhao, Haozhe
%A Cai, Zefan
%A Si, Shuzheng
%A Chen, Liang
%A Gu, Jiuxiang
%A Xiao, Wen
%A Zhang, Minjia
%A Hu, Junjie
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhao-etal-2026-mentor
%X Recent text-to-image models achieve impressive visual quality but still face challenges in precise controllability, balancing multimodal inputs, and high training cost for multimodal image generation. To address these limitations, we propose MENTOR, an autoregressive (AR) framework with a two-stage training paradigm for controllable multimodal image generation: (1) a multimodal alignment stage that establishes robust pixel and semantic-level alignment between inputs and generated tokens, followed by (2) a multimodal instruction tuning stage that balances the model’s integration of multimodal inputs and enhances generation controllability. Extensive experiments on DreamBench++ and DreamBench demonstrate that, despite modest model size and training resources, MENTOR achieves a strong balance between textual and visual guidance for controllable image generation, delivering competitive performance at significantly lower computational cost compared to leading baselines. Moreover, our approach attains superior image reconstruction fidelity, broad adaptability across different tasks, and training efficiency.
%U https://aclanthology.org/2026.findings-acl.1508/
%P 30167-30193
Markdown (Informal)
[MENTOR: Efficient Autoregressive Image Generation with Balanced Multimodal Control](https://aclanthology.org/2026.findings-acl.1508/) (Zhao et al., Findings 2026)
ACL
- Haozhe Zhao, Zefan Cai, Shuzheng Si, Liang Chen, Jiuxiang Gu, Wen Xiao, Minjia Zhang, and Junjie Hu. 2026. MENTOR: Efficient Autoregressive Image Generation with Balanced Multimodal Control. In Findings of the Association for Computational Linguistics: ACL 2026, pages 30167–30193, San Diego, California, United States. Association for Computational Linguistics.