% Workshop paper record; source listing: ACL Anthology 2026.iwsds-1.28 (see url field).
% The nickname in the editor list uses `` to open and '' to close the quotation.
@inproceedings{irmiger-etal-2026-learning,
  title     = {Learning Vision-Language Alignment in Unified {LLMs} with 24 Text Tokens per Image},
  author    = {Irmiger, Nicola and
               Xu, Yixuan and
               Kreft, Raphael and
               Davtyan, Aram and
               Kaufmann, Manuel and
               Schlag, Imanol},
  editor    = {Riccardi, Giuseppe and
               Mousavi, Seyed Mahed and
               Torres, Maria Ines and
               Yoshino, Koichiro and
               Callejas, Zoraida and
               Chowdhury, Shammur Absar and
               Chen, Yun-Nung and
               Bechet, Frederic and
               Gustafson, Joakim and
               Damnati, G{\'e}raldine and
               Papangelis, Alex and
               D{'}Haro, Luis Fernando and
               Mendon{\c{c}}a, John and
               Bernardi, Raffaella and
               Hakkani-Tur, Dilek and
               Di Fabbrizio, Giuseppe {``}Pino{''} and
               Kawahara, Tatsuya and
               Alam, Firoj and
               Tur, Gokhan and
               Johnston, Michael},
  booktitle = {Proceedings of the 16th International Workshop on Spoken Dialogue System Technology},
  month     = feb,
  year      = {2026},
  address   = {Trento, Italy},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.iwsds-1.28/},
  pages     = {275--287},
}