% Workshop paper in the IWSDS 2026 proceedings (record follows the ACL Anthology export).
% The editor nickname "Pino" uses a LaTeX opening-quote ligature (two backticks) on the
% left and a closing-quote ligature (two apostrophes) on the right so both marks typeset
% correctly.
@inproceedings{irmiger-etal-2026-learning,
    author    = {Irmiger, Nicola and
                 Xu, Yixuan and
                 Kreft, Raphael and
                 Davtyan, Aram and
                 Kaufmann, Manuel and
                 Schlag, Imanol},
    title     = {Learning Vision-Language Alignment in Unified {LLM}s with 24 Text Tokens per Image},
    editor    = {Riccardi, Giuseppe and
                 Mousavi, Seyed Mahed and
                 Torres, Maria Ines and
                 Yoshino, Koichiro and
                 Callejas, Zoraida and
                 Chowdhury, Shammur Absar and
                 Chen, Yun-Nung and
                 Bechet, Frederic and
                 Gustafson, Joakim and
                 Damnati, G{\'e}raldine and
                 Papangelis, Alex and
                 D{'}Haro, Luis Fernando and
                 Mendon{\c{c}}a, John and
                 Bernardi, Raffaella and
                 Hakkani-Tur, Dilek and
                 Di Fabbrizio, Giuseppe {``}Pino{''} and
                 Kawahara, Tatsuya and
                 Alam, Firoj and
                 Tur, Gokhan and
                 Johnston, Michael},
    booktitle = {Proceedings of the 16th International Workshop on Spoken Dialogue System Technology},
    pages     = {275--287},
    month     = feb,
    year      = {2026},
    address   = {Trento, Italy},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.iwsds-1.28/}
}