@inproceedings{xu-etal-2026-ftibsuite,
title = "{FT}ib{S}uite: A Comprehensive Resource Suite for {T}ibetan Vision{--}Language Modeling",
author = "Xu, Guixian and
Liang, Yide and
Su, Zeli and
Song, Xuexian and
Zhang, Ziyin and
Dong, Yushuang and
Zhang, Ting and
Han, Xu",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.903/",
pages = "18143--18159",
ISBN = "979-8-89176-395-1",
abstract = "Vision{--}language models (VLMs) have progressed rapidly, but Tibetan remains largely underserved due to the lack of infrastructure for reproducible training and evaluation. To help address this gap, we introduce FTibSuite, a resource-centric foundation for Tibetan VLM research that provides an end-to-end training-and-evaluation workflow and includes human-verified multimodal annotations, partially filling a long-standing shortage of Tibetan multimodal resources. FTibSuite comprises FTibData, FTibBench, and a reproducible baseline model, FTibVLM, built on Qwen3-VL-8B-Instruct. FTibVLM adopts a three-stage adaptation pipeline consisting of Tibetan continual pretraining, image{--}text alignment, and multimodal instruction tuning. For systematic evaluation, FTibBench adapts five established multimodal benchmarks to Tibetan and offers a reproducible evaluation protocol to support consistent comparisons across models. Specifically, FTibBench includes Tibetan versions of MMBench, MME, POPE, BinaryVQA, and COREVQA. Experiments on FTibBench demonstrate that FTibVLM consistently improves Tibetan multimodal performance. For instance, FTibVLM attains 76.01 accuracy on BinaryVQA, indicating that Tibetan performance can be competitive with high-resource settings on this diagnostic task. We also observe substantial gains on other benchmarks, including an improvement on MMBench (dev) from 42.97 to 67.78 and an increase in POPE-random accuracy from 47.53 to 80.56, underscoring the practical value of the proposed workflow and resources."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xu-etal-2026-ftibsuite">
<titleInfo>
<title>FTibSuite: A Comprehensive Resource Suite for Tibetan Vision–Language Modeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guixian</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yide</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeli</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuexian</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziyin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yushuang</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ting</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xu</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Vision–language models (VLMs) have progressed rapidly, but Tibetan remains largely underserved due to the lack of infrastructure for reproducible training and evaluation. To help address this gap, we introduce FTibSuite, a resource-centric foundation for Tibetan VLM research that provides an end-to-end training-and-evaluation workflow and includes human-verified multimodal annotations, partially filling a long-standing shortage of Tibetan multimodal resources. FTibSuite comprises FTibData, FTibBench, and a reproducible baseline model, FTibVLM, built on Qwen3-VL-8B-Instruct. FTibVLM adopts a three-stage adaptation pipeline consisting of Tibetan continual pretraining, image–text alignment, and multimodal instruction tuning. For systematic evaluation, FTibBench adapts five established multimodal benchmarks to Tibetan and offers a reproducible evaluation protocol to support consistent comparisons across models. Specifically, FTibBench includes Tibetan versions of MMBench, MME, POPE, BinaryVQA, and COREVQA. Experiments on FTibBench demonstrate that FTibVLM consistently improves Tibetan multimodal performance. For instance, FTibVLM attains 76.01 accuracy on BinaryVQA, indicating that Tibetan performance can be competitive with high-resource settings on this diagnostic task. We also observe substantial gains on other benchmarks, including an improvement on MMBench (dev) from 42.97 to 67.78 and an increase in POPE-random accuracy from 47.53 to 80.56, underscoring the practical value of the proposed workflow and resources.</abstract>
<identifier type="citekey">xu-etal-2026-ftibsuite</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.903/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>18143</start>
<end>18159</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FTibSuite: A Comprehensive Resource Suite for Tibetan Vision–Language Modeling
%A Xu, Guixian
%A Liang, Yide
%A Su, Zeli
%A Song, Xuexian
%A Zhang, Ziyin
%A Dong, Yushuang
%A Zhang, Ting
%A Han, Xu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F xu-etal-2026-ftibsuite
%X Vision–language models (VLMs) have progressed rapidly, but Tibetan remains largely underserved due to the lack of infrastructure for reproducible training and evaluation. To help address this gap, we introduce FTibSuite, a resource-centric foundation for Tibetan VLM research that provides an end-to-end training-and-evaluation workflow and includes human-verified multimodal annotations, partially filling a long-standing shortage of Tibetan multimodal resources. FTibSuite comprises FTibData, FTibBench, and a reproducible baseline model, FTibVLM, built on Qwen3-VL-8B-Instruct. FTibVLM adopts a three-stage adaptation pipeline consisting of Tibetan continual pretraining, image–text alignment, and multimodal instruction tuning. For systematic evaluation, FTibBench adapts five established multimodal benchmarks to Tibetan and offers a reproducible evaluation protocol to support consistent comparisons across models. Specifically, FTibBench includes Tibetan versions of MMBench, MME, POPE, BinaryVQA, and COREVQA. Experiments on FTibBench demonstrate that FTibVLM consistently improves Tibetan multimodal performance. For instance, FTibVLM attains 76.01 accuracy on BinaryVQA, indicating that Tibetan performance can be competitive with high-resource settings on this diagnostic task. We also observe substantial gains on other benchmarks, including an improvement on MMBench (dev) from 42.97 to 67.78 and an increase in POPE-random accuracy from 47.53 to 80.56, underscoring the practical value of the proposed workflow and resources.
%U https://aclanthology.org/2026.findings-acl.903/
%P 18143-18159
Markdown (Informal)
[FTibSuite: A Comprehensive Resource Suite for Tibetan Vision–Language Modeling](https://aclanthology.org/2026.findings-acl.903/) (Xu et al., Findings 2026)
ACL
- Guixian Xu, Yide Liang, Zeli Su, Xuexian Song, Ziyin Zhang, Yushuang Dong, Ting Zhang, and Xu Han. 2026. FTibSuite: A Comprehensive Resource Suite for Tibetan Vision–Language Modeling. In Findings of the Association for Computational Linguistics: ACL 2026, pages 18143–18159, San Diego, California, United States. Association for Computational Linguistics.