@inproceedings{preeti-etal-2026-semantically,
title = "Semantically Aware Optimal Transport for Dense Label Transfer",
author = "Preeti and
Ravish, Kiran and
Kushwaha, Ankita and
Kumar, Pawan",
editor = "Yan, Qianqi and
Montariol, Syrielle and
Fan, Yue and
Gu, Jing and
Pan, Jiayi and
Li, Manling and
Kordjamshidi, Parisa and
Suhr, Alane and
Wang, Xin Eric",
booktitle = "Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.alvr-main.3/",
pages = "18--45",
ISBN = "979-8-89176-398-2",
abstract = "Vision foundation models produce features that generalize across visual domains without fine-tuning, yet naively transferring labels through these feature spaces fails under large distribution shifts. We propose SAOT (**S**emantically **A**ware **O**ptimal **T**ransport), which learns a transport cost within a fused unbalanced optimal transport formulation for dense label transfer from frozen vision transformer features to new domains. SAOT combines a learnable appearance metric with semantic class-prototype priors, unbalanced transport for partial matching under distribution shift, and a block-sparse solver for tractable inference. We pair this with a two-stage decoder: an MLP trained on SAOT pseudo-labels, then refined via EMA-teacher self-training with class-balanced sampling. On GTA5$\to$Cityscapes with frozen DINOv2 ViT-L/14 features, SAOT+Decoder reaches 25.7{\%} mIoU, a **3.8$\times$** improvement over nearest-neighbor transfer (6.7{\%}), without any backbone adaptation. Per-class results show large gains on spatially coherent classes (road 90.3{\%}, car 76.2{\%}, building 71.5{\%}), demonstrating that learned semantic transport costs capture domain-invariant structure even under severe synthetic-to-real shifts. On VOC train$\to$val with frozen ViT-B/16 features, the full pipeline reaches 47.5{\%} mIoU, indicating that the approach extends beyond synthetic-to-real adaptation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="preeti-etal-2026-semantically">
<titleInfo>
<title>Semantically Aware Optimal Transport for Dense Label Transfer</title>
</titleInfo>
<name>
<namePart>Preeti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kiran</namePart>
<namePart type="family">Ravish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ankita</namePart>
<namePart type="family">Kushwaha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pawan</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Qianqi</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Syrielle</namePart>
<namePart type="family">Montariol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiayi</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manling</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parisa</namePart>
<namePart type="family">Kordjamshidi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alane</namePart>
<namePart type="family">Suhr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="given">Eric</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-398-2</identifier>
</relatedItem>
<abstract>Vision foundation models produce features that generalize across visual domains without fine-tuning, yet naively transferring labels through these feature spaces fails under large distribution shifts. We propose SAOT (**S**emantically **A**ware **O**ptimal **T**ransport), which learns a transport cost within a fused unbalanced optimal transport formulation for dense label transfer from frozen vision transformer features to new domains. SAOT combines a learnable appearance metric with semantic class-prototype priors, unbalanced transport for partial matching under distribution shift, and a block-sparse solver for tractable inference. We pair this with a two-stage decoder: an MLP trained on SAOT pseudo-labels, then refined via EMA-teacher self-training with class-balanced sampling. On GTA5→Cityscapes with frozen DINOv2 ViT-L/14 features, SAOT+Decoder reaches 25.7% mIoU, a **3.8×** improvement over nearest-neighbor transfer (6.7%), without any backbone adaptation. Per-class results show large gains on spatially coherent classes (road 90.3%, car 76.2%, building 71.5%), demonstrating that learned semantic transport costs capture domain-invariant structure even under severe synthetic-to-real shifts. On VOC train→val with frozen ViT-B/16 features, the full pipeline reaches 47.5% mIoU, indicating that the approach extends beyond synthetic-to-real adaptation.</abstract>
<identifier type="citekey">preeti-etal-2026-semantically</identifier>
<location>
<url>https://aclanthology.org/2026.alvr-main.3/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>18</start>
<end>45</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Semantically Aware Optimal Transport for Dense Label Transfer
%A Ravish, Kiran
%A Kushwaha, Ankita
%A Kumar, Pawan
%Y Yan, Qianqi
%Y Montariol, Syrielle
%Y Fan, Yue
%Y Gu, Jing
%Y Pan, Jiayi
%Y Li, Manling
%Y Kordjamshidi, Parisa
%Y Suhr, Alane
%Y Wang, Xin Eric
%A Preeti
%S Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-398-2
%F preeti-etal-2026-semantically
%X Vision foundation models produce features that generalize across visual domains without fine-tuning, yet naively transferring labels through these feature spaces fails under large distribution shifts. We propose SAOT (**S**emantically **A**ware **O**ptimal **T**ransport), which learns a transport cost within a fused unbalanced optimal transport formulation for dense label transfer from frozen vision transformer features to new domains. SAOT combines a learnable appearance metric with semantic class-prototype priors, unbalanced transport for partial matching under distribution shift, and a block-sparse solver for tractable inference. We pair this with a two-stage decoder: an MLP trained on SAOT pseudo-labels, then refined via EMA-teacher self-training with class-balanced sampling. On GTA5→Cityscapes with frozen DINOv2 ViT-L/14 features, SAOT+Decoder reaches 25.7% mIoU, a **3.8×** improvement over nearest-neighbor transfer (6.7%), without any backbone adaptation. Per-class results show large gains on spatially coherent classes (road 90.3%, car 76.2%, building 71.5%), demonstrating that learned semantic transport costs capture domain-invariant structure even under severe synthetic-to-real shifts. On VOC train→val with frozen ViT-B/16 features, the full pipeline reaches 47.5% mIoU, indicating that the approach extends beyond synthetic-to-real adaptation.
%U https://aclanthology.org/2026.alvr-main.3/
%P 18-45
Markdown (Informal)
[Semantically Aware Optimal Transport for Dense Label Transfer](https://aclanthology.org/2026.alvr-main.3/) (Preeti et al., ALVR 2026)
ACL