@inproceedings{liu-yu-2025-mt2st,
    title = "{MT}2{ST}: Adaptive Multi-Task to Single-Task Learning",
    author = "Liu, Dong and
      Yu, Yanxuan",
    editor = "Kriz, Reno and
      Murray, Kenton",
    booktitle = "Proceedings of the 1st Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2025)",
    month = aug,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.magmar-1.8/",
    doi = "10.18653/v1/2025.magmar-1.8",
    pages = "79--89",
    ISBN = "979-8-89176-280-0",
    abstract = "We propose \textbf{MT2ST}, a general and efficient framework for accelerating multi-task training by progressively transitioning to single-task optimization. Unlike conventional multi-task learning (MTL) or single-task fine-tuning (STL), MT2ST dynamically adjusts the training focus via two complementary strategies: \textit{Diminish}, which gradually down-weights auxiliary losses, and \textit{Switch}, which explicitly switches to the primary task at a scheduled point. We demonstrate the effectiveness of MT2ST across three key paradigms: representation learning, transformers, and diffusion models, covering both unimodal (text/image) and multimodal (vision-language) tasks. Extensive experiments show that MT2ST significantly improves training efficiency{---}achieving up to 56{\%} FLOPs compression{---}while maintaining or surpassing task performance. These results suggest MT2ST as a general-purpose solution for scalable and adaptive multi-task training. Although this work is general-purpose, it is especially suitable for multimodal settings such as VQA or vision-language retrieval, where auxiliary pretraining (e.g., masked language modeling or contrastive learning) often diverges from final objectives. We include a VQA case study and outline its efficiency for multimodal retrieval."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="liu-yu-2025-mt2st">
    <titleInfo>
      <title>MT2ST: Adaptive Multi-Task to Single-Task Learning</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Dong</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yanxuan</namePart>
      <namePart type="family">Yu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Reno</namePart>
        <namePart type="family">Kriz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kenton</namePart>
        <namePart type="family">Murray</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-280-0</identifier>
    </relatedItem>
    <abstract>We propose MT2ST, a general and efficient framework for accelerating multi-task training by progressively transitioning to single-task optimization. Unlike conventional multi-task learning (MTL) or single-task fine-tuning (STL), MT2ST dynamically adjusts the training focus via two complementary strategies: Diminish, which gradually down-weights auxiliary losses, and Switch, which explicitly switches to the primary task at a scheduled point. We demonstrate the effectiveness of MT2ST across three key paradigms: representation learning, transformers, and diffusion models, covering both unimodal (text/image) and multimodal (vision-language) tasks. Extensive experiments show that MT2ST significantly improves training efficiency—achieving up to 56% FLOPs compression—while maintaining or surpassing task performance. These results suggest MT2ST as a general-purpose solution for scalable and adaptive multi-task training. Although this work is general-purpose, it is especially suitable for multimodal settings such as VQA or vision-language retrieval, where auxiliary pretraining (e.g., masked language modeling or contrastive learning) often diverges from final objectives. We include a VQA case study and outline its efficiency for multimodal retrieval.</abstract>
    <identifier type="citekey">liu-yu-2025-mt2st</identifier>
    <identifier type="doi">10.18653/v1/2025.magmar-1.8</identifier>
    <location>
      <url>https://aclanthology.org/2025.magmar-1.8/</url>
    </location>
    <part>
      <date>2025-08</date>
      <extent unit="page">
        <start>79</start>
        <end>89</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T MT2ST: Adaptive Multi-Task to Single-Task Learning
%A Liu, Dong
%A Yu, Yanxuan
%Y Kriz, Reno
%Y Murray, Kenton
%S Proceedings of the 1st Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2025)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-280-0
%F liu-yu-2025-mt2st
%X We propose MT2ST, a general and efficient framework for accelerating multi-task training by progressively transitioning to single-task optimization. Unlike conventional multi-task learning (MTL) or single-task fine-tuning (STL), MT2ST dynamically adjusts the training focus via two complementary strategies: Diminish, which gradually down-weights auxiliary losses, and Switch, which explicitly switches to the primary task at a scheduled point. We demonstrate the effectiveness of MT2ST across three key paradigms: representation learning, transformers, and diffusion models, covering both unimodal (text/image) and multimodal (vision-language) tasks. Extensive experiments show that MT2ST significantly improves training efficiency—achieving up to 56% FLOPs compression—while maintaining or surpassing task performance. These results suggest MT2ST as a general-purpose solution for scalable and adaptive multi-task training. Although this work is general-purpose, it is especially suitable for multimodal settings such as VQA or vision-language retrieval, where auxiliary pretraining (e.g., masked language modeling or contrastive learning) often diverges from final objectives. We include a VQA case study and outline its efficiency for multimodal retrieval.
%R 10.18653/v1/2025.magmar-1.8
%U https://aclanthology.org/2025.magmar-1.8/
%U https://doi.org/10.18653/v1/2025.magmar-1.8
%P 79-89
Markdown (Informal)
[MT2ST: Adaptive Multi-Task to Single-Task Learning](https://aclanthology.org/2025.magmar-1.8/) (Liu & Yu, MAGMaR 2025)
ACL
- Dong Liu and Yanxuan Yu. 2025. MT2ST: Adaptive Multi-Task to Single-Task Learning. In Proceedings of the 1st Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2025), pages 79–89, Vienna, Austria. Association for Computational Linguistics.
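The abstract describes MT2ST's two scheduling strategies only in prose. Below is a minimal sketch of one plausible reading, in which the total objective is L_total(t) = L_primary + λ(t) · Σ L_aux and Diminish and Switch differ only in how λ(t) is scheduled. All function names, the exponential-decay form, and the default parameters here are illustrative assumptions, not the authors' actual implementation.

```python
# A minimal sketch of the Diminish and Switch schedules described in the
# abstract. Everything here (names, the exponential-decay form, defaults)
# is an illustrative assumption, not the paper's implementation.
import math


def diminish_weight(step: int, total_steps: int, decay: float = 5.0) -> float:
    """Diminish: smoothly down-weight auxiliary losses as training proceeds.
    (Exponential decay is one plausible schedule among many.)"""
    return math.exp(-decay * step / total_steps)


def switch_weight(step: int, switch_step: int) -> float:
    """Switch: full multi-task weight until a scheduled step, then zero,
    i.e., training continues on the primary task alone."""
    return 1.0 if step < switch_step else 0.0


def mt2st_loss(primary_loss: float, aux_losses: list[float], weight: float) -> float:
    """Combined objective: L_total = L_primary + lambda(t) * sum(L_aux)."""
    return primary_loss + weight * sum(aux_losses)
```

Under either schedule, training reduces to single-task optimization once λ(t) reaches zero, which is presumably what lets later steps skip the auxiliary-loss computation and yields the FLOPs savings the abstract reports.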