@inproceedings{bulat-etal-2024-efficient,
title = "Efficient Vision-Language pre-training via domain-specific learning for human activities",
author = "Bulat, Adrian and
Ouali, Yassine and
Guerrero, Ricardo and
Martinez, Brais and
Tzimiropoulos, Georgios",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.454",
pages = "7978--8000",
abstract = "Current Vision-Language (VL) models owe their success to large-scale pre-training on web-collected data, which in turn requires high-capacity architectures and large compute resources for training. We posit that when the downstream tasks are known in advance, which is in practice common, the pretraining process can be aligned to the downstream domain, leading to more efficient and accurate models, while shortening the pretraining step. To this end, we introduce a domain-aligned pretraining strategy that, without additional data collection, improves the accuracy on a domain of interest, herein, that of human activities, while largely preserving the generalist knowledge. At the core of our approach stands a new LLM-based method that, provided with a simple set of concept seeds, produces a concept hierarchy with high coverage of the target domain.The concept hierarchy is used to filter a large-scale web-crawled dataset and, then, enhance the resulting instances with targeted synthetic labels. We study in depth how to train such approaches and their resulting behavior. We further show generalization to video-based data by introducing a fast adaptation approach for transitioning from a static (image) model to a dynamic one (i.e. with temporal modeling). On the domain of interest, our approach significantly outperforms models trained on up to $60\times$ more samples and between $10-100\times$ shorter training schedules for image retrieval, video retrieval and action recognition. Code will be released.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="bulat-etal-2024-efficient">
    <titleInfo>
      <title>Efficient Vision-Language pre-training via domain-specific learning for human activities</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Adrian</namePart>
      <namePart type="family">Bulat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yassine</namePart>
      <namePart type="family">Ouali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ricardo</namePart>
      <namePart type="family">Guerrero</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Brais</namePart>
      <namePart type="family">Martinez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Georgios</namePart>
      <namePart type="family">Tzimiropoulos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Current Vision-Language (VL) models owe their success to large-scale pre-training on web-collected data, which in turn requires high-capacity architectures and large compute resources for training. We posit that when the downstream tasks are known in advance, which is in practice common, the pretraining process can be aligned to the downstream domain, leading to more efficient and accurate models, while shortening the pretraining step. To this end, we introduce a domain-aligned pretraining strategy that, without additional data collection, improves the accuracy on a domain of interest, herein, that of human activities, while largely preserving the generalist knowledge. At the core of our approach stands a new LLM-based method that, provided with a simple set of concept seeds, produces a concept hierarchy with high coverage of the target domain. The concept hierarchy is used to filter a large-scale web-crawled dataset and, then, enhance the resulting instances with targeted synthetic labels. We study in depth how to train such approaches and their resulting behavior. We further show generalization to video-based data by introducing a fast adaptation approach for transitioning from a static (image) model to a dynamic one (i.e. with temporal modeling). On the domain of interest, our approach significantly outperforms models trained on up to 60\times more samples and between 10-100\times shorter training schedules for image retrieval, video retrieval and action recognition. Code will be released.</abstract>
    <identifier type="citekey">bulat-etal-2024-efficient</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-main.454</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>7978</start>
        <end>8000</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Efficient Vision-Language pre-training via domain-specific learning for human activities
%A Bulat, Adrian
%A Ouali, Yassine
%A Guerrero, Ricardo
%A Martinez, Brais
%A Tzimiropoulos, Georgios
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F bulat-etal-2024-efficient
%X Current Vision-Language (VL) models owe their success to large-scale pre-training on web-collected data, which in turn requires high-capacity architectures and large compute resources for training. We posit that when the downstream tasks are known in advance, which is in practice common, the pretraining process can be aligned to the downstream domain, leading to more efficient and accurate models, while shortening the pretraining step. To this end, we introduce a domain-aligned pretraining strategy that, without additional data collection, improves the accuracy on a domain of interest, herein, that of human activities, while largely preserving the generalist knowledge. At the core of our approach stands a new LLM-based method that, provided with a simple set of concept seeds, produces a concept hierarchy with high coverage of the target domain. The concept hierarchy is used to filter a large-scale web-crawled dataset and, then, enhance the resulting instances with targeted synthetic labels. We study in depth how to train such approaches and their resulting behavior. We further show generalization to video-based data by introducing a fast adaptation approach for transitioning from a static (image) model to a dynamic one (i.e. with temporal modeling). On the domain of interest, our approach significantly outperforms models trained on up to 60\times more samples and between 10-100\times shorter training schedules for image retrieval, video retrieval and action recognition. Code will be released.
%U https://aclanthology.org/2024.emnlp-main.454
%P 7978-8000
Markdown (Informal)
[Efficient Vision-Language pre-training via domain-specific learning for human activities](https://aclanthology.org/2024.emnlp-main.454) (Bulat et al., EMNLP 2024)