@inproceedings{gu-etal-2024-rwkv,
    title = "{RWKV}-{CLIP}: A Robust Vision-Language Representation Learner",
    author = "Gu, Tiancheng and
      Yang, Kaicheng and
      An, Xiang and
      Feng, Ziyong and
      Liu, Dongnan and
      Cai, Weidong and
      Deng, Jiankang",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-main.276",
    pages = "4799--4812",
    abstract = "Contrastive Language-Image Pre-training (CLIP) has significantly improved performance in various vision-language tasks by expanding the dataset with image-text pairs obtained from the web. This paper further explores CLIP from the perspectives of data and model architecture. To mitigate the impact of the noise data and enhance the quality of large-scale image-text data crawled from the internet, we introduce a diverse description generation framework that can leverage Large Language Models (LLMs) to combine and refine information from web-based image-text pairs, synthetic captions, and detection tags. Additionally, we propose RWKV-CLIP, the first RWKV-driven vision-language representation learning model that combines the effective parallel training of transformers with the efficient inference of RNNs. Extensive experiments across different model scales and pre-training datasets demonstrate that RWKV-CLIP is a robust vision-language representation learner and it achieves state-of-the-art performance across multiple downstream tasks, including linear probing, zero-shot classification, and zero-shot image-text retrieval. To facilitate future research, the code and pre-trained models are released at https://github.com/deepglint/RWKV-CLIP.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="gu-etal-2024-rwkv">
    <titleInfo>
      <title>RWKV-CLIP: A Robust Vision-Language Representation Learner</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Tiancheng</namePart>
      <namePart type="family">Gu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kaicheng</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiang</namePart>
      <namePart type="family">An</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ziyong</namePart>
      <namePart type="family">Feng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dongnan</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Weidong</namePart>
      <namePart type="family">Cai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiankang</namePart>
      <namePart type="family">Deng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Contrastive Language-Image Pre-training (CLIP) has significantly improved performance in various vision-language tasks by expanding the dataset with image-text pairs obtained from the web. This paper further explores CLIP from the perspectives of data and model architecture. To mitigate the impact of the noise data and enhance the quality of large-scale image-text data crawled from the internet, we introduce a diverse description generation framework that can leverage Large Language Models (LLMs) to combine and refine information from web-based image-text pairs, synthetic captions, and detection tags. Additionally, we propose RWKV-CLIP, the first RWKV-driven vision-language representation learning model that combines the effective parallel training of transformers with the efficient inference of RNNs. Extensive experiments across different model scales and pre-training datasets demonstrate that RWKV-CLIP is a robust vision-language representation learner and it achieves state-of-the-art performance across multiple downstream tasks, including linear probing, zero-shot classification, and zero-shot image-text retrieval. To facilitate future research, the code and pre-trained models are released at https://github.com/deepglint/RWKV-CLIP.</abstract>
    <identifier type="citekey">gu-etal-2024-rwkv</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-main.276</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>4799</start>
        <end>4812</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T RWKV-CLIP: A Robust Vision-Language Representation Learner
%A Gu, Tiancheng
%A Yang, Kaicheng
%A An, Xiang
%A Feng, Ziyong
%A Liu, Dongnan
%A Cai, Weidong
%A Deng, Jiankang
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F gu-etal-2024-rwkv
%X Contrastive Language-Image Pre-training (CLIP) has significantly improved performance in various vision-language tasks by expanding the dataset with image-text pairs obtained from the web. This paper further explores CLIP from the perspectives of data and model architecture. To mitigate the impact of the noise data and enhance the quality of large-scale image-text data crawled from the internet, we introduce a diverse description generation framework that can leverage Large Language Models (LLMs) to combine and refine information from web-based image-text pairs, synthetic captions, and detection tags. Additionally, we propose RWKV-CLIP, the first RWKV-driven vision-language representation learning model that combines the effective parallel training of transformers with the efficient inference of RNNs. Extensive experiments across different model scales and pre-training datasets demonstrate that RWKV-CLIP is a robust vision-language representation learner and it achieves state-of-the-art performance across multiple downstream tasks, including linear probing, zero-shot classification, and zero-shot image-text retrieval. To facilitate future research, the code and pre-trained models are released at https://github.com/deepglint/RWKV-CLIP.
%U https://aclanthology.org/2024.emnlp-main.276
%P 4799-4812
Markdown (Informal)
[RWKV-CLIP: A Robust Vision-Language Representation Learner](https://aclanthology.org/2024.emnlp-main.276) (Gu et al., EMNLP 2024)

ACL
Tiancheng Gu, Kaicheng Yang, Xiang An, Ziyong Feng, Dongnan Liu, Weidong Cai, and Jiankang Deng. 2024. RWKV-CLIP: A Robust Vision-Language Representation Learner. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 4799–4812, Miami, Florida, USA. Association for Computational Linguistics.
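The abstract above describes CLIP-style contrastive pre-training over paired image and text embeddings. As a purely illustrative sketch of that objective, and not the released RWKV-CLIP implementation (the batch size, embedding dimension, and temperature below are assumed for the example), the symmetric contrastive loss at the core of such training can be written as:

```python
# Illustrative sketch of a CLIP-style symmetric contrastive objective
# (InfoNCE over a batch of paired image/text embeddings). This is NOT the
# released RWKV-CLIP code; encoders are omitted and inputs are random.
import torch
import torch.nn.functional as F


def clip_contrastive_loss(image_emb: torch.Tensor,
                          text_emb: torch.Tensor,
                          temperature: float = 0.07) -> torch.Tensor:
    """Symmetric cross-entropy over image-to-text and text-to-image similarities."""
    # L2-normalize so the dot product is a cosine similarity.
    image_emb = F.normalize(image_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = image_emb @ text_emb.t() / temperature               # (B, B) similarity matrix
    targets = torch.arange(logits.size(0), device=logits.device)  # matched pairs lie on the diagonal
    loss_i2t = F.cross_entropy(logits, targets)                   # image -> text direction
    loss_t2i = F.cross_entropy(logits.t(), targets)               # text -> image direction
    return 0.5 * (loss_i2t + loss_t2i)


if __name__ == "__main__":
    # Hypothetical batch of 8 paired embeddings with dimension 512.
    img = torch.randn(8, 512)
    txt = torch.randn(8, 512)
    print(clip_contrastive_loss(img, txt))
```

In RWKV-CLIP the image and text encoders are RWKV-driven rather than pure transformers, but a contrastive objective of this form only requires a batch of aligned embedding pairs, so it is agnostic to the encoder architecture.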