@inproceedings{lo-etal-2025-discoclip,
title = "{D}is{C}o{CLIP}: A Distributional Compositional Tensor Network Encoder for Vision-Language Understanding",
author = {Lo, Kin Ian and
Hawashin, Hala and
Abbaszadeh, Mina and
Limb{\"a}ck-Stokin, Tilen Gaetano and
Wazni, Hadi and
Sadrzadeh, Mehrnoosh},
editor = "Frermann, Lea and
Stevenson, Mark",
booktitle = "Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.starsem-1.25/",
pages = "316--327",
ISBN = "979-8-89176-340-1",
abstract = "Recent vision{--}language models excel at large-scale image{--}text alignment but often neglect the compositional structure of language, leading to failures on tasks that hinge on word order and predicate{--}argument structure. We introduce DisCoCLIP, a multimodal encoder that combines a frozen CLIP vision transformer with a novel tensor network text encoder that explicitly encodes syntactic structure. Sentences are parsed with a Combinatory Categorial Grammar parser to yield distributional word tensors whose contractions mirror the sentence{'}s grammatical derivation. To keep the model efficient, high-order tensors are factorized with tensor decompositions, reducing parameter count from tens of millions to under one million. Trained end-to-end with a self-supervised contrastive loss, DisCoCLIP markedly improves sensitivity to verb semantics and word order: it raises CLIP{'}s SVO-Probes verb accuracy from 77.6{\%} to 82.4{\%}, boosts ARO attribution and relation scores by over 9{\%} and 4{\%}, and achieves 93.7{\%} on a newly introduced SVO-Swap benchmark. These results demonstrate that embedding explicit linguistic structure via tensor networks yields interpretable, parameter-efficient representations that substantially improve compositional reasoning in vision{--}language tasks."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lo-etal-2025-discoclip">
<titleInfo>
<title>DisCoCLIP: A Distributional Compositional Tensor Network Encoder for Vision-Language Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kin</namePart>
<namePart type="given">Ian</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hala</namePart>
<namePart type="family">Hawashin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mina</namePart>
<namePart type="family">Abbaszadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tilen</namePart>
<namePart type="given">Gaetano</namePart>
<namePart type="family">Limbäck-Stokin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hadi</namePart>
<namePart type="family">Wazni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehrnoosh</namePart>
<namePart type="family">Sadrzadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lea</namePart>
<namePart type="family">Frermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Stevenson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-340-1</identifier>
</relatedItem>
<abstract>Recent vision–language models excel at large-scale image–text alignment but often neglect the compositional structure of language, leading to failures on tasks that hinge on word order and predicate–argument structure. We introduce DisCoCLIP, a multimodal encoder that combines a frozen CLIP vision transformer with a novel tensor network text encoder that explicitly encodes syntactic structure. Sentences are parsed with a Combinatory Categorial Grammar parser to yield distributional word tensors whose contractions mirror the sentence’s grammatical derivation. To keep the model efficient, high-order tensors are factorized with tensor decompositions, reducing parameter count from tens of millions to under one million. Trained end-to-end with a self-supervised contrastive loss, DisCoCLIP markedly improves sensitivity to verb semantics and word order: it raises CLIP’s SVO-Probes verb accuracy from 77.6% to 82.4%, boosts ARO attribution and relation scores by over 9% and 4%, and achieves 93.7% on a newly introduced SVO-Swap benchmark. These results demonstrate that embedding explicit linguistic structure via tensor networks yields interpretable, parameter-efficient representations that substantially improve compositional reasoning in vision–language tasks.</abstract>
<identifier type="citekey">lo-etal-2025-discoclip</identifier>
<location>
<url>https://aclanthology.org/2025.starsem-1.25/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>316</start>
<end>327</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DisCoCLIP: A Distributional Compositional Tensor Network Encoder for Vision-Language Understanding
%A Lo, Kin Ian
%A Hawashin, Hala
%A Abbaszadeh, Mina
%A Limbäck-Stokin, Tilen Gaetano
%A Wazni, Hadi
%A Sadrzadeh, Mehrnoosh
%Y Frermann, Lea
%Y Stevenson, Mark
%S Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-340-1
%F lo-etal-2025-discoclip
%X Recent vision–language models excel at large-scale image–text alignment but often neglect the compositional structure of language, leading to failures on tasks that hinge on word order and predicate–argument structure. We introduce DisCoCLIP, a multimodal encoder that combines a frozen CLIP vision transformer with a novel tensor network text encoder that explicitly encodes syntactic structure. Sentences are parsed with a Combinatory Categorial Grammar parser to yield distributional word tensors whose contractions mirror the sentence’s grammatical derivation. To keep the model efficient, high-order tensors are factorized with tensor decompositions, reducing parameter count from tens of millions to under one million. Trained end-to-end with a self-supervised contrastive loss, DisCoCLIP markedly improves sensitivity to verb semantics and word order: it raises CLIP’s SVO-Probes verb accuracy from 77.6% to 82.4%, boosts ARO attribution and relation scores by over 9% and 4%, and achieves 93.7% on a newly introduced SVO-Swap benchmark. These results demonstrate that embedding explicit linguistic structure via tensor networks yields interpretable, parameter-efficient representations that substantially improve compositional reasoning in vision–language tasks.
%U https://aclanthology.org/2025.starsem-1.25/
%P 316-327
Markdown (Informal)
[DisCoCLIP: A Distributional Compositional Tensor Network Encoder for Vision-Language Understanding](https://aclanthology.org/2025.starsem-1.25/) (Lo et al., *SEM 2025)
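The abstract sketches the core mechanism: words become tensors whose contractions follow the CCG derivation, and high-order tensors are factorized to stay parameter-efficient. As a rough, hypothetical illustration of that idea (not the paper's implementation; the dimension d, rank R, and the random stand-in embeddings below are assumptions), a transitive verb can be modeled as an order-3 tensor that is CP-factorized and contracted with subject and object vectors:

# Illustrative sketch only (not the authors' released code): encoding a
# subject-verb-object sentence as a tensor contraction, in the spirit of
# the abstract. A transitive verb is an order-3 tensor; contracting it with
# subject and object vectors yields the sentence vector, and a rank-R CP
# factorization replaces the d**3 dense verb parameters with 3 * d * R.
import numpy as np

d, R = 64, 8                          # embedding dimension and CP rank (assumed sizes)
rng = np.random.default_rng(0)

subj = rng.normal(size=d)             # stand-in distributional vector, e.g. "dog"
obj = rng.normal(size=d)              # stand-in distributional vector, e.g. "ball"

# Dense verb tensor: d**3 parameters (262,144 even at this toy size).
verb_dense = rng.normal(size=(d, d, d))
sent_dense = np.einsum("ijk,j,k->i", verb_dense, subj, obj)

# CP-factorized verb: T[i, j, k] = sum_r A[i, r] * B[j, r] * C[k, r].
A, B, C = (rng.normal(size=(d, R)) for _ in range(3))

# The same subject-verb-object contraction without ever materializing the
# d x d x d tensor: contract each factor with its argument, then recombine.
sent_cp = A @ ((B.T @ subj) * (C.T @ obj))

print(sent_dense.shape, sent_cp.shape)          # (64,) (64,)
print(f"dense: {d**3:,} params, CP: {3 * d * R:,} params")

At a CLIP-like dimension of d = 512, a dense verb tensor would need 512³ ≈ 134 million parameters, while the rank-8 CP form needs only 3 · 512 · 8 = 12,288, the kind of reduction (tens of millions down to under one million across the lexicon) that the abstract reports.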