@inproceedings{reijtenbach-etal-2025-dataset,
title = "Dataset Creation for Visual Entailment using Generative {AI}",
author = "Reijtenbach, Rob and
Verberne, Suzan and
Wijnholds, Gijs",
editor = "Abzianidze, Lasha and
de Paiva, Valeria",
booktitle = "Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA)",
month = aug,
year = "2025",
address = "Bochum, Germany",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naloma-1.2/",
pages = "8--17",
ISBN = "979-8-89176-287-9",
abstract = "In this paper we present and validate a new synthetic dataset for training visual entailment models. Existing datasets for visual entailment are small and sparse compared to datasets for textual entailment. Manually creating datasets is labor-intensive. We base our synthetic dataset on the SNLI dataset for textual entailment. We take the premise text from SNLI as input prompts in a generative image model, Stable Diffusion, creating an image to replace each textual premise. We evaluate our dataset both intrinsically and extrinsically. For extrinsic evaluation, we evaluate the validity of the generated images by using them as training data for a visual entailment classifier based on CLIP feature vectors. We find that synthetic training data only leads to a slight drop in quality on SNLI-VE, with an F-score 0.686 compared to 0.703 when trained on real data. We also compare the quality of our generated training data to original training data on another dataset: SICK-VTE. Again, there is only a slight drop in F-score: from 0.400 to 0.384. These results indicate that in settings with data sparsity, synthetic data can be a promising solution for training visual entailment models."
}
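The abstract describes a two-step pipeline: each SNLI premise text is fed as a prompt to Stable Diffusion, and the resulting image replaces the textual premise. Below is a minimal sketch of that generation step, assuming the Hugging Face diffusers and datasets libraries and the runwayml/stable-diffusion-v1-5 checkpoint; the paper does not specify these details, so treat the model choice and file naming as illustrative assumptions.

```python
# Sketch: turn SNLI premise sentences into premise images with Stable Diffusion.
# Assumptions (not from the paper): the diffusers library, the
# "runwayml/stable-diffusion-v1-5" checkpoint, and the output naming scheme.
import torch
from datasets import load_dataset
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

snli = load_dataset("snli", split="train")

for i, example in enumerate(snli.select(range(100))):  # small demo slice
    premise = example["premise"]
    image = pipe(premise).images[0]     # one generated image per textual premise
    image.save(f"premise_{i:05d}.png")  # the image replaces the premise text
```

Pairing each generated image with the original SNLI hypothesis and entailment label then yields the synthetic visual entailment training set the abstract describes.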
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="reijtenbach-etal-2025-dataset">
<titleInfo>
<title>Dataset Creation for Visual Entailment using Generative AI</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">Reijtenbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suzan</namePart>
<namePart type="family">Verberne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gijs</namePart>
<namePart type="family">Wijnholds</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lasha</namePart>
<namePart type="family">Abzianidze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valeria</namePart>
<namePart type="family">de Paiva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bochum, Germany</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-287-9</identifier>
</relatedItem>
<abstract>In this paper we present and validate a new synthetic dataset for training visual entailment models. Existing datasets for visual entailment are small and sparse compared to datasets for textual entailment. Manually creating datasets is labor-intensive. We base our synthetic dataset on the SNLI dataset for textual entailment. We take the premise text from SNLI as input prompts in a generative image model, Stable Diffusion, creating an image to replace each textual premise. We evaluate our dataset both intrinsically and extrinsically. For extrinsic evaluation, we evaluate the validity of the generated images by using them as training data for a visual entailment classifier based on CLIP feature vectors. We find that synthetic training data only leads to a slight drop in quality on SNLI-VE, with an F-score 0.686 compared to 0.703 when trained on real data. We also compare the quality of our generated training data to original training data on another dataset: SICK-VTE. Again, there is only a slight drop in F-score: from 0.400 to 0.384. These results indicate that in settings with data sparsity, synthetic data can be a promising solution for training visual entailment models.</abstract>
<identifier type="citekey">reijtenbach-etal-2025-dataset</identifier>
<location>
<url>https://aclanthology.org/2025.naloma-1.2/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>8</start>
<end>17</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dataset Creation for Visual Entailment using Generative AI
%A Reijtenbach, Rob
%A Verberne, Suzan
%A Wijnholds, Gijs
%Y Abzianidze, Lasha
%Y de Paiva, Valeria
%S Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Bochum, Germany
%@ 979-8-89176-287-9
%F reijtenbach-etal-2025-dataset
%X In this paper, we present and validate a new synthetic dataset for training visual entailment models. Existing datasets for visual entailment are small and sparse compared to datasets for textual entailment, and creating such datasets manually is labor-intensive. We base our synthetic dataset on the SNLI dataset for textual entailment: we use the premise texts from SNLI as input prompts to a generative image model, Stable Diffusion, creating an image to replace each textual premise. We evaluate our dataset both intrinsically and extrinsically. For the extrinsic evaluation, we assess the validity of the generated images by using them as training data for a visual entailment classifier based on CLIP feature vectors. We find that synthetic training data leads to only a slight drop in quality on SNLI-VE, with an F-score of 0.686 compared to 0.703 when trained on real data. We also compare the quality of our generated training data to the original training data on another dataset, SICK-VTE, and again observe only a slight drop in F-score: from 0.400 to 0.384. These results indicate that in settings with data sparsity, synthetic data can be a promising solution for training visual entailment models.
%U https://aclanthology.org/2025.naloma-1.2/
%P 8-17
Markdown (Informal)
[Dataset Creation for Visual Entailment using Generative AI](https://aclanthology.org/2025.naloma-1.2/) (Reijtenbach et al., NALOMA 2025)
ACL
Rob Reijtenbach, Suzan Verberne, and Gijs Wijnholds. 2025. Dataset Creation for Visual Entailment using Generative AI. In Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA), pages 8–17, Bochum, Germany. Association for Computational Linguistics.