@inproceedings{alacam-etal-2024-wikiscenes,
title = "{W}iki{S}cenes with Descriptions: Aligning Paragraphs and Sentences with Images in {W}ikipedia Articles",
author = {Ala{\c{c}}am, {\"O}zge and
Utescher, Ronja and
Gr{\"o}nner, Hannes and
Sieker, Judith and
Zarrie{\ss}, Sina},
editor = "Bollegala, Danushka and
Shwartz, Vered",
booktitle = "Proceedings of the 13th Joint Conference on Lexical and Computational Semantics (*SEM 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.starsem-1.8",
doi = "10.18653/v1/2024.starsem-1.8",
pages = "93--105",
abstract = "Research in Language {\&} Vision rarely uses naturally occurring multimodal documents as Wikipedia articles, since they feature complex image-text relations and implicit image-text alignments. In this paper, we provide one of the first datasets that provides ground-truth annotations of image-text alignments in multi-paragraph multi-image articles. The dataset can be used to study phenomena of visual language grounding in longer documents and assess retrieval capabilities of language models trained on, e.g., captioning data. Our analyses show that there are systematic linguistic differences between the image captions and descriptive sentences from the article{'}s text and that intra-document retrieval is a challenging task for state-of-the-art models in L{\&}V (CLIP, VILT, MCSE).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alacam-etal-2024-wikiscenes">
<titleInfo>
<title>WikiScenes with Descriptions: Aligning Paragraphs and Sentences with Images in Wikipedia Articles</title>
</titleInfo>
<name type="personal">
<namePart type="given">Özge</namePart>
<namePart type="family">Alaçam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronja</namePart>
<namePart type="family">Utescher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannes</namePart>
<namePart type="family">Grönner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Judith</namePart>
<namePart type="family">Sieker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sina</namePart>
<namePart type="family">Zarrieß</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 13th Joint Conference on Lexical and Computational Semantics (*SEM 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Danushka</namePart>
<namePart type="family">Bollegala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vered</namePart>
<namePart type="family">Shwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Research in Language &amp; Vision rarely uses naturally occurring multimodal documents as Wikipedia articles, since they feature complex image-text relations and implicit image-text alignments. In this paper, we provide one of the first datasets that provides ground-truth annotations of image-text alignments in multi-paragraph multi-image articles. The dataset can be used to study phenomena of visual language grounding in longer documents and assess retrieval capabilities of language models trained on, e.g., captioning data. Our analyses show that there are systematic linguistic differences between the image captions and descriptive sentences from the article’s text and that intra-document retrieval is a challenging task for state-of-the-art models in L&amp;V (CLIP, VILT, MCSE).</abstract>
<identifier type="citekey">alacam-etal-2024-wikiscenes</identifier>
<identifier type="doi">10.18653/v1/2024.starsem-1.8</identifier>
<location>
<url>https://aclanthology.org/2024.starsem-1.8</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>93</start>
<end>105</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T WikiScenes with Descriptions: Aligning Paragraphs and Sentences with Images in Wikipedia Articles
%A Alaçam, Özge
%A Utescher, Ronja
%A Grönner, Hannes
%A Sieker, Judith
%A Zarrieß, Sina
%Y Bollegala, Danushka
%Y Shwartz, Vered
%S Proceedings of the 13th Joint Conference on Lexical and Computational Semantics (*SEM 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F alacam-etal-2024-wikiscenes
%X Research in Language & Vision rarely uses naturally occurring multimodal documents as Wikipedia articles, since they feature complex image-text relations and implicit image-text alignments. In this paper, we provide one of the first datasets that provides ground-truth annotations of image-text alignments in multi-paragraph multi-image articles. The dataset can be used to study phenomena of visual language grounding in longer documents and assess retrieval capabilities of language models trained on, e.g., captioning data. Our analyses show that there are systematic linguistic differences between the image captions and descriptive sentences from the article’s text and that intra-document retrieval is a challenging task for state-of-the-art models in L&V (CLIP, VILT, MCSE).
%R 10.18653/v1/2024.starsem-1.8
%U https://aclanthology.org/2024.starsem-1.8
%U https://doi.org/10.18653/v1/2024.starsem-1.8
%P 93-105
Markdown (Informal)
[WikiScenes with Descriptions: Aligning Paragraphs and Sentences with Images in Wikipedia Articles](https://aclanthology.org/2024.starsem-1.8) (Alaçam et al., *SEM 2024)
ACL