@inproceedings{doostmohammadi-kuhlmann-2022-effects,
title = "On the Effects of Video Grounding on Language Models",
author = "Doostmohammadi, Ehsan and
Kuhlmann, Marco",
booktitle = "Proceedings of the First Workshop on Performance and Interpretability Evaluations of Multimodal, Multipurpose, Massive-Scale Models",
month = oct,
year = "2022",
address = "Virtual",
publisher = "International Conference on Computational Linguistics",
url = "https://aclanthology.org/2022.mmmpie-1.1",
pages = "1--6",
abstract = "Transformer-based models trained on text and vision modalities try to improve the performance on multimodal downstream tasks or tackle the problem Transformer-based models trained on text and vision modalities try to improve the performance on multimodal downstream tasks or tackle the problem of lack of grounding, e.g., addressing issues like models{'} insufficient commonsense knowledge. While it is more straightforward to evaluate the effects of such models on multimodal tasks, such as visual question answering or image captioning, it is not as well-understood how these tasks affect the model itself, and its internal linguistic representations. In this work, we experiment with language models grounded in videos and measure the models{'} performance on predicting masked words chosen based on their \textit{imageability}. The results show that the smaller model benefits from video grounding in predicting highly imageable words, while the results for the larger model seem harder to interpret.of lack of grounding, e.g., addressing issues like models{'} insufficient commonsense knowledge. While it is more straightforward to evaluate the effects of such models on multimodal tasks, such as visual question answering or image captioning, it is not as well-understood how these tasks affect the model itself, and its internal linguistic representations. In this work, we experiment with language models grounded in videos and measure the models{'} performance on predicting masked words chosen based on their imageability. The results show that the smaller model benefits from video grounding in predicting highly imageable words, while the results for the larger model seem harder to interpret.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="doostmohammadi-kuhlmann-2022-effects">
<titleInfo>
<title>On the Effects of Video Grounding on Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Doostmohammadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Kuhlmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Performance and Interpretability Evaluations of Multimodal, Multipurpose, Massive-Scale Models</title>
</titleInfo>
<originInfo>
<publisher>International Conference on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Transformer-based models trained on text and vision modalities try to improve the performance on multimodal downstream tasks or tackle the problem Transformer-based models trained on text and vision modalities try to improve the performance on multimodal downstream tasks or tackle the problem of lack of grounding, e.g., addressing issues like models’ insufficient commonsense knowledge. While it is more straightforward to evaluate the effects of such models on multimodal tasks, such as visual question answering or image captioning, it is not as well-understood how these tasks affect the model itself, and its internal linguistic representations. In this work, we experiment with language models grounded in videos and measure the models’ performance on predicting masked words chosen based on their imageability. The results show that the smaller model benefits from video grounding in predicting highly imageable words, while the results for the larger model seem harder to interpret.of lack of grounding, e.g., addressing issues like models’ insufficient commonsense knowledge. While it is more straightforward to evaluate the effects of such models on multimodal tasks, such as visual question answering or image captioning, it is not as well-understood how these tasks affect the model itself, and its internal linguistic representations. In this work, we experiment with language models grounded in videos and measure the models’ performance on predicting masked words chosen based on their imageability. The results show that the smaller model benefits from video grounding in predicting highly imageable words, while the results for the larger model seem harder to interpret.</abstract>
<identifier type="citekey">doostmohammadi-kuhlmann-2022-effects</identifier>
<location>
<url>https://aclanthology.org/2022.mmmpie-1.1</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>1</start>
<end>6</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Effects of Video Grounding on Language Models
%A Doostmohammadi, Ehsan
%A Kuhlmann, Marco
%S Proceedings of the First Workshop on Performance and Interpretability Evaluations of Multimodal, Multipurpose, Massive-Scale Models
%D 2022
%8 October
%I International Conference on Computational Linguistics
%C Virtual
%F doostmohammadi-kuhlmann-2022-effects
%X Transformer-based models trained on text and vision modalities try to improve the performance on multimodal downstream tasks or tackle the problem Transformer-based models trained on text and vision modalities try to improve the performance on multimodal downstream tasks or tackle the problem of lack of grounding, e.g., addressing issues like models’ insufficient commonsense knowledge. While it is more straightforward to evaluate the effects of such models on multimodal tasks, such as visual question answering or image captioning, it is not as well-understood how these tasks affect the model itself, and its internal linguistic representations. In this work, we experiment with language models grounded in videos and measure the models’ performance on predicting masked words chosen based on their imageability. The results show that the smaller model benefits from video grounding in predicting highly imageable words, while the results for the larger model seem harder to interpret.of lack of grounding, e.g., addressing issues like models’ insufficient commonsense knowledge. While it is more straightforward to evaluate the effects of such models on multimodal tasks, such as visual question answering or image captioning, it is not as well-understood how these tasks affect the model itself, and its internal linguistic representations. In this work, we experiment with language models grounded in videos and measure the models’ performance on predicting masked words chosen based on their imageability. The results show that the smaller model benefits from video grounding in predicting highly imageable words, while the results for the larger model seem harder to interpret.
%U https://aclanthology.org/2022.mmmpie-1.1
%P 1-6
Markdown (Informal)
[On the Effects of Video Grounding on Language Models](https://aclanthology.org/2022.mmmpie-1.1) (Doostmohammadi & Kuhlmann, MMMPIE 2022)
ACL
- Ehsan Doostmohammadi and Marco Kuhlmann. 2022. On the Effects of Video Grounding on Language Models. In Proceedings of the First Workshop on Performance and Interpretability Evaluations of Multimodal, Multipurpose, Massive-Scale Models, pages 1–6, Virtual. International Conference on Computational Linguistics.