BibTeX
@inproceedings{jafaritazehjani-etal-2019-visually,
title = "Visually grounded generation of entailments from premises",
author = "Jafaritazehjani, Somayeh and
Gatt, Albert and
Tanti, Marc",
editor = "van Deemter, Kees and
Lin, Chenghua and
Takamura, Hiroya",
booktitle = "Proceedings of the 12th International Conference on Natural Language Generation",
month = oct # "{--}" # nov,
year = "2019",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-8625",
doi = "10.18653/v1/W19-8625",
pages = "178--188",
abstract = "Natural Language Inference (NLI) is the task of determining the semantic relationship between a premise and a hypothesis. In this paper, we focus on the generation of hypotheses from premises in a multimodal setting, to generate a sentence (hypothesis) given an image and/or its description (premise) as the input. The main goals of this paper are (a) to investigate whether it is reasonable to frame NLI as a generation task; and (b) to consider the degree to which grounding textual premises in visual information is beneficial to generation. We compare different neural architectures, showing through automatic and human evaluation that entailments can indeed be generated successfully. We also show that multimodal models outperform unimodal models in this task, albeit marginally",
}
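
Since the entry above is machine-readable, it can be consumed directly from code. A minimal sketch, assuming the third-party bibtexparser package (v1.x API) and a hypothetical filename:

import bibtexparser

# Load the BibTeX record above (the filename is hypothetical).
with open("jafaritazehjani-etal-2019-visually.bib") as f:
    db = bibtexparser.load(f)

entry = db.entries[0]  # fields become a dict with lowercased keys
print(entry["title"])  # Visually grounded generation of entailments from premises
print(entry["pages"])  # 178--188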
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jafaritazehjani-etal-2019-visually">
<titleInfo>
<title>Visually grounded generation of entailments from premises</title>
</titleInfo>
<name type="personal">
<namePart type="given">Somayeh</namePart>
<namePart type="family">Jafaritazehjani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Albert</namePart>
<namePart type="family">Gatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Tanti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-oct–nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th International Conference on Natural Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kees</namePart>
<namePart type="family">van Deemter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenghua</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroya</namePart>
<namePart type="family">Takamura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Tokyo, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Natural Language Inference (NLI) is the task of determining the semantic relationship between a premise and a hypothesis. In this paper, we focus on the generation of hypotheses from premises in a multimodal setting, to generate a sentence (hypothesis) given an image and/or its description (premise) as the input. The main goals of this paper are (a) to investigate whether it is reasonable to frame NLI as a generation task; and (b) to consider the degree to which grounding textual premises in visual information is beneficial to generation. We compare different neural architectures, showing through automatic and human evaluation that entailments can indeed be generated successfully. We also show that multimodal models outperform unimodal models in this task, albeit marginally</abstract>
<identifier type="citekey">jafaritazehjani-etal-2019-visually</identifier>
<identifier type="doi">10.18653/v1/W19-8625</identifier>
<location>
<url>https://aclanthology.org/W19-8625</url>
</location>
<part>
<date>2019-oct–nov</date>
<extent unit="page">
<start>178</start>
<end>188</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Visually grounded generation of entailments from premises
%A Jafaritazehjani, Somayeh
%A Gatt, Albert
%A Tanti, Marc
%Y van Deemter, Kees
%Y Lin, Chenghua
%Y Takamura, Hiroya
%S Proceedings of the 12th International Conference on Natural Language Generation
%D 2019
%8 oct–nov
%I Association for Computational Linguistics
%C Tokyo, Japan
%F jafaritazehjani-etal-2019-visually
%X Natural Language Inference (NLI) is the task of determining the semantic relationship between a premise and a hypothesis. In this paper, we focus on the generation of hypotheses from premises in a multimodal setting: generating a sentence (hypothesis) given an image and/or its description (premise) as input. The main goals of this paper are (a) to investigate whether it is reasonable to frame NLI as a generation task; and (b) to consider the degree to which grounding textual premises in visual information is beneficial to generation. We compare different neural architectures, showing through automatic and human evaluation that entailments can indeed be generated successfully. We also show that multimodal models outperform unimodal models in this task, albeit marginally.
%R 10.18653/v1/W19-8625
%U https://aclanthology.org/W19-8625
%U https://doi.org/10.18653/v1/W19-8625
%P 178-188
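
The EndNote export above is line-oriented: each line starts with a tag (%0, %T, %A, ...) followed by a value, and repeated tags such as %A (authors) and %U (URLs) accumulate. A minimal parser sketch:

def parse_endnote(text: str) -> dict[str, list[str]]:
    """Collect an EndNote tagged record into {tag: [values]}."""
    record: dict[str, list[str]] = {}
    for line in text.strip().splitlines():
        tag, _, value = line.partition(" ")
        record.setdefault(tag, []).append(value)
    return record

# e.g. parse_endnote(record_text)["%A"]
# -> ['Jafaritazehjani, Somayeh', 'Gatt, Albert', 'Tanti, Marc']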
Markdown (Informal)
[Visually grounded generation of entailments from premises](https://aclanthology.org/W19-8625) (Jafaritazehjani et al., INLG 2019)
ACL
Somayeh Jafaritazehjani, Albert Gatt, and Marc Tanti. 2019. [Visually grounded generation of entailments from premises](https://aclanthology.org/W19-8625). In *Proceedings of the 12th International Conference on Natural Language Generation*, pages 178–188, Tokyo, Japan. Association for Computational Linguistics.
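
For orientation, the abstract describes generating a hypothesis from an image and/or its textual description. Below is a minimal PyTorch sketch of one such multimodal encoder-decoder; it is an illustration under assumed dimensions (e.g. 2048-d CNN image features), not the authors' implementation, which compares several architectures:

import torch
import torch.nn as nn

class MultimodalEntailmentGenerator(nn.Module):
    """Illustrative premise+image -> hypothesis generator (hypothetical)."""
    def __init__(self, vocab_size, emb_dim=256, hid_dim=512, img_dim=2048):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.encoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.img_proj = nn.Linear(img_dim, hid_dim)  # project CNN features
        self.fuse = nn.Linear(2 * hid_dim, hid_dim)  # merge text and image
        self.decoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.out = nn.Linear(hid_dim, vocab_size)

    def forward(self, premise_ids, image_feats, hypothesis_ids):
        # Encode the textual premise; keep only the final hidden state.
        _, h_text = self.encoder(self.embed(premise_ids))  # (1, B, H)
        h_img = self.img_proj(image_feats).unsqueeze(0)    # (1, B, H)
        # Fuse both modalities into the decoder's initial state.
        h0 = torch.tanh(self.fuse(torch.cat([h_text, h_img], dim=-1)))
        dec_out, _ = self.decoder(self.embed(hypothesis_ids), h0)
        return self.out(dec_out)  # per-token vocabulary logits

Dropping the image branch (initializing the decoder from h_text alone) yields the kind of unimodal baseline the abstract compares against.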