% Workshop paper record (aclanthology.org). Every string value is brace-delimited;
% month uses the predefined BibTeX macro so the style decides how it is printed.
@inproceedings{jung-etal-2022-language,
    author    = {Jung, SeongJun and
                 Choi, Woo Suk and
                 Choi, Seongho and
                 Zhang, Byoung-Tak},
    title     = {Language-agnostic Semantic Consistent Text-to-Image Generation},
    editor    = {Bugliarello, Emanuele and
                 Cheng, Kai-Wei and
                 Elliott, Desmond and
                 Gella, Spandana and
                 Kamath, Aishwarya and
                 Li, Liunian Harold and
                 Liu, Fangyu and
                 Pfeiffer, Jonas and
                 Ponti, Edoardo Maria and
                 Srinivasan, Krishna and
                 Vuli{\'c}, Ivan and
                 Yang, Yinfei and
                 Yin, Da},
    booktitle = {Proceedings of the Workshop on Multilingual Multimodal Learning},
    month     = may,
    year      = {2022},
    address   = {Dublin, Ireland and Online},
    publisher = {Association for Computational Linguistics},
    pages     = {1--5},
    doi       = {10.18653/v1/2022.mml-1.1},
    url       = {https://aclanthology.org/2022.mml-1.1/},
    abstract  = {Recent GAN-based text-to-image generation models have advanced that they can generate photo-realistic images matching semantically with descriptions. However, research on multi-lingual text-to-image generation has not been carried out yet much. There are two problems when constructing a multilingual text-to-image generation model: 1) language imbalance issue in text-to-image paired datasets and 2) generating images that have the same meaning but are semantically inconsistent with each other in texts expressed in different languages. To this end, we propose a Language-agnostic Semantic Consistent Generative Adversarial Network (LaSC-GAN) for text-to-image generation, which can generate semantically consistent images via language-agnostic text encoder and Siamese mechanism. Experiments on relatively low-resource language text-image datasets show that the model has comparable generation quality as images generated by high-resource language text, and generates semantically consistent images for texts with the same meaning even in different languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey jung-etal-2022-language.
     The paper itself is described directly under <mods>; the proceedings
     volume it appears in (title, editors, publisher, place) is described by
     the nested <relatedItem type="host">. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="jung-etal-2022-language">
    <titleInfo>
      <title>Language-agnostic Semantic Consistent Text-to-Image Generation</title>
    </titleInfo>

    <!-- Paper authors -->
    <name type="personal">
      <namePart type="given">SeongJun</namePart>
      <namePart type="family">Jung</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Woo</namePart>
      <namePart type="given">Suk</namePart>
      <namePart type="family">Choi</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Seongho</namePart>
      <namePart type="family">Choi</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Byoung-Tak</namePart>
      <namePart type="family">Zhang</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>

    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume and its editors -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Workshop on Multilingual Multimodal Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Emanuele</namePart>
        <namePart type="family">Bugliarello</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Kai-Wei</namePart>
        <namePart type="family">Cheng</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Desmond</namePart>
        <namePart type="family">Elliott</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Spandana</namePart>
        <namePart type="family">Gella</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Aishwarya</namePart>
        <namePart type="family">Kamath</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Liunian</namePart>
        <namePart type="given">Harold</namePart>
        <namePart type="family">Li</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Fangyu</namePart>
        <namePart type="family">Liu</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Jonas</namePart>
        <namePart type="family">Pfeiffer</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Edoardo</namePart>
        <namePart type="given">Maria</namePart>
        <namePart type="family">Ponti</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Krishna</namePart>
        <namePart type="family">Srinivasan</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Ivan</namePart>
        <namePart type="family">Vulić</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Yinfei</namePart>
        <namePart type="family">Yang</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Da</namePart>
        <namePart type="family">Yin</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland and Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>Recent GAN-based text-to-image generation models have advanced that they can generate photo-realistic images matching semantically with descriptions. However, research on multi-lingual text-to-image generation has not been carried out yet much. There are two problems when constructing a multilingual text-to-image generation model: 1) language imbalance issue in text-to-image paired datasets and 2) generating images that have the same meaning but are semantically inconsistent with each other in texts expressed in different languages. To this end, we propose a Language-agnostic Semantic Consistent Generative Adversarial Network (LaSC-GAN) for text-to-image generation, which can generate semantically consistent images via language-agnostic text encoder and Siamese mechanism. Experiments on relatively low-resource language text-image datasets show that the model has comparable generation quality as images generated by high-resource language text, and generates semantically consistent images for texts with the same meaning even in different languages.</abstract>

    <!-- Identifiers and location of the paper -->
    <identifier type="citekey">jung-etal-2022-language</identifier>
    <identifier type="doi">10.18653/v1/2022.mml-1.1</identifier>
    <location>
      <url>https://aclanthology.org/2022.mml-1.1/</url>
    </location>

    <!-- Position of the paper within the host volume -->
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>1</start>
        <end>5</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Language-agnostic Semantic Consistent Text-to-Image Generation
%A Jung, SeongJun
%A Choi, Woo Suk
%A Choi, Seongho
%A Zhang, Byoung-Tak
%Y Bugliarello, Emanuele
%Y Cheng, Kai-Wei
%Y Elliott, Desmond
%Y Gella, Spandana
%Y Kamath, Aishwarya
%Y Li, Liunian Harold
%Y Liu, Fangyu
%Y Pfeiffer, Jonas
%Y Ponti, Edoardo Maria
%Y Srinivasan, Krishna
%Y Vulić, Ivan
%Y Yang, Yinfei
%Y Yin, Da
%S Proceedings of the Workshop on Multilingual Multimodal Learning
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland and Online
%F jung-etal-2022-language
%X Recent GAN-based text-to-image generation models have advanced that they can generate photo-realistic images matching semantically with descriptions. However, research on multi-lingual text-to-image generation has not been carried out yet much. There are two problems when constructing a multilingual text-to-image generation model: 1) language imbalance issue in text-to-image paired datasets and 2) generating images that have the same meaning but are semantically inconsistent with each other in texts expressed in different languages. To this end, we propose a Language-agnostic Semantic Consistent Generative Adversarial Network (LaSC-GAN) for text-to-image generation, which can generate semantically consistent images via language-agnostic text encoder and Siamese mechanism. Experiments on relatively low-resource language text-image datasets show that the model has comparable generation quality as images generated by high-resource language text, and generates semantically consistent images for texts with the same meaning even in different languages.
%R 10.18653/v1/2022.mml-1.1
%U https://aclanthology.org/2022.mml-1.1/
%U https://doi.org/10.18653/v1/2022.mml-1.1
%P 1-5
Markdown (Informal)
[Language-agnostic Semantic Consistent Text-to-Image Generation](https://aclanthology.org/2022.mml-1.1/) (Jung et al., MML 2022)
ACL