@article{nuyts-etal-2024-explicitly,
title = "Explicitly Representing Syntax Improves Sentence-to-Layout Prediction of Unexpected Situations",
author = "Nuyts, Wolf and
Cartuyvels, Ruben and
Moens, Marie-Francine",
journal = "Transactions of the Association for Computational Linguistics",
volume = "12",
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.tacl-1.15",
doi = "10.1162/tacl_a_00643",
pages = "264--282",
abstract = "Recognizing visual entities in a natural language sentence and arranging them in a 2D spatial layout require a compositional understanding of language and space. This task of layout prediction is valuable in text-to-image synthesis as it allows localized and controlled in-painting of the image. In this comparative study it is shown that we can predict layouts from language representations that implicitly or explicitly encode sentence syntax, if the sentences mention similar entity-relationships to the ones seen during training. To test compositional understanding, we collect a test set of grammatically correct sentences and layouts describing compositions of entities and relations that unlikely have been seen during training. Performance on this test set substantially drops, showing that current models rely on correlations in the training data and have difficulties in understanding the structure of the input sentences. We propose a novel structural loss function that better enforces the syntactic structure of the input sentence and show large performance gains in the task of 2D spatial layout prediction conditioned on text. The loss has the potential to be used in other generation tasks where a tree-like structure underlies the conditioning modality. Code, trained models, and the USCOCO evaluation set are available via Github.1",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nuyts-etal-2024-explicitly">
<titleInfo>
<title>Explicitly Representing Syntax Improves Sentence-to-Layout Prediction of Unexpected Situations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wolf</namePart>
<namePart type="family">Nuyts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruben</namePart>
<namePart type="family">Cartuyvels</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Recognizing visual entities in a natural language sentence and arranging them in a 2D spatial layout require a compositional understanding of language and space. This task of layout prediction is valuable in text-to-image synthesis as it allows localized and controlled in-painting of the image. In this comparative study it is shown that we can predict layouts from language representations that implicitly or explicitly encode sentence syntax, if the sentences mention similar entity-relationships to the ones seen during training. To test compositional understanding, we collect a test set of grammatically correct sentences and layouts describing compositions of entities and relations that unlikely have been seen during training. Performance on this test set substantially drops, showing that current models rely on correlations in the training data and have difficulties in understanding the structure of the input sentences. We propose a novel structural loss function that better enforces the syntactic structure of the input sentence and show large performance gains in the task of 2D spatial layout prediction conditioned on text. The loss has the potential to be used in other generation tasks where a tree-like structure underlies the conditioning modality. Code, trained models, and the USCOCO evaluation set are available via Github.1</abstract>
<identifier type="citekey">nuyts-etal-2024-explicitly</identifier>
<identifier type="doi">10.1162/tacl_a_00643</identifier>
<location>
<url>https://aclanthology.org/2024.tacl-1.15</url>
</location>
<part>
<date>2024</date>
<detail type="volume"><number>12</number></detail>
<extent unit="page">
<start>264</start>
<end>282</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Explicitly Representing Syntax Improves Sentence-to-Layout Prediction of Unexpected Situations
%A Nuyts, Wolf
%A Cartuyvels, Ruben
%A Moens, Marie-Francine
%J Transactions of the Association for Computational Linguistics
%D 2024
%V 12
%I MIT Press
%C Cambridge, MA
%F nuyts-etal-2024-explicitly
%X Recognizing visual entities in a natural language sentence and arranging them in a 2D spatial layout require a compositional understanding of language and space. This task of layout prediction is valuable in text-to-image synthesis as it allows localized and controlled in-painting of the image. In this comparative study it is shown that we can predict layouts from language representations that implicitly or explicitly encode sentence syntax, if the sentences mention similar entity-relationships to the ones seen during training. To test compositional understanding, we collect a test set of grammatically correct sentences and layouts describing compositions of entities and relations that unlikely have been seen during training. Performance on this test set substantially drops, showing that current models rely on correlations in the training data and have difficulties in understanding the structure of the input sentences. We propose a novel structural loss function that better enforces the syntactic structure of the input sentence and show large performance gains in the task of 2D spatial layout prediction conditioned on text. The loss has the potential to be used in other generation tasks where a tree-like structure underlies the conditioning modality. Code, trained models, and the USCOCO evaluation set are available via Github.1
%R 10.1162/tacl_a_00643
%U https://aclanthology.org/2024.tacl-1.15
%U https://doi.org/10.1162/tacl_a_00643
%P 264-282
Markdown (Informal)
[Explicitly Representing Syntax Improves Sentence-to-Layout Prediction of Unexpected Situations](https://aclanthology.org/2024.tacl-1.15) (Nuyts et al., TACL 2024)
ACL