@inproceedings{guhan-etal-2024-tame,
title = "{TAME}-{RD}: Text Assisted Replication of Image Multi-Adjustments for Reverse Designing",
author = "Guhan, Pooja and
Bhattacharya, Uttaran and
Sarkhel, Somdeb and
Azizi, Vahid and
Chen, Xiang and
Mitra, Saayan and
Bera, Aniket and
Manocha, Dinesh",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.637",
doi = "10.18653/v1/2024.findings-acl.637",
pages = "10710--10727",
abstract = "Given a source and its edited version performed based on human instructions in natural language, how do we extract the underlying edit operations, to automatically replicate similar edits on other images? This is the problem of reverse designing, and we present TAME-RD, a model to solve this problem. TAME-RD automatically learns from the complex interplay of image editing operations and the natural language instructions to learn fully specified edit operations. It predicts both the underlying image edit operations as discrete categories and their corresponding parameter values in the continuous space.We accomplish this by mapping together the contextual information from the natural language text and the structural differences between the corresponding source and edited images using the concept of pre-post effect. We demonstrate the efficiency of our network through quantitative evaluations on multiple datasets. We observe improvements of 6-10{\%} on various accuracy metrics and 1.01X-4X on the RMSE score and the concordance correlation coefficient for the corresponding parameter values on the benchmark GIER dataset. We also introduce I-MAD, a new two-part dataset: I-MAD-Dense, a collection of approximately 100K source and edited images, together with automatically generated text instructions and annotated edit operations, and I-MAD-Pro, consisting of about 1.6K source and edited images, together with text instructions and annotated edit operations provided by professional editors. On our dataset, we observe absolute improvements of 1-10{\%} on the accuracy metrics and 1.14X{--}5X on the RMSE score.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guhan-etal-2024-tame">
<titleInfo>
<title>TAME-RD: Text Assisted Replication of Image Multi-Adjustments for Reverse Designing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pooja</namePart>
<namePart type="family">Guhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Uttaran</namePart>
<namePart type="family">Bhattacharya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Somdeb</namePart>
<namePart type="family">Sarkhel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vahid</namePart>
<namePart type="family">Azizi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saayan</namePart>
<namePart type="family">Mitra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aniket</namePart>
<namePart type="family">Bera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dinesh</namePart>
<namePart type="family">Manocha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Given a source and its edited version performed based on human instructions in natural language, how do we extract the underlying edit operations, to automatically replicate similar edits on other images? This is the problem of reverse designing, and we present TAME-RD, a model to solve this problem. TAME-RD automatically learns from the complex interplay of image editing operations and the natural language instructions to learn fully specified edit operations. It predicts both the underlying image edit operations as discrete categories and their corresponding parameter values in the continuous space. We accomplish this by mapping together the contextual information from the natural language text and the structural differences between the corresponding source and edited images using the concept of pre-post effect. We demonstrate the efficiency of our network through quantitative evaluations on multiple datasets. We observe improvements of 6–10% on various accuracy metrics and 1.01X–4X on the RMSE score and the concordance correlation coefficient for the corresponding parameter values on the benchmark GIER dataset. We also introduce I-MAD, a new two-part dataset: I-MAD-Dense, a collection of approximately 100K source and edited images, together with automatically generated text instructions and annotated edit operations, and I-MAD-Pro, consisting of about 1.6K source and edited images, together with text instructions and annotated edit operations provided by professional editors. On our dataset, we observe absolute improvements of 1–10% on the accuracy metrics and 1.14X–5X on the RMSE score.</abstract>
<identifier type="citekey">guhan-etal-2024-tame</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.637</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.637</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>10710</start>
<end>10727</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TAME-RD: Text Assisted Replication of Image Multi-Adjustments for Reverse Designing
%A Guhan, Pooja
%A Bhattacharya, Uttaran
%A Sarkhel, Somdeb
%A Azizi, Vahid
%A Chen, Xiang
%A Mitra, Saayan
%A Bera, Aniket
%A Manocha, Dinesh
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F guhan-etal-2024-tame
%X Given a source and its edited version performed based on human instructions in natural language, how do we extract the underlying edit operations, to automatically replicate similar edits on other images? This is the problem of reverse designing, and we present TAME-RD, a model to solve this problem. TAME-RD automatically learns from the complex interplay of image editing operations and the natural language instructions to learn fully specified edit operations. It predicts both the underlying image edit operations as discrete categories and their corresponding parameter values in the continuous space. We accomplish this by mapping together the contextual information from the natural language text and the structural differences between the corresponding source and edited images using the concept of pre-post effect. We demonstrate the efficiency of our network through quantitative evaluations on multiple datasets. We observe improvements of 6–10% on various accuracy metrics and 1.01X–4X on the RMSE score and the concordance correlation coefficient for the corresponding parameter values on the benchmark GIER dataset. We also introduce I-MAD, a new two-part dataset: I-MAD-Dense, a collection of approximately 100K source and edited images, together with automatically generated text instructions and annotated edit operations, and I-MAD-Pro, consisting of about 1.6K source and edited images, together with text instructions and annotated edit operations provided by professional editors. On our dataset, we observe absolute improvements of 1–10% on the accuracy metrics and 1.14X–5X on the RMSE score.
%R 10.18653/v1/2024.findings-acl.637
%U https://aclanthology.org/2024.findings-acl.637
%U https://doi.org/10.18653/v1/2024.findings-acl.637
%P 10710-10727
Markdown (Informal)
[TAME-RD: Text Assisted Replication of Image Multi-Adjustments for Reverse Designing](https://aclanthology.org/2024.findings-acl.637) (Guhan et al., Findings 2024)
ACL
Pooja Guhan, Uttaran Bhattacharya, Somdeb Sarkhel, Vahid Azizi, Xiang Chen, Saayan Mitra, Aniket Bera, and Dinesh Manocha. 2024. TAME-RD: Text Assisted Replication of Image Multi-Adjustments for Reverse Designing. In Findings of the Association for Computational Linguistics: ACL 2024, pages 10710–10727, Bangkok, Thailand. Association for Computational Linguistics.
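The abstract describes TAME-RD as jointly predicting discrete edit operations and their continuous parameter values from fused text and image-difference features. As a reading aid only, here is a minimal hypothetical sketch of such a dual-head design in PyTorch; it is not the authors' code, and every name and dimension (e.g. `feat_dim`, `num_ops`) is an illustrative assumption.

```python
# Hypothetical sketch, not the TAME-RD implementation: a shared trunk over a
# fused (instruction text + pre/post image difference) feature, with one head
# for discrete edit-operation categories and one for continuous parameters.
import torch
import torch.nn as nn

class DualHeadEditPredictor(nn.Module):
    def __init__(self, feat_dim: int = 512, num_ops: int = 23):  # sizes are illustrative
        super().__init__()
        # Shared trunk over the fused feature vector.
        self.trunk = nn.Sequential(nn.Linear(feat_dim, 256), nn.ReLU())
        # Discrete head: which edit operations were applied (multi-label logits).
        self.op_head = nn.Linear(256, num_ops)
        # Continuous head: a parameter value per candidate operation.
        self.param_head = nn.Linear(256, num_ops)

    def forward(self, fused_feat: torch.Tensor):
        h = self.trunk(fused_feat)
        op_logits = self.op_head(h)        # e.g. trained with BCEWithLogitsLoss
        param_values = self.param_head(h)  # e.g. trained with MSE (RMSE is reported)
        return op_logits, param_values

# Usage example with a batch of 4 fused features.
model = DualHeadEditPredictor()
ops, params = model(torch.randn(4, 512))
print(ops.shape, params.shape)  # torch.Size([4, 23]) torch.Size([4, 23])
```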