@inproceedings{takada-etal-2024-direct,
title = "Direct Metric Optimization for Image Captioning through Reward-Weighted Augmented Data Utilization",
author = "Takada, Takumi and
Suzuki, Yuma and
Takushima, Hiroki and
Tanoue, Hayato and
Sato, Haruki and
Kumar, Aiswariya and
Nishihara, Hiroki and
Hori, Takayuki and
Ueki, Kazuya",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.453/",
doi = "10.18653/v1/2024.acl-long.453",
pages = "8333--8346",
abstract = "While image captioning is an essential task for vision-language models (VLMs), the disconnect between the learning objective and the final performance metrics of VLMs complicates their training and optimization. Reinforcement learning (RL) can directly optimize such metrics, but its significant computational cost makes it difficult to apply to recent large-scale VLMs. In this paper, we propose Direct Metric Optimization (DMO), a lightweight training method that directly optimizes the final metric. We replace the computationally expensive exploration process of RL with offline, diverse text data augmentation and show that self-supervised training on reward-weighted augmented data leads to direct and stable metric optimization. Our experiments demonstrate that DMO achieves performance comparable to that of the state-of-the-art RL method while requiring hundreds of times fewer model forward iterations and far less computation time. This suggests that DMO is a promising alternative for metric optimization in the era of large-scale VLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="takada-etal-2024-direct">
<titleInfo>
<title>Direct Metric Optimization for Image Captioning through Reward-Weighted Augmented Data Utilization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Takumi</namePart>
<namePart type="family">Takada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuma</namePart>
<namePart type="family">Suzuki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroki</namePart>
<namePart type="family">Takushima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hayato</namePart>
<namePart type="family">Tanoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haruki</namePart>
<namePart type="family">Sato</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aiswariya</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroki</namePart>
<namePart type="family">Nishihara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Takayuki</namePart>
<namePart type="family">Hori</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kazuya</namePart>
<namePart type="family">Ueki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While image captioning is an essential task for vision-language models (VLMs), the disconnect between the learning objective and the final performance metrics of VLMs complicates their training and optimization. Reinforcement learning (RL) can directly optimize such metrics, but its significant computational cost makes it difficult to apply to recent large-scale VLMs. In this paper, we propose Direct Metric Optimization (DMO), a lightweight training method that directly optimizes the final metric. We replace the computationally expensive exploration process of RL with offline, diverse text data augmentation and show that self-supervised training on reward-weighted augmented data leads to direct and stable metric optimization. Our experiments demonstrate that DMO achieves performance comparable to that of the state-of-the-art RL method while requiring hundreds of times fewer model forward iterations and far less computation time. This suggests that DMO is a promising alternative for metric optimization in the era of large-scale VLMs.</abstract>
<identifier type="citekey">takada-etal-2024-direct</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.453</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.453/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>8333</start>
<end>8346</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Direct Metric Optimization for Image Captioning through Reward-Weighted Augmented Data Utilization
%A Takada, Takumi
%A Suzuki, Yuma
%A Takushima, Hiroki
%A Tanoue, Hayato
%A Sato, Haruki
%A Kumar, Aiswariya
%A Nishihara, Hiroki
%A Hori, Takayuki
%A Ueki, Kazuya
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F takada-etal-2024-direct
%X While image captioning is an essential task for vision-language models (VLMs), the disconnect between the learning objective and the final performance metrics of VLMs complicates their training and optimization. Reinforcement learning (RL) can directly optimize such metrics, but its significant computational cost makes it difficult to apply to recent large-scale VLMs. In this paper, we propose Direct Metric Optimization (DMO), a lightweight training method that directly optimizes the final metric. We replace the computationally expensive exploration process of RL with offline, diverse text data augmentation and show that self-supervised training on reward-weighted augmented data leads to direct and stable metric optimization. Our experiments demonstrate that DMO achieves performance comparable to that of the state-of-the-art RL method while requiring hundreds of times fewer model forward iterations and far less computation time. This suggests that DMO is a promising alternative for metric optimization in the era of large-scale VLMs.
%R 10.18653/v1/2024.acl-long.453
%U https://aclanthology.org/2024.luhme-long.453/
%U https://doi.org/10.18653/v1/2024.acl-long.453
%P 8333-8346
Markdown (Informal)
[Direct Metric Optimization for Image Captioning through Reward-Weighted Augmented Data Utilization](https://aclanthology.org/2024.luhme-long.453/) (Takada et al., ACL 2024)
ACL
Takumi Takada, Yuma Suzuki, Hiroki Takushima, Hayato Tanoue, Haruki Sato, Aiswariya Kumar, Hiroki Nishihara, Takayuki Hori, and Kazuya Ueki. 2024. Direct Metric Optimization for Image Captioning through Reward-Weighted Augmented Data Utilization. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 8333–8346, Bangkok, Thailand. Association for Computational Linguistics.
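
The abstract sketches the core recipe behind DMO: generate diverse caption candidates offline, score each one with the target metric as a reward, and train the captioner with a reward-weighted self-supervised loss instead of online RL exploration. The snippet below is a minimal, illustrative sketch of that reward-weighting step only; the toy captioner, random image features, softmax-normalized rewards, and all names in it are assumptions for demonstration, not the authors' implementation or data pipeline.

```python
# Minimal sketch (not the authors' code) of training on reward-weighted,
# offline-augmented caption data: each augmented caption's cross-entropy
# loss is weighted by its metric-based reward (e.g., a CIDEr-like score).
import torch
import torch.nn as nn
import torch.nn.functional as F

VOCAB_SIZE = 1000  # assumed toy vocabulary


class ToyCaptioner(nn.Module):
    """Stand-in for a captioning head conditioned on image features."""

    def __init__(self, feat_dim: int = 512, hidden: int = 256):
        super().__init__()
        self.proj = nn.Linear(feat_dim, hidden)
        self.embed = nn.Embedding(VOCAB_SIZE, hidden)
        self.rnn = nn.GRU(hidden, hidden, batch_first=True)
        self.out = nn.Linear(hidden, VOCAB_SIZE)

    def forward(self, image_feats, captions):
        # Condition the decoder's initial state on the image features.
        h0 = torch.tanh(self.proj(image_feats)).unsqueeze(0)
        emb = self.embed(captions[:, :-1])            # teacher-forcing input
        hidden, _ = self.rnn(emb, h0)
        return self.out(hidden)                       # next-token logits


def reward_weighted_loss(logits, captions, rewards):
    """Per-caption cross-entropy, weighted by each caption's reward."""
    targets = captions[:, 1:]
    ce = F.cross_entropy(
        logits.reshape(-1, VOCAB_SIZE), targets.reshape(-1), reduction="none"
    ).view(targets.shape).mean(dim=1)                 # loss per caption
    weights = torch.softmax(rewards, dim=0)           # assumed normalization
    return (weights * ce).sum()


if __name__ == "__main__":
    model = ToyCaptioner()
    image_feats = torch.randn(4, 512)                 # toy image features
    captions = torch.randint(0, VOCAB_SIZE, (4, 12))  # augmented captions
    rewards = torch.tensor([0.9, 0.4, 0.7, 0.1])      # metric scores (assumed)
    loss = reward_weighted_loss(model(image_feats, captions), captions, rewards)
    loss.backward()
    print(f"reward-weighted loss: {loss.item():.4f}")
```

Normalizing the rewards over the batch (here with a softmax) is one common way to keep the weighted loss on a stable scale; the weighting scheme, augmentation method, and metric used in the paper may differ from this sketch.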