@inproceedings{tu-etal-2020-improving,
title = "Improving Joint Training of Inference Networks and Structured Prediction Energy Networks",
author = "Tu, Lifu and
Pang, Richard Yuanzhe and
Gimpel, Kevin",
booktitle = "Proceedings of the Fourth Workshop on Structured Prediction for NLP",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.spnlp-1.8",
doi = "10.18653/v1/2020.spnlp-1.8",
pages = "62--73",
abstract = "Deep energy-based models are powerful, but pose challenges for learning and inference (Belanger and McCallum, 2016). Tu and Gimpel (2018) developed an efficient framework for energy-based models by training {``}inference networks{''} to approximate structured inference instead of using gradient descent. However, their alternating optimization approach suffers from instabilities during training, requiring additional loss terms and careful hyperparameter tuning. In this paper, we contribute several strategies to stabilize and improve this joint training of energy functions and inference networks for structured prediction. We design a compound objective to jointly train both cost-augmented and test-time inference networks along with the energy function. We propose joint parameterizations for the inference networks that encourage them to capture complementary functionality during learning. We empirically validate our strategies on two sequence labeling tasks, showing easier paths to strong performance than prior work, as well as further improvements with global energy terms.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tu-etal-2020-improving">
<titleInfo>
<title>Improving Joint Training of Inference Networks and Structured Prediction Energy Networks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lifu</namePart>
<namePart type="family">Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="given">Yuanzhe</namePart>
<namePart type="family">Pang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Gimpel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Structured Prediction for NLP</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Deep energy-based models are powerful, but pose challenges for learning and inference (Belanger and McCallum, 2016). Tu and Gimpel (2018) developed an efficient framework for energy-based models by training “inference networks” to approximate structured inference instead of using gradient descent. However, their alternating optimization approach suffers from instabilities during training, requiring additional loss terms and careful hyperparameter tuning. In this paper, we contribute several strategies to stabilize and improve this joint training of energy functions and inference networks for structured prediction. We design a compound objective to jointly train both cost-augmented and test-time inference networks along with the energy function. We propose joint parameterizations for the inference networks that encourage them to capture complementary functionality during learning. We empirically validate our strategies on two sequence labeling tasks, showing easier paths to strong performance than prior work, as well as further improvements with global energy terms.</abstract>
<identifier type="citekey">tu-etal-2020-improving</identifier>
<identifier type="doi">10.18653/v1/2020.spnlp-1.8</identifier>
<location>
<url>https://aclanthology.org/2020.spnlp-1.8</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>62</start>
<end>73</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Joint Training of Inference Networks and Structured Prediction Energy Networks
%A Tu, Lifu
%A Pang, Richard Yuanzhe
%A Gimpel, Kevin
%S Proceedings of the Fourth Workshop on Structured Prediction for NLP
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F tu-etal-2020-improving
%X Deep energy-based models are powerful, but pose challenges for learning and inference (Belanger and McCallum, 2016). Tu and Gimpel (2018) developed an efficient framework for energy-based models by training “inference networks” to approximate structured inference instead of using gradient descent. However, their alternating optimization approach suffers from instabilities during training, requiring additional loss terms and careful hyperparameter tuning. In this paper, we contribute several strategies to stabilize and improve this joint training of energy functions and inference networks for structured prediction. We design a compound objective to jointly train both cost-augmented and test-time inference networks along with the energy function. We propose joint parameterizations for the inference networks that encourage them to capture complementary functionality during learning. We empirically validate our strategies on two sequence labeling tasks, showing easier paths to strong performance than prior work, as well as further improvements with global energy terms.
%R 10.18653/v1/2020.spnlp-1.8
%U https://aclanthology.org/2020.spnlp-1.8
%U https://doi.org/10.18653/v1/2020.spnlp-1.8
%P 62-73
Markdown (Informal)
[Improving Joint Training of Inference Networks and Structured Prediction Energy Networks](https://aclanthology.org/2020.spnlp-1.8) (Tu et al., spnlp 2020)
ACL
Lifu Tu, Richard Yuanzhe Pang, and Kevin Gimpel. 2020. [Improving Joint Training of Inference Networks and Structured Prediction Energy Networks](https://aclanthology.org/2020.spnlp-1.8). In *Proceedings of the Fourth Workshop on Structured Prediction for NLP*, pages 62–73, Online. Association for Computational Linguistics.