@inproceedings{mtumbuka-lukasiewicz-2022-syntactically,
title = "Syntactically Rich Discriminative Training: An Effective Method for Open Information Extraction",
author = "Mtumbuka, Frank and
Lukasiewicz, Thomas",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.401",
doi = "10.18653/v1/2022.emnlp-main.401",
pages = "5972--5987",
abstract = "Open information extraction (OIE) is the task of extracting facts ''(Subject, Relation, Object){''} from natural language text. We propose several new methods for training neural OIE models in this paper. First, we propose a novel method for computing syntactically rich text embeddings using the structure of dependency trees. Second, we propose a new discriminative training approach to OIE in which tokens in the generated fact are classified as {``}real{''} or {``}fake{''}, i.e., those tokens that are in both the generated and gold tuples, and those that are only in the generated tuple but not in the gold tuple. We also address the issue of repetitive tokens in generated facts and improve the models{'} ability to generate implicit facts. Our approach reduces repetitive tokens by a factor of 23{\%}. Finally, we present paraphrased versions of the CaRB, OIE2016, and LSOIE datasets, and show that the models{'} performance substantially improves when trained on augmented datasets. Our best model beats the SOTA of IMoJIE on the recent CaRB dataset, with an improvement of 39.63{\%} in F1 score.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mtumbuka-lukasiewicz-2022-syntactically">
  <titleInfo>
    <title>Syntactically Rich Discriminative Training: An Effective Method for Open Information Extraction</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Frank</namePart>
    <namePart type="family">Mtumbuka</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Thomas</namePart>
    <namePart type="family">Lukasiewicz</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2022-12</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <relatedItem type="host">
    <titleInfo>
      <title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yoav</namePart>
      <namePart type="family">Goldberg</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zornitsa</namePart>
      <namePart type="family">Kozareva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yue</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <originInfo>
      <publisher>Association for Computational Linguistics</publisher>
      <place>
        <placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">conference publication</genre>
  </relatedItem>
  <abstract>Open information extraction (OIE) is the task of extracting facts “(Subject, Relation, Object)” from natural language text. We propose several new methods for training neural OIE models in this paper. First, we propose a novel method for computing syntactically rich text embeddings using the structure of dependency trees. Second, we propose a new discriminative training approach to OIE in which tokens in the generated fact are classified as “real” or “fake”, i.e., those tokens that are in both the generated and gold tuples, and those that are only in the generated tuple but not in the gold tuple. We also address the issue of repetitive tokens in generated facts and improve the models’ ability to generate implicit facts. Our approach reduces repetitive tokens by a factor of 23%. Finally, we present paraphrased versions of the CaRB, OIE2016, and LSOIE datasets, and show that the models’ performance substantially improves when trained on augmented datasets. Our best model beats the SOTA of IMoJIE on the recent CaRB dataset, with an improvement of 39.63% in F1 score.</abstract>
  <identifier type="citekey">mtumbuka-lukasiewicz-2022-syntactically</identifier>
  <identifier type="doi">10.18653/v1/2022.emnlp-main.401</identifier>
  <location>
    <url>https://aclanthology.org/2022.emnlp-main.401</url>
  </location>
  <part>
    <date>2022-12</date>
    <extent unit="page">
      <start>5972</start>
      <end>5987</end>
    </extent>
  </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Syntactically Rich Discriminative Training: An Effective Method for Open Information Extraction
%A Mtumbuka, Frank
%A Lukasiewicz, Thomas
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F mtumbuka-lukasiewicz-2022-syntactically
%X Open information extraction (OIE) is the task of extracting facts “(Subject, Relation, Object)” from natural language text. We propose several new methods for training neural OIE models in this paper. First, we propose a novel method for computing syntactically rich text embeddings using the structure of dependency trees. Second, we propose a new discriminative training approach to OIE in which tokens in the generated fact are classified as “real” or “fake”, i.e., those tokens that are in both the generated and gold tuples, and those that are only in the generated tuple but not in the gold tuple. We also address the issue of repetitive tokens in generated facts and improve the models’ ability to generate implicit facts. Our approach reduces repetitive tokens by a factor of 23%. Finally, we present paraphrased versions of the CaRB, OIE2016, and LSOIE datasets, and show that the models’ performance substantially improves when trained on augmented datasets. Our best model beats the SOTA of IMoJIE on the recent CaRB dataset, with an improvement of 39.63% in F1 score.
%R 10.18653/v1/2022.emnlp-main.401
%U https://aclanthology.org/2022.emnlp-main.401
%U https://doi.org/10.18653/v1/2022.emnlp-main.401
%P 5972-5987
Markdown (Informal)
[Syntactically Rich Discriminative Training: An Effective Method for Open Information Extraction](https://aclanthology.org/2022.emnlp-main.401) (Mtumbuka & Lukasiewicz, EMNLP 2022)
ACL
Frank Mtumbuka and Thomas Lukasiewicz. 2022. Syntactically Rich Discriminative Training: An Effective Method for Open Information Extraction. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 5972–5987, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
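
For readers skimming only this record, the training signal described in the abstract — labeling each token of a generated fact as "real" (also present in the gold tuple) or "fake" (present only in the generated tuple) — can be sketched in a few lines. The snippet below is a minimal illustration of that labeling step only, not the authors' implementation; the `label_tokens` helper, whitespace tokenization, and the 3-tuple fact format are assumptions made for the example.

```python
# Minimal sketch of the "real"/"fake" token labeling described in the
# abstract: each token of a generated (Subject, Relation, Object) fact is
# labeled "real" if it also appears somewhere in the gold tuple and "fake"
# otherwise. Not the paper's code; tokenization by whitespace is assumed.

def label_tokens(generated, gold):
    """Label every token of `generated` as 'real' or 'fake' w.r.t. `gold`."""
    gold_tokens = {tok for part in gold for tok in part.split()}
    return [
        (tok, "real" if tok in gold_tokens else "fake")
        for part in generated
        for tok in part.split()
    ]

if __name__ == "__main__":
    generated = ("Barack Obama", "was born in", "Hawaii in 1961")
    gold = ("Barack Obama", "was born in", "Hawaii")
    print(label_tokens(generated, gold))
    # Every token is labeled 'real' except ('1961', 'fake'), since
    # "1961" appears only in the generated tuple, not in the gold tuple.
```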