BibTeX
@article{laha-etal-2019-scalable,
title = "Scalable Micro-planned Generation of Discourse from Structured Data",
author = "Laha, Anirban and
Jain, Parag and
Mishra, Abhijit and
Sankaranarayanan, Karthik",
journal = "Computational Linguistics",
volume = "45",
number = "4",
month = dec,
year = "2019",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/J19-4005/",
doi = "10.1162/coli_a_00363",
pages = "737--763",
abstract = "We present a framework for generating natural language description from structured data such as tables; the problem comes under the category of data-to-text natural language generation (NLG). Modern data-to-text NLG systems typically use end-to-end statistical and neural architectures that learn from a limited amount of task-specific labeled data, and therefore exhibit limited scalability, domain-adaptability, and interpretability. Unlike these systems, ours is a modular, pipeline-based approach, and does not require task-specific parallel data. Rather, it relies on monolingual corpora and basic off-the-shelf NLP tools. This makes our system more scalable and easily adaptable to newer domains. Our system utilizes a three-staged pipeline that: (i) converts entries in the structured data to canonical form, (ii) generates simple sentences for each atomic entry in the canonicalized representation, and (iii) combines the sentences to produce a coherent, fluent, and adequate paragraph description through sentence compounding and co-reference replacement modules. Experiments on a benchmark mixed-domain data set curated for paragraph description from tables reveals the superiority of our system over existing data-to-text approaches. We also demonstrate the robustness of our system in accepting other popular data sets covering diverse data types such as knowledge graphs and key-value maps."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="laha-etal-2019-scalable">
<titleInfo>
<title>Scalable Micro-planned Generation of Discourse from Structured Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anirban</namePart>
<namePart type="family">Laha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parag</namePart>
<namePart type="family">Jain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhijit</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karthik</namePart>
<namePart type="family">Sankaranarayanan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>We present a framework for generating natural language description from structured data such as tables; the problem comes under the category of data-to-text natural language generation (NLG). Modern data-to-text NLG systems typically use end-to-end statistical and neural architectures that learn from a limited amount of task-specific labeled data, and therefore exhibit limited scalability, domain-adaptability, and interpretability. Unlike these systems, ours is a modular, pipeline-based approach, and does not require task-specific parallel data. Rather, it relies on monolingual corpora and basic off-the-shelf NLP tools. This makes our system more scalable and easily adaptable to newer domains. Our system utilizes a three-staged pipeline that: (i) converts entries in the structured data to canonical form, (ii) generates simple sentences for each atomic entry in the canonicalized representation, and (iii) combines the sentences to produce a coherent, fluent, and adequate paragraph description through sentence compounding and co-reference replacement modules. Experiments on a benchmark mixed-domain data set curated for paragraph description from tables reveal the superiority of our system over existing data-to-text approaches. We also demonstrate the robustness of our system in accepting other popular data sets covering diverse data types such as knowledge graphs and key-value maps.</abstract>
<identifier type="citekey">laha-etal-2019-scalable</identifier>
<identifier type="doi">10.1162/coli_a_00363</identifier>
<location>
<url>https://aclanthology.org/J19-4005/</url>
</location>
<part>
<date>2019-12</date>
<detail type="volume"><number>45</number></detail>
<detail type="issue"><number>4</number></detail>
<extent unit="page">
<start>737</start>
<end>763</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Journal Article
%T Scalable Micro-planned Generation of Discourse from Structured Data
%A Laha, Anirban
%A Jain, Parag
%A Mishra, Abhijit
%A Sankaranarayanan, Karthik
%J Computational Linguistics
%D 2019
%8 December
%V 45
%N 4
%I MIT Press
%C Cambridge, MA
%F laha-etal-2019-scalable
%X We present a framework for generating natural language description from structured data such as tables; the problem comes under the category of data-to-text natural language generation (NLG). Modern data-to-text NLG systems typically use end-to-end statistical and neural architectures that learn from a limited amount of task-specific labeled data, and therefore exhibit limited scalability, domain-adaptability, and interpretability. Unlike these systems, ours is a modular, pipeline-based approach, and does not require task-specific parallel data. Rather, it relies on monolingual corpora and basic off-the-shelf NLP tools. This makes our system more scalable and easily adaptable to newer domains. Our system utilizes a three-staged pipeline that: (i) converts entries in the structured data to canonical form, (ii) generates simple sentences for each atomic entry in the canonicalized representation, and (iii) combines the sentences to produce a coherent, fluent, and adequate paragraph description through sentence compounding and co-reference replacement modules. Experiments on a benchmark mixed-domain data set curated for paragraph description from tables reveal the superiority of our system over existing data-to-text approaches. We also demonstrate the robustness of our system in accepting other popular data sets covering diverse data types such as knowledge graphs and key-value maps.
%R 10.1162/coli_a_00363
%U https://aclanthology.org/J19-4005/
%U https://doi.org/10.1162/coli_a_00363
%P 737-763
Markdown (Informal)
[Scalable Micro-planned Generation of Discourse from Structured Data](https://aclanthology.org/J19-4005/) (Laha et al., CL 2019)
ACL
Anirban Laha, Parag Jain, Abhijit Mishra, and Karthik Sankaranarayanan. 2019. Scalable Micro-planned Generation of Discourse from Structured Data. Computational Linguistics, 45(4):737–763.
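
The abstract above describes a three-stage micro-planning pipeline (canonicalization, simple-sentence generation, sentence compounding with co-reference replacement). As a purely illustrative sketch of those stage interfaces, not the authors' implementation, and with all names, templates, and heuristics below hypothetical, it could look roughly like this in Python:

```python
# Illustrative sketch only -- not the paper's code. It mirrors the three
# stages named in the abstract: (i) canonicalize structured entries,
# (ii) generate a simple sentence per atomic entry, (iii) fuse the sentences
# into a paragraph via compounding and co-reference replacement.
# All names, templates, and heuristics here are hypothetical.
from __future__ import annotations

from dataclasses import dataclass


@dataclass
class Entry:
    """One atomic (subject, attribute, value) fact in canonical form."""
    subject: str
    attribute: str
    value: str


def canonicalize(record: dict[str, str], subject: str) -> list[Entry]:
    """Stage (i): flatten a key-value record into atomic canonical entries."""
    return [
        Entry(subject, attr.lower().replace("_", " "), value)
        for attr, value in record.items()
    ]


def realize(entry: Entry) -> str:
    """Stage (ii): naive template realization of one simple sentence."""
    return f"{entry.subject}'s {entry.attribute} is {entry.value}."


def compose(sentences: list[str], subject: str) -> str:
    """Stage (iii): crude sentence compounding plus pronoun substitution,
    standing in for the paper's discourse-level modules."""
    clauses = []
    for i in range(0, len(sentences), 2):
        pair = [s.rstrip(".") for s in sentences[i:i + 2]]
        clause = " and ".join(pair) + "."
        if clauses:  # after the first clause, refer back with a pronoun
            clause = clause.replace(f"{subject}'s", "Its")
        clauses.append(clause)
    return " ".join(clauses)


if __name__ == "__main__":
    record = {"capital": "Canberra", "population": "25.7 million",
              "currency": "Australian dollar"}
    entries = canonicalize(record, "Australia")
    print(compose([realize(e) for e in entries], "Australia"))
    # -> Australia's capital is Canberra and Australia's population is
    #    25.7 million. Its currency is Australian dollar.
```

This toy version uses fixed string templates and a single pronoun rule; the paper's pipeline instead learns its realization and discourse modules from monolingual corpora and off-the-shelf NLP tools, which is what makes it domain-adaptable.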