@inproceedings{bella-etal-2020-exploring,
title = "Exploring the Language of Data",
author = "Bella, G{\'a}bor and
Gremes, Linda and
Giunchiglia, Fausto",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.582",
doi = "10.18653/v1/2020.coling-main.582",
pages = "6638--6648",
abstract = "We set out to uncover the unique grammatical properties of an important yet so far under-researched type of natural language text: that of short labels typically found within structured datasets. We show that such labels obey a specific type of abbreviated grammar that we call the Language of Data, with properties significantly different from the kinds of text typically addressed in computational linguistics and NLP, such as {`}standard{'} written language or social media messages. We analyse orthography, parts of speech, and syntax over a large, bilingual, hand-annotated corpus of data labels collected from a variety of domains. We perform experiments on tokenisation, part-of-speech tagging, and named entity recognition over real-world structured data, demonstrating that models adapted to the Language of Data outperform those trained on standard text. These observations point in a new direction to be explored as future research, in order to develop new NLP tools and models dedicated to the Language of Data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bella-etal-2020-exploring">
<titleInfo>
<title>Exploring the Language of Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gábor</namePart>
<namePart type="family">Bella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linda</namePart>
<namePart type="family">Gremes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fausto</namePart>
<namePart type="family">Giunchiglia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We set out to uncover the unique grammatical properties of an important yet so far under-researched type of natural language text: that of short labels typically found within structured datasets. We show that such labels obey a specific type of abbreviated grammar that we call the Language of Data, with properties significantly different from the kinds of text typically addressed in computational linguistics and NLP, such as ‘standard’ written language or social media messages. We analyse orthography, parts of speech, and syntax over a large, bilingual, hand-annotated corpus of data labels collected from a variety of domains. We perform experiments on tokenisation, part-of-speech tagging, and named entity recognition over real-world structured data, demonstrating that models adapted to the Language of Data outperform those trained on standard text. These observations point in a new direction to be explored as future research, in order to develop new NLP tools and models dedicated to the Language of Data.</abstract>
<identifier type="citekey">bella-etal-2020-exploring</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.582</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.582</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>6638</start>
<end>6648</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring the Language of Data
%A Bella, Gábor
%A Gremes, Linda
%A Giunchiglia, Fausto
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F bella-etal-2020-exploring
%X We set out to uncover the unique grammatical properties of an important yet so far under-researched type of natural language text: that of short labels typically found within structured datasets. We show that such labels obey a specific type of abbreviated grammar that we call the Language of Data, with properties significantly different from the kinds of text typically addressed in computational linguistics and NLP, such as ‘standard’ written language or social media messages. We analyse orthography, parts of speech, and syntax over a large, bilingual, hand-annotated corpus of data labels collected from a variety of domains. We perform experiments on tokenisation, part-of-speech tagging, and named entity recognition over real-world structured data, demonstrating that models adapted to the Language of Data outperform those trained on standard text. These observations point in a new direction to be explored as future research, in order to develop new NLP tools and models dedicated to the Language of Data.
%R 10.18653/v1/2020.coling-main.582
%U https://aclanthology.org/2020.coling-main.582
%U https://doi.org/10.18653/v1/2020.coling-main.582
%P 6638-6648
Markdown (Informal)
[Exploring the Language of Data](https://aclanthology.org/2020.coling-main.582) (Bella et al., COLING 2020)
ACL
- Gábor Bella, Linda Gremes, and Fausto Giunchiglia. 2020. Exploring the Language of Data. In Proceedings of the 28th International Conference on Computational Linguistics, pages 6638–6648, Barcelona, Spain (Online). International Committee on Computational Linguistics.