@inproceedings{church-bian-2021-data,
title = "Data Collection vs. Knowledge Graph Completion: What is Needed to Improve Coverage?",
author = "Church, Kenneth and
Bian, Yuchen",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.501",
doi = "10.18653/v1/2021.emnlp-main.501",
pages = "6210--6215",
abstract = "This survey/position paper discusses ways to improve coverage of resources such as WordNet. Rapp estimated correlations, rho, between corpus statistics and pyscholinguistic norms. rho improves with quantity (corpus size) and quality (balance). 1M words is enough for simple estimates (unigram frequencies), but at least 100x more is required for good estimates of word associations and embeddings. Given such estimates, WordNet{'}s coverage is remarkable. WordNet was developed on SemCor, a small sample (200k words) from the Brown Corpus. Knowledge Graph Completion (KGC) attempts to learn missing links from subsets. But Rapp{'}s estimates of sizes suggest it would be more profitable to collect more data than to infer missing information that is not there.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="church-bian-2021-data">
<titleInfo>
<title>Data Collection vs. Knowledge Graph Completion: What is Needed to Improve Coverage?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kenneth</namePart>
<namePart type="family">Church</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuchen</namePart>
<namePart type="family">Bian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This survey/position paper discusses ways to improve coverage of resources such as WordNet. Rapp estimated correlations, rho, between corpus statistics and pyscholinguistic norms. rho improves with quantity (corpus size) and quality (balance). 1M words is enough for simple estimates (unigram frequencies), but at least 100x more is required for good estimates of word associations and embeddings. Given such estimates, WordNet’s coverage is remarkable. WordNet was developed on SemCor, a small sample (200k words) from the Brown Corpus. Knowledge Graph Completion (KGC) attempts to learn missing links from subsets. But Rapp’s estimates of sizes suggest it would be more profitable to collect more data than to infer missing information that is not there.</abstract>
<identifier type="citekey">church-bian-2021-data</identifier>
<identifier type="doi">10.18653/v1/2021.emnlp-main.501</identifier>
<location>
<url>https://aclanthology.org/2021.emnlp-main.501</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>6210</start>
<end>6215</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Data Collection vs. Knowledge Graph Completion: What is Needed to Improve Coverage?
%A Church, Kenneth
%A Bian, Yuchen
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F church-bian-2021-data
%X This survey/position paper discusses ways to improve coverage of resources such as WordNet. Rapp estimated correlations, rho, between corpus statistics and pyscholinguistic norms. rho improves with quantity (corpus size) and quality (balance). 1M words is enough for simple estimates (unigram frequencies), but at least 100x more is required for good estimates of word associations and embeddings. Given such estimates, WordNet’s coverage is remarkable. WordNet was developed on SemCor, a small sample (200k words) from the Brown Corpus. Knowledge Graph Completion (KGC) attempts to learn missing links from subsets. But Rapp’s estimates of sizes suggest it would be more profitable to collect more data than to infer missing information that is not there.
%R 10.18653/v1/2021.emnlp-main.501
%U https://aclanthology.org/2021.emnlp-main.501
%U https://doi.org/10.18653/v1/2021.emnlp-main.501
%P 6210-6215
Markdown (Informal)
[Data Collection vs. Knowledge Graph Completion: What is Needed to Improve Coverage?](https://aclanthology.org/2021.emnlp-main.501) (Church & Bian, EMNLP 2021)
ACL