@inproceedings{fankhauser-kupietz-2022-count,
title = "Count-Based and Predictive Language Models for Exploring {D}e{R}e{K}o",
author = "Fankhauser, Peter and
Kupietz, Marc",
editor = {Banski, Piotr and
Barbaresi, Adrien and
Clematide, Simon and
Kupietz, Marc and
L{\"u}ngen, Harald},
booktitle = "Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-10)",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.cmlc-1.5",
pages = "27--31",
abstract = "We present the use of count-based and predictive language models for exploring language use in the German Reference Corpus DeReKo. For collocation analysis along the syntagmatic axis we employ traditional association measures based on co-occurrence counts as well as predictive association measures derived from the output weights of skipgram word embeddings. For inspecting the semantic neighbourhood of words along the paradigmatic axis we visualize the high dimensional word embeddings in two dimensions using t-stochastic neighbourhood embeddings. Together, these visualizations provide a complementary, explorative approach to analysing very large corpora in addition to corpus querying. Moreover, we discuss count-based and predictive models w.r.t. scalability and maintainability in very large corpora.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fankhauser-kupietz-2022-count">
<titleInfo>
<title>Count-Based and Predictive Language Models for Exploring DeReKo</title>
</titleInfo>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Fankhauser</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Kupietz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Piotr</namePart>
<namePart type="family">Banski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Barbaresi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Clematide</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Kupietz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harald</namePart>
<namePart type="family">Lüngen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present the use of count-based and predictive language models for exploring language use in the German Reference Corpus DeReKo. For collocation analysis along the syntagmatic axis we employ traditional association measures based on co-occurrence counts as well as predictive association measures derived from the output weights of skipgram word embeddings. For inspecting the semantic neighbourhood of words along the paradigmatic axis we visualize the high dimensional word embeddings in two dimensions using t-stochastic neighbourhood embeddings. Together, these visualizations provide a complementary, explorative approach to analysing very large corpora in addition to corpus querying. Moreover, we discuss count-based and predictive models w.r.t. scalability and maintainability in very large corpora.</abstract>
<identifier type="citekey">fankhauser-kupietz-2022-count</identifier>
<location>
<url>https://aclanthology.org/2022.cmlc-1.5</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>27</start>
<end>31</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Count-Based and Predictive Language Models for Exploring DeReKo
%A Fankhauser, Peter
%A Kupietz, Marc
%Y Banski, Piotr
%Y Barbaresi, Adrien
%Y Clematide, Simon
%Y Kupietz, Marc
%Y Lüngen, Harald
%S Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-10)
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F fankhauser-kupietz-2022-count
%X We present the use of count-based and predictive language models for exploring language use in the German Reference Corpus DeReKo. For collocation analysis along the syntagmatic axis we employ traditional association measures based on co-occurrence counts as well as predictive association measures derived from the output weights of skipgram word embeddings. For inspecting the semantic neighbourhood of words along the paradigmatic axis we visualize the high dimensional word embeddings in two dimensions using t-stochastic neighbourhood embeddings. Together, these visualizations provide a complementary, explorative approach to analysing very large corpora in addition to corpus querying. Moreover, we discuss count-based and predictive models w.r.t. scalability and maintainability in very large corpora.
%U https://aclanthology.org/2022.cmlc-1.5
%P 27-31
Markdown (Informal)
[Count-Based and Predictive Language Models for Exploring DeReKo](https://aclanthology.org/2022.cmlc-1.5) (Fankhauser & Kupietz, CMLC 2022)
ACL