@inproceedings{kreutz-daelemans-2020-streaming,
title = "Streaming Language-Specific {T}witter Data with Optimal Keywords",
author = "Kreutz, Tim and
Daelemans, Walter",
editor = {Barbaresi, Adrien and
Bildhauer, Felix and
Sch{\"a}fer, Roland and
Stemle, Egon},
booktitle = "Proceedings of the 12th Web as Corpus Workshop",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.wac-1.8",
pages = "57--64",
abstract = "The Twitter Streaming API has been used to create language-specific corpora with varying degrees of success. Selecting a filter of frequent yet distinct keywords for German resulted in a near-complete collection of German tweets. This method is promising as it keeps within Twitter endpoint limitations and could be applied to other languages besides German. But so far no research has compared methods for selecting optimal keywords for this task. This paper proposes a method for finding optimal key phrases based on a greedy solution to the maximum coverage problem. We generate candidate key phrases for the 50 most frequent languages on Twitter. Candidates are then iteratively selected based on a variety of scoring functions applied to their coverage of target tweets. Selecting candidates based on the scoring function that exponentiates the precision of a key phrase and weighs it by recall achieved the best results overall. Some target languages yield lower results than what could be expected from their prevalence on Twitter. Upon analyzing the errors, we find that these are languages that are very close to more prevalent languages. In these cases, key phrases that limit finding the competitive language are selected, and overall recall on the target language also decreases. We publish the resulting optimized lists for each language as a resource. The code to generate lists for other research objectives is also supplied.",
language = "English",
ISBN = "979-10-95546-68-9",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kreutz-daelemans-2020-streaming">
<titleInfo>
<title>Streaming Language-Specific Twitter Data with Optimal Keywords</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Kreutz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walter</namePart>
<namePart type="family">Daelemans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Web as Corpus Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Barbaresi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Bildhauer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roland</namePart>
<namePart type="family">Schäfer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Egon</namePart>
<namePart type="family">Stemle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-68-9</identifier>
</relatedItem>
<abstract>The Twitter Streaming API has been used to create language-specific corpora with varying degrees of success. Selecting a filter of frequent yet distinct keywords for German resulted in a near-complete collection of German tweets. This method is promising as it keeps within Twitter endpoint limitations and could be applied to other languages besides German. But so far no research has compared methods for selecting optimal keywords for this task. This paper proposes a method for finding optimal key phrases based on a greedy solution to the maximum coverage problem. We generate candidate key phrases for the 50 most frequent languages on Twitter. Candidates are then iteratively selected based on a variety of scoring functions applied to their coverage of target tweets. Selecting candidates based on the scoring function that exponentiates the precision of a key phrase and weighs it by recall achieved the best results overall. Some target languages yield lower results than what could be expected from their prevalence on Twitter. Upon analyzing the errors, we find that these are languages that are very close to more prevalent languages. In these cases, key phrases that limit finding the competitive language are selected, and overall recall on the target language also decreases. We publish the resulting optimized lists for each language as a resource. The code to generate lists for other research objectives is also supplied.</abstract>
<identifier type="citekey">kreutz-daelemans-2020-streaming</identifier>
<location>
<url>https://aclanthology.org/2020.wac-1.8</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>57</start>
<end>64</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Streaming Language-Specific Twitter Data with Optimal Keywords
%A Kreutz, Tim
%A Daelemans, Walter
%Y Barbaresi, Adrien
%Y Bildhauer, Felix
%Y Schäfer, Roland
%Y Stemle, Egon
%S Proceedings of the 12th Web as Corpus Workshop
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-68-9
%G English
%F kreutz-daelemans-2020-streaming
%X The Twitter Streaming API has been used to create language-specific corpora with varying degrees of success. Selecting a filter of frequent yet distinct keywords for German resulted in a near-complete collection of German tweets. This method is promising as it keeps within Twitter endpoint limitations and could be applied to other languages besides German. But so far no research has compared methods for selecting optimal keywords for this task. This paper proposes a method for finding optimal key phrases based on a greedy solution to the maximum coverage problem. We generate candidate key phrases for the 50 most frequent languages on Twitter. Candidates are then iteratively selected based on a variety of scoring functions applied to their coverage of target tweets. Selecting candidates based on the scoring function that exponentiates the precision of a key phrase and weighs it by recall achieved the best results overall. Some target languages yield lower results than what could be expected from their prevalence on Twitter. Upon analyzing the errors, we find that these are languages that are very close to more prevalent languages. In these cases, key phrases that limit finding the competitive language are selected, and overall recall on the target language also decreases. We publish the resulting optimized lists for each language as a resource. The code to generate lists for other research objectives is also supplied.
%U https://aclanthology.org/2020.wac-1.8
%P 57-64
Markdown (Informal)
[Streaming Language-Specific Twitter Data with Optimal Keywords](https://aclanthology.org/2020.wac-1.8) (Kreutz & Daelemans, WAC 2020)
ACL