@inproceedings{doughman-etal-2020-time,
title = "Time-Aware Word Embeddings for Three {L}ebanese News Archives",
author = "Doughman, Jad and
Abu Salem, Fatima and
Elbassuoni, Shady",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.580",
pages = "4717--4725",
abstract = "Word embeddings have proven to be an effective method for capturing semantic relations among distinct terms within a large corpus. In this paper, we present a set of word embeddings learnt from three large Lebanese news archives, which collectively consist of 609,386 scanned newspaper images and spanning a total of 151 years, ranging from 1933 till 2011. The diversified ideological nature of the news archives alongside the temporal variability of the embeddings offer a rare glimpse onto the variation of word representation across the left-right political spectrum. To train the word embeddings, Google{'}s Tesseract 4.0 OCR engine was employed to transcribe the scanned news archives, and various archive-level as well as decade-level word embeddings were learnt. To evaluate the accuracy of the learnt word embeddings, a benchmark of analogy tasks was used. Finally, we demonstrate an interactive system that allows the end user to visualize for a given word of interest, the variation of the top-k closest words in the embedding space as a function of time and across news archives using an animated scatter plot.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection containing a single bibliographic record for a conference paper. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="doughman-etal-2020-time">
    <!-- Title of the paper itself (the proceedings title is under relatedItem below). -->
    <titleInfo>
      <title>Time-Aware Word Embeddings for Three Lebanese News Archives</title>
    </titleInfo>

    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Jad</namePart>
      <namePart type="family">Doughman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fatima</namePart>
      <namePart type="family">Abu Salem</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shady</namePart>
      <namePart type="family">Elbassuoni</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper, given to month precision. -->
    <originInfo>
      <dateIssued>2020-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Language is recorded twice: as display text and as an iso639-2b code. -->
    <language>
      <languageTerm type="text">English</languageTerm>
      <languageTerm type="code" authority="iso639-2b">eng</languageTerm>
    </language>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Frédéric</namePart>
        <namePart type="family">Béchet</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Philippe</namePart>
        <namePart type="family">Blache</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Choukri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christopher</namePart>
        <namePart type="family">Cieri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Thierry</namePart>
        <namePart type="family">Declerck</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sara</namePart>
        <namePart type="family">Goggi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hitoshi</namePart>
        <namePart type="family">Isahara</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bente</namePart>
        <namePart type="family">Maegaard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joseph</namePart>
        <namePart type="family">Mariani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hélène</namePart>
        <namePart type="family">Mazo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Asuncion</namePart>
        <namePart type="family">Moreno</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jan</namePart>
        <namePart type="family">Odijk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stelios</namePart>
        <namePart type="family">Piperidis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association</publisher>
        <place>
          <placeTerm type="text">Marseille, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-10-95546-34-4</identifier>
    </relatedItem>

    <!-- Abstract text is kept on one line so no indentation whitespace leaks into the value. -->
    <abstract>Word embeddings have proven to be an effective method for capturing semantic relations among distinct terms within a large corpus. In this paper, we present a set of word embeddings learnt from three large Lebanese news archives, which collectively consist of 609,386 scanned newspaper images and spanning a total of 151 years, ranging from 1933 till 2011. The diversified ideological nature of the news archives alongside the temporal variability of the embeddings offer a rare glimpse onto the variation of word representation across the left-right political spectrum. To train the word embeddings, Google’s Tesseract 4.0 OCR engine was employed to transcribe the scanned news archives, and various archive-level as well as decade-level word embeddings were learnt. To evaluate the accuracy of the learnt word embeddings, a benchmark of analogy tasks was used. Finally, we demonstrate an interactive system that allows the end user to visualize for a given word of interest, the variation of the top-k closest words in the embedding space as a function of time and across news archives using an animated scatter plot.</abstract>

    <!-- Citation key; same value as the mods ID attribute above. -->
    <identifier type="citekey">doughman-etal-2020-time</identifier>
    <location>
      <url>https://aclanthology.org/2020.lrec-1.580</url>
    </location>

    <!-- Position of the paper within the host volume: date and page range. -->
    <part>
      <date>2020-05</date>
      <extent unit="page">
        <start>4717</start>
        <end>4725</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Time-Aware Word Embeddings for Three Lebanese News Archives
%A Doughman, Jad
%A Abu Salem, Fatima
%A Elbassuoni, Shady
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F doughman-etal-2020-time
%X Word embeddings have proven to be an effective method for capturing semantic relations among distinct terms within a large corpus. In this paper, we present a set of word embeddings learnt from three large Lebanese news archives, which collectively consist of 609,386 scanned newspaper images and spanning a total of 151 years, ranging from 1933 till 2011. The diversified ideological nature of the news archives alongside the temporal variability of the embeddings offer a rare glimpse onto the variation of word representation across the left-right political spectrum. To train the word embeddings, Google’s Tesseract 4.0 OCR engine was employed to transcribe the scanned news archives, and various archive-level as well as decade-level word embeddings were learnt. To evaluate the accuracy of the learnt word embeddings, a benchmark of analogy tasks was used. Finally, we demonstrate an interactive system that allows the end user to visualize for a given word of interest, the variation of the top-k closest words in the embedding space as a function of time and across news archives using an animated scatter plot.
%U https://aclanthology.org/2020.lrec-1.580
%P 4717-4725
Markdown (Informal)
[Time-Aware Word Embeddings for Three Lebanese News Archives](https://aclanthology.org/2020.lrec-1.580) (Doughman et al., LREC 2020)
ACL