## Clear R environment:
rm(list=ls())
## Import necessary libraries:
library(data.table)
library(psych)
library(scales)
library(vtable)
library(stargazer)
library(weights)
library(survey)
library(tidyverse)
library(clipr)
library(knitr)
library(RgoogleMaps)
library(ggmap)
library(mapproj)
library(plyr)
## Set working directory:
setwd("~/Desktop/MaratAndLaruelle2020/GDELT Stuff/DataAndRFiles/RUS-HUN/")
This is the BigQuery code used to import the dataset from GDELT:
# SELECT GLOBALEVENTID, SQLDATE, Actor1Name, Actor1CountryCode, Actor2Name, Actor2CountryCode, IsRootEvent, EventCode, EventBaseCode, EventRootCode, QuadClass, GoldsteinScale, NumMentions, NumArticles, AvgTone, ActionGeo_Type, ActionGeo_Lat, ActionGeo_Long, ActionGeo_FeatureID, DATEADDED, SOURCEURL
# FROM `gdelt-bq.full.events`
# WHERE Actor1CountryCode='RUS' AND Actor2CountryCode='HUN'
Above code in plain English:
If you look at “SELECT” in all caps, it means that I imported all of the above variables after “SELECT”, starting from “GLOBALEVENTID” to “SOURCEURL”
“FROM” in all caps specifies where I extracted the data from. In this case, from GDELT’s “Full Events” dataset.
“WHERE” in all caps specifies the conditions of my data extraction. In this case, I only extracted the events where the origin country (Actor 1) is Russia, and Actor 2 is Hungary.
Import dataset:
## Import dataset:
## Import dataset:
ds1 <- fread("rushun.csv")
## Subset it so it begins at 2015:
ds <- ds1[which(ds1$SQLDATE > 20141231), ]
## Check actor 1:
table(ds$Actor1CountryCode)
##
## RUS
## 7940
## Check actor 2:
table(ds$Actor2CountryCode)
##
## HUN
## 7940
Now, we can see that our dataset has 7940 observations. That is, there were 7940 “events” coded by GDELT between January 2015 and October 2020, in which Actor 1 was Russia and Actor 2 was Hungary. Note that this does not mean there were 7940 news stories, but 7940 events - each event has multiple news stories, which is denoted by the “NumArticles” variable (we’ll visualize it later).
Now, we can temporally visualize the events in our dataset. We can aggregate this to year level to see how many Russia-Hungary events were recorded by GDELT for each of the years that we are interested in:
ds$year <- as.numeric(substr(ds$SQLDATE,1,4))
ds$year <- as.character(substr(ds$SQLDATE,1,4))
ds$month <- as.character(substr(ds$SQLDATE,5,6))
ds$day <- as.character(substr(ds$SQLDATE,7,8))
ds$date <- paste(ds$year,ds$month,ds$day,sep = "-")
ds$date <- as.Date(ds$date)
table(ds$year)
##
## 2015 2016 2017 2018 2019 2020
## 1967 1529 1903 1096 1138 307
The output above (in table form) shows that GDELT recorded 1967 Russia-Hungary events in 2015, 1529 such events in 2016, 1903 in 2017, and so on. Here’s the histogram:
barplot(table(ds$year),
main = "Number of Russia-Hungary Events by Year (n = 7940)",
ylab ="Number of Events",
xlab = "Year", yaxt="n")
axis(2, at = seq(0, 2000, 200), las=2)
Now, we can look at the number of articles per event:
table(ds$NumArticles)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1581 2393 427 893 236 715 95 544 41 274 2 117 14 37 24 91
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 7 43 6 49 10 10 5 32 6 16 4 12 2 15 1 8
## 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 3 5 4 7 1 3 3 8 2 4 1 2 2 1 1 10
## 50 51 53 54 55 56 57 58 60 63 64 66 68 70 72 73
## 4 3 1 6 4 4 1 1 2 1 3 3 2 4 6 2
## 74 75 76 78 84 86 87 90 92 94 96 97 98 101 104 105
## 2 1 1 2 4 2 1 2 1 1 2 1 1 1 1 2
## 108 110 111 112 114 120 122 124 125 126 128 129 130 132 133 134
## 3 2 1 1 2 3 1 1 2 3 2 1 1 1 1 1
## 136 141 144 149 151 152 159 160 164 168 172 174 180 188 191 194
## 1 2 4 1 2 1 1 1 1 2 1 1 1 1 1 1
## 198 202 204 205 206 207 210 220 226 229 232 234 249 252 262 263
## 2 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1
## 273 294 304 314 334 360 388 396 404 414 428 448 456 470 527 594
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2
## 610 616 620 702 777 834 854 952 992 1074 1204 1228 1512 1575
## 1 1 1 3 1 1 1 1 1 2 1 1 1 1
sum(ds$NumArticles)
## [1] 75165
We can see that, overall, GDELT recorded a total of 75165 articles published about the 7940 ‘events’.
We can use the “sourceurl” variable to see the actual event that inspired the most articles:
mostpopular <- ds[which(ds$NumArticles == max(ds$NumArticles)), ]
mostpopular$SOURCEURL
## [1] "http://www.middletownpress.com/news/politics/article/Hit-list-exposes-Russian-hacking-beyond-US-12325229.php"
The output spits out this url:
The above url is not valid, but you can “human read” the url and google the headlines, based on the date the article was published.
mostpopular$SQLDATE
## [1] 20171102
This news story was published on 11-02-2017. After Googling the headline, I found an alternative url for the same news story, published on the same date (but on a different website):
https://www.businessinsider.com/kremlin-putin-hacking-hit-list-2017-11
Note: it seems that this “event” has very little to do with Hungary, even though GDELT’s coding scheme codes “Actor 2” as Hungary. Basically, the article only mentions Hungary in passing.
Summary stats and histogram of the ‘average tone’ variable:
summary(ds$AvgTone)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.0239 -3.2536 -1.4769 -1.4682 0.4351 9.0164
Median tone of Russia-Hungary events is -1.48, mean is -1.47, max is 9.02, and minimum is -14.02.
hist(ds$AvgTone,
main = "Histogram of Average Tone for Russia-Hungary Events, 2015 - 2020
(n = 7940)",
ylab = "Frequency",
xlab = "Average Tone",
yaxt="n")
axis(2, at = seq(0, 2000, 500), las = 2)
We can also look at the particular events that inspired the most negative and most positive tones respectively:
mostnegative <- ds [which(ds$AvgTone == min(ds$AvgTone)), ]
mostpositive <- ds [which(ds$AvgTone == max(ds$AvgTone)), ]
mostnegative$SOURCEURL
## [1] "http://www.sharenet.co.za/news/Russia_restricts_pig_pork_imports_from_Hungary_due_to_disease/2fb307720060553d363c77f3244f5da3"
mostpositive$SOURCEURL
## [1] "https://eng.belta.by/society/view/belarus-ambassador-takes-part-in-victory-campaign-in-hungary-128927-2020/"
## [2] "https://eng.belta.by/society/view/belarus-ambassador-takes-part-in-victory-campaign-in-hungary-128927-2020/"
Event with the most negative tone:
Event with the most positive tone:
Finally, we can also visualize how the tone of the 7940 events varied from year to year.
# eval=FALSE
x <- 0
plot(ds$date, ds$AvgTone,
xlab = "Year",
ylab = "Average Tone",
main = "Average Tone for 7940 Events Between Jan 2015 - October 2020
Actor 1 = Russia; Actor 2 = Hungary",
ylim=c(-20,20))
abline(h=x, col="red", lwd=3)
#axis(1, at = seq(20150000, 20200000, 10000),
# labels = c("2015", "2016", "2017", "2018", "2019", "2020"))
# 
The horizontal red line represents ‘neutral’ tone. Dots above it represent events with positive tone, and the dots below it represent events with negative tone.
## ONLY RUN THIS ONCE (then save the map and call it into the pdf/html document):
# Reshape the data so we get the counts of events for each location
ds$count <- 1
df <- ddply(ds, .(ActionGeo_Lat, ActionGeo_Long), summarize, count = sum(count))
table(df$count)
# define the scope of the map
lat <- c(45, 49)
lon <- c(14, 25)
center = c(lat = mean(lat), lon = mean(lon))
zoom <- min(MaxZoom(range(lat), range(lon)))
## Register GOOGLE API Key:
## ONLY NEED TO DO THIS ONCE:
# register_google(key = "AIzaSyDLw5jMnMby8pLgUayMZ6q-EhXQKj9nmeg", write = TRUE)
# Download a map file:
map <- get_map(location = c(lon = mean(lon), lat = mean(lat)), zoom = 7,
maptype = "roadmap",
crop=TRUE,
source = "google", language = "English", color="bw")
## maptype options:
# see ?get_map()
# or ?get_stamenmap()
## Fill up the map with our data:
print(ggmap(map) + geom_point(data = df, aes(x = ActionGeo_Long, y = ActionGeo_Lat,
size = count), alpha = 0.3, colour="blue")+
ggtitle('Russia-Hungary Events, 2015 - 2020'))
This is a preliminary analysis of the Russia - Hungary dyad. I am still working on fine tuning it. Maybe use other variables besides “actor1” for Russia?
Will need to find a better way to visualize the “year vs tone” plot
Comments/suggestions?