library(dplyr)
library(readr)
#library(fpp2)
#library(forecast)
#library(tseries) # for adf.test() 
#library(FinTS) # for ArchTest()
#library(vrtest) # for Auto.AR()
#library(vars)
#library(MTS)
#library(tsbox)
library(tidyverse)
owid_covid_data <- read_csv("covid-19-data-master/public/data/owid-covid-data.csv")
covid_general <- owid_covid_data %>% 
  dplyr::select(iso_code, continent, location,date, total_vaccinations, people_vaccinated, stringency_index, population, population_density, median_age, aged_65_older, aged_70_older, gdp_per_capita, extreme_poverty, life_expectancy, cardiovasc_death_rate, diabetes_prevalence, handwashing_facilities, hosp_patients, hospital_beds_per_thousand)
climate_zones <- read_csv("climate zones - Munka1.csv") %>% dplyr::select(-1)
climate_zones[climate_zones$Country.Region=="US",]$Country.Region<-"United States"
names(climate_zones)[1] <- "location"
#World Value Survey
WVS <- read.csv2("WVSextra.csv")
names(WVS) <- c("location",'social_utility','conformity','trust','compliance')
#Legatum Institue's 2019 Prosperity Index
prosperity <-  read.csv("Prosperity Index_2019_Data_szerk.csv", sep=";")
names(prosperity)[1] <- "location"
covid_general <- left_join(prosperity,covid_general, by=c("location"="location"))
length(unique(covid_general$location))
## [1] 167
covid_general <- left_join(covid_general, climate_zones, by=c("location"="location"))
length(unique(covid_general$location))
## [1] 167
# filter our ts object
covid_general_ts <- covid_general %>% dplyr::select(location, iso_code, continent,date, stringency_index, total_vaccinations,people_vaccinated, hosp_patients)
# filter out non ts objects
covid_no_ts <- covid_general %>% dplyr::select(-c("date","stringency_index","total_vaccinations","people_vaccinated","hosp_patients","handwashing_facilities","hospital_beds_per_thousand")) %>% distinct()

US ts predicts

test for normality & Box.Cox transformation

test for stationarity

clustering excess deaths alone (by Euclidean distance)

url2 = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/excess_mortality/excess_mortality.csv"
excess_mortality <- read_csv(url(url2))  
excess_mortality <- excess_mortality %>% 
  mutate(month = as.numeric(substr(date,6,7)),
         year = as.numeric(substr(date,1,4)),
         year_month = substr(date,1,7))

# transfer all weekly to monthly (aggregate)
excess_mortality2 <- excess_mortality %>% 
  dplyr::select(location, date, year_month, year, month, excess_proj_all_ages) %>%
  group_by(location, year_month, year, month) %>% 
  summarise(excess_deaths = sum(excess_proj_all_ages), .groups="keep") %>% 
  filter(location %in% covid_general$location) %>% 
  drop_na() %>% 
  mutate(time_unit = as.numeric(sub("-",".",year_month)))
library(zoo) # for date operations
update_time = excess_mortality2 %>% group_by(location) %>% summarise(latest_date = max(time_unit),earliest = min(time_unit)) # choose update time since 2020/01 until 2021/12
selected_country = (update_time %>% filter(latest_date >= 2021.12 & earliest == 2020.01))$location

# keep all selected countries, filter out data after 2021/12
ex_country <- excess_mortality2 %>% 
  filter(location %in% selected_country & time_unit <= 2021.12) %>% 
  arrange(time_unit) %>% 
  mutate(date = as.Date(as.yearmon(year_month,"%Y-%m")))
library(gghighlight)
ggplot(ex_country, aes(date, excess_deaths,color=location)) +
  geom_line(stat="identity") +
  ylab("Projected Excess Deaths") +
  gghighlight(max(excess_deaths) > 90000,
              max_highlight = 4,
              use_direct_label = TRUE) +
  theme_minimal() +
  theme(legend.position = 'none')

# exclude France because missing data from 2020/05 - 2021-10
ex_per_country <- ex_country[ex_country$location != "France",c("excess_deaths","time_unit","location")]
  
# long to wide
ex_per_country <- ex_per_country %>% 
  spread(location, excess_deaths)
# transpose excess deaths to matrix
deaths <- t(ex_per_country[-1])
deaths_dist <- proxy::dist(deaths, method="Euclidean")
ex_cluster_fit <- hclust(deaths_dist, method = "ward.D")
# plot clusters
ggdendro::ggdendrogram(ex_cluster_fit, rotate=TRUE, theme_dendro = FALSE) + theme_minimal() + xlab("") + ylab("")

#assign the four clusters to the data using cutree() 
clustered_ex <- cutree(ex_cluster_fit, k=4)

clustered_exdata <- as.data.frame(as.table(clustered_ex))

colnames(clustered_exdata) <- c("location","cluster")
# how many countries per cluster
table(clustered_exdata$cluster)
## 
##  1  2  3  4 
## 53  3 12  1
clustered_exdata$location <- as.character(clustered_exdata$location)

joined_clusters <- ex_country %>% 
  inner_join(clustered_exdata, by = "location")
cluster1 <- joined_clusters %>% filter(cluster == "1") 
table(cluster1$location)
## 
##                Albania                Armenia              Australia 
##                     24                     24                     24 
##                Austria             Azerbaijan                Belgium 
##                     24                     24                     24 
##                Bolivia Bosnia and Herzegovina               Bulgaria 
##                     24                     24                     24 
##                  Chile                Croatia                 Cyprus 
##                     24                     24                     24 
##                Czechia                Denmark                Estonia 
##                     24                     24                     24 
##                Finland                Georgia                 Greece 
##                     24                     24                     24 
##              Guatemala                Hungary                Iceland 
##                     24                     24                     24 
##                Ireland                 Israel                  Japan 
##                     24                     24                     24 
##             Kazakhstan             Kyrgyzstan                 Latvia 
##                     24                     24                     24 
##                Lebanon              Lithuania             Luxembourg 
##                     24                     24                     24 
##                  Malta              Mauritius                Moldova 
##                     24                     24                     24 
##               Mongolia            Netherlands            New Zealand 
##                     24                     24                     24 
##        North Macedonia                 Norway                   Oman 
##                     24                     24                     24 
##               Paraguay               Portugal                  Qatar 
##                     24                     24                     24 
##                 Serbia              Singapore               Slovakia 
##                     24                     24                     24 
##               Slovenia            South Korea                 Sweden 
##                     24                     24                     24 
##            Switzerland                 Taiwan               Thailand 
##                     24                     24                     24 
##                Uruguay             Uzbekistan 
##                     24                     24
ggplot(cluster1, aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location)

cluster2 <- joined_clusters %>% filter(cluster == "2") %>% glimpse()
## Rows: 72
## Columns: 8
## Groups: location, year_month, year, month [72]
## $ location      <chr> "Brazil", "Mexico", "United States", "Brazil", "Mexico",…
## $ year_month    <chr> "2020-01", "2020-01", "2020-01", "2020-02", "2020-02", "…
## $ year          <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 20…
## $ month         <dbl> 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7,…
## $ excess_deaths <dbl> 1739.2, -486.6, -8926.0, 5310.4, -3121.6, -4753.0, 7351.…
## $ time_unit     <dbl> 2020.01, 2020.01, 2020.01, 2020.02, 2020.02, 2020.02, 20…
## $ date          <date> 2020-01-01, 2020-01-01, 2020-01-01, 2020-02-01, 2020-02…
## $ cluster       <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
table(cluster2$location)
## 
##        Brazil        Mexico United States 
##            24            24            24
ggplot(cluster2, aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location)

cluster3 <- joined_clusters %>% filter(cluster == "3") 
table(cluster3$location)
## 
##       Colombia        Ecuador        Germany           Iran          Italy 
##             24             24             24             24             24 
##           Peru         Poland        Romania   South Africa          Spain 
##             24             24             24             24             24 
##        Ukraine United Kingdom 
##             24             24
ggplot(cluster3, aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location)

cluster4 <- joined_clusters %>% filter(cluster == "4") 
table(cluster4$location)
## 
## Russia 
##     24
ggplot(cluster4, aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location)

different clustering algo for excess deaths

fit dtw hierarchical clustering

#deaths <- t(ex_per_country[-1])
# normalize data
deaths.norm <- BBmisc::normalize(deaths, method="standardize")
deaths_dist_norm <- dtw::dtwDist(deaths.norm) # calculate dtw distance
deaths_dist_norm <- as.dist(deaths_dist_norm) # convert to dist object
ex_cluster_fit2 <- hclust(deaths_dist_norm, method = "ward.D")

ggdendro::ggdendrogram(ex_cluster_fit2, rotate=TRUE, theme_dendro = FALSE) + theme_minimal() + xlab("") + ylab("")

#assign the four clusters to the data using cutree() 
clustered_ex2 <- cutree(ex_cluster_fit2, k=6)

clustered_exdata2 <- as.data.frame(as.table(clustered_ex2))

colnames(clustered_exdata2) <- c("location","cluster")
# how many countries per cluster
table(clustered_exdata2$cluster)
## 
##  1  2  3  4  5  6 
## 13 15  4 12 13 12
clustered_exdata2$location <- as.character(clustered_exdata2$location)

joined_clusters2 <- ex_country %>% 
  inner_join(clustered_exdata2, by = "location")

plot each cluster

joined_clusters2 %>% filter(cluster == "1") %>% 
  ggplot(aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales="free")+
  ggtitle("Excess Deaths Cluster 1")

joined_clusters2 %>% filter(cluster == "2") %>% 
  ggplot(aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales="free")+
  ggtitle("Excess Deaths Cluster 2")

joined_clusters2 %>% filter(cluster == "3") %>% 
  ggplot(aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales = "free")+
  ggtitle("Excess Deaths Cluster 3")

joined_clusters2 %>% filter(cluster == "4") %>% 
  ggplot(aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales = "free")+
  ggtitle("Excess Deaths Cluster 4")

joined_clusters2 %>% filter(cluster == "5") %>% 
  ggplot(aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales = "free")+
  ggtitle("Excess Deaths Cluster 5")

joined_clusters2 %>% filter(cluster == "6") %>% 
  ggplot(aes(date, excess_deaths)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("excess deaths") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales = "free")+
  ggtitle("Excess Deaths Cluster 6")

merge excess deaths class to covid_no_ts

ex_class <- joined_clusters2 %>% 
  group_by(location) %>% 
  summarise(excess_death_class = unique(cluster))

covid_no_ts_all <- left_join(ex_class, covid_no_ts, by = "location")

clustering policy stringency alone

data management

stringency <- covid_general_ts %>% dplyr::select(location, date, stringency_index) %>% 
  filter(location %in% ex_class$location) %>% 
  mutate(year_month = substr(date,1,7),
         time_unit = as.numeric(sub("-",".",year_month))) %>% 
  drop_na()
# look at earliest and latest time in all country
summary((stringency %>% 
  filter(!is.na(stringency_index)) %>% 
  group_by(location) %>% 
  summarise(latest_date = max(date),
            earliest_date = min(date))))
##    location          latest_date         earliest_date       
##  Length:67          Min.   :2022-01-24   Min.   :2020-01-01  
##  Class :character   1st Qu.:2022-02-14   1st Qu.:2020-02-01  
##  Mode  :character   Median :2022-02-20   Median :2020-02-25  
##                     Mean   :2022-02-16   Mean   :2020-02-17  
##                     3rd Qu.:2022-02-21   3rd Qu.:2020-03-03  
##                     Max.   :2022-02-22   Max.   :2020-03-18
### --> choose 2020-03-18 as earliest date and 2022-01-24 as latest
# compute stringency level by month (take the average)
month_stringency <- stringency %>% 
  group_by(location, year_month,time_unit) %>% 
  summarise(monthly_stringency = mean(stringency_index),
            .groups = "keep") %>% 
  # filter out date after 2021/12
  filter(time_unit <= 2021.12) %>% 
  arrange(time_unit) %>% 
  mutate(date = as.Date(as.yearmon(year_month,"%Y-%m")))
ggplot(month_stringency, aes(date, monthly_stringency,color=location)) +
  geom_line(stat="identity") +
  ylab("Monthly Policy Stringency Index") +
  gghighlight(max(monthly_stringency) > 95,
              max_highlight = 4,
              use_direct_label = TRUE) +
  theme_minimal() +
  theme(legend.position = 'none')

fit dtw hierarchical clustering

# long to wide
stringency_per_country <- month_stringency[,c("monthly_stringency","time_unit","location")] %>% 
  spread(location, monthly_stringency) %>% 
  drop_na()
policy <- t(stringency_per_country[-1])
# normalize data
policy.norm <- BBmisc::normalize(policy, method="standardize")
policy_dist_norm <- dtw::dtwDist(policy.norm) # calculate dtw distance
policy_dist_norm <- as.dist(policy_dist_norm) # convert to dist object
policy_cluster_fit <- hclust(policy_dist_norm, method = "ward.D")

ggdendro::ggdendrogram(policy_cluster_fit, rotate=TRUE, theme_dendro = FALSE) + theme_minimal() + xlab("") + ylab("")

#assign the four clusters to the data using cutree() 
clustered_policy <- as.data.frame(as.table(cutree(policy_cluster_fit, k=6)))
colnames(clustered_policy) <- c("location","cluster")
# how many countries per cluster
table(clustered_policy$cluster)
## 
##  1  2  3  4  5  6 
##  9  4  8 18 16 12
clustered_policy$location <- as.character(clustered_policy$location)
joined_clusters_policy <- month_stringency %>% 
  inner_join(clustered_policy, by = "location")

plot policy stringency clusters

joined_clusters_policy %>% filter(cluster == "1") %>% 
  ggplot(aes(date, monthly_stringency)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("policy stringency") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales="free")+
  ggtitle("Policy Stringency Cluster 1")

joined_clusters_policy %>% filter(cluster == "2") %>% 
  ggplot(aes(date, monthly_stringency)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("policy stringency") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales="free")+
  ggtitle("Policy Stringency Cluster 2")

joined_clusters_policy %>% filter(cluster == "3") %>% 
  ggplot(aes(date, monthly_stringency)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("policy stringency") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales="free")+
  ggtitle("Policy Stringency Cluster 3")

joined_clusters_policy %>% filter(cluster == "4") %>% 
  ggplot(aes(date, monthly_stringency)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("policy stringency") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales="free")+
  ggtitle("Policy Stringency Cluster 4")

joined_clusters_policy %>% filter(cluster == "5") %>% 
  ggplot(aes(date, monthly_stringency)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("policy stringency") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales="free")+
  ggtitle("Policy Stringency Cluster 5")

joined_clusters_policy %>% filter(cluster == "1") %>% 
  ggplot(aes(date, monthly_stringency)) +
  geom_line(color="grey") +
  theme_minimal() +
  ylab("policy stringency") + xlab("") +
  geom_smooth(method="auto",color="red", se=F, size=0.5) +
  facet_wrap(~location, scales="free")+
  ggtitle("Policy Stringency Cluster 6")

merge policy to covid_no_ts

policy_class <- joined_clusters_policy %>% 
  group_by(location) %>% 
  summarise(stringency_class = unique(cluster))

covid_no_ts_all <- left_join(policy_class, covid_no_ts, by = "location")