clustering excess deaths alone (by Euclidean distance)
url2 = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/excess_mortality/excess_mortality.csv"
excess_mortality <- read_csv(url(url2))
excess_mortality <- excess_mortality %>%
mutate(month = as.numeric(substr(date,6,7)),
year = as.numeric(substr(date,1,4)),
year_month = substr(date,1,7))
# transfer all weekly to monthly (aggregate)
excess_mortality2 <- excess_mortality %>%
dplyr::select(location, date, year_month, year, month, excess_proj_all_ages) %>%
group_by(location, year_month, year, month) %>%
summarise(excess_deaths = sum(excess_proj_all_ages), .groups="keep") %>%
filter(location %in% covid_general$location) %>%
drop_na() %>%
mutate(time_unit = as.numeric(sub("-",".",year_month)))
library(zoo) # for date operations
update_time = excess_mortality2 %>% group_by(location) %>% summarise(latest_date = max(time_unit),earliest = min(time_unit)) # choose update time since 2020/01 until 2021/12
selected_country = (update_time %>% filter(latest_date >= 2021.12 & earliest == 2020.01))$location
# keep all selected countries, filter out data after 2021/12
ex_country <- excess_mortality2 %>%
filter(location %in% selected_country & time_unit <= 2021.12) %>%
arrange(time_unit) %>%
mutate(date = as.Date(as.yearmon(year_month,"%Y-%m")))
library(gghighlight)
ggplot(ex_country, aes(date, excess_deaths,color=location)) +
geom_line(stat="identity") +
ylab("Projected Excess Deaths") +
gghighlight(max(excess_deaths) > 90000,
max_highlight = 4,
use_direct_label = TRUE) +
theme_minimal() +
theme(legend.position = 'none')

# exclude France because missing data from 2020/05 - 2021-10
ex_per_country <- ex_country[ex_country$location != "France",c("excess_deaths","time_unit","location")]
# long to wide
ex_per_country <- ex_per_country %>%
spread(location, excess_deaths)
# transpose excess deaths to matrix
deaths <- t(ex_per_country[-1])
deaths_dist <- proxy::dist(deaths, method="Euclidean")
ex_cluster_fit <- hclust(deaths_dist, method = "ward.D")
# plot clusters
ggdendro::ggdendrogram(ex_cluster_fit, rotate=TRUE, theme_dendro = FALSE) + theme_minimal() + xlab("") + ylab("")

#assign the four clusters to the data using cutree()
clustered_ex <- cutree(ex_cluster_fit, k=4)
clustered_exdata <- as.data.frame(as.table(clustered_ex))
colnames(clustered_exdata) <- c("location","cluster")
# how many countries per cluster
table(clustered_exdata$cluster)
##
## 1 2 3 4
## 53 3 12 1
clustered_exdata$location <- as.character(clustered_exdata$location)
joined_clusters <- ex_country %>%
inner_join(clustered_exdata, by = "location")
cluster1 <- joined_clusters %>% filter(cluster == "1")
table(cluster1$location)
##
## Albania Armenia Australia
## 24 24 24
## Austria Azerbaijan Belgium
## 24 24 24
## Bolivia Bosnia and Herzegovina Bulgaria
## 24 24 24
## Chile Croatia Cyprus
## 24 24 24
## Czechia Denmark Estonia
## 24 24 24
## Finland Georgia Greece
## 24 24 24
## Guatemala Hungary Iceland
## 24 24 24
## Ireland Israel Japan
## 24 24 24
## Kazakhstan Kyrgyzstan Latvia
## 24 24 24
## Lebanon Lithuania Luxembourg
## 24 24 24
## Malta Mauritius Moldova
## 24 24 24
## Mongolia Netherlands New Zealand
## 24 24 24
## North Macedonia Norway Oman
## 24 24 24
## Paraguay Portugal Qatar
## 24 24 24
## Serbia Singapore Slovakia
## 24 24 24
## Slovenia South Korea Sweden
## 24 24 24
## Switzerland Taiwan Thailand
## 24 24 24
## Uruguay Uzbekistan
## 24 24
ggplot(cluster1, aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location)

cluster2 <- joined_clusters %>% filter(cluster == "2") %>% glimpse()
## Rows: 72
## Columns: 8
## Groups: location, year_month, year, month [72]
## $ location <chr> "Brazil", "Mexico", "United States", "Brazil", "Mexico",…
## $ year_month <chr> "2020-01", "2020-01", "2020-01", "2020-02", "2020-02", "…
## $ year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 20…
## $ month <dbl> 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7,…
## $ excess_deaths <dbl> 1739.2, -486.6, -8926.0, 5310.4, -3121.6, -4753.0, 7351.…
## $ time_unit <dbl> 2020.01, 2020.01, 2020.01, 2020.02, 2020.02, 2020.02, 20…
## $ date <date> 2020-01-01, 2020-01-01, 2020-01-01, 2020-02-01, 2020-02…
## $ cluster <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
table(cluster2$location)
##
## Brazil Mexico United States
## 24 24 24
ggplot(cluster2, aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location)

cluster3 <- joined_clusters %>% filter(cluster == "3")
table(cluster3$location)
##
## Colombia Ecuador Germany Iran Italy
## 24 24 24 24 24
## Peru Poland Romania South Africa Spain
## 24 24 24 24 24
## Ukraine United Kingdom
## 24 24
ggplot(cluster3, aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location)

cluster4 <- joined_clusters %>% filter(cluster == "4")
table(cluster4$location)
##
## Russia
## 24
ggplot(cluster4, aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location)

different clustering algo for excess deaths
fit dtw hierarchical clustering
#deaths <- t(ex_per_country[-1])
# normalize data
deaths.norm <- BBmisc::normalize(deaths, method="standardize")
deaths_dist_norm <- dtw::dtwDist(deaths.norm) # calculate dtw distance
deaths_dist_norm <- as.dist(deaths_dist_norm) # convert to dist object
ex_cluster_fit2 <- hclust(deaths_dist_norm, method = "ward.D")
ggdendro::ggdendrogram(ex_cluster_fit2, rotate=TRUE, theme_dendro = FALSE) + theme_minimal() + xlab("") + ylab("")

#assign the four clusters to the data using cutree()
clustered_ex2 <- cutree(ex_cluster_fit2, k=6)
clustered_exdata2 <- as.data.frame(as.table(clustered_ex2))
colnames(clustered_exdata2) <- c("location","cluster")
# how many countries per cluster
table(clustered_exdata2$cluster)
##
## 1 2 3 4 5 6
## 13 15 4 12 13 12
clustered_exdata2$location <- as.character(clustered_exdata2$location)
joined_clusters2 <- ex_country %>%
inner_join(clustered_exdata2, by = "location")
plot each cluster
joined_clusters2 %>% filter(cluster == "1") %>%
ggplot(aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales="free")+
ggtitle("Excess Deaths Cluster 1")

joined_clusters2 %>% filter(cluster == "2") %>%
ggplot(aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales="free")+
ggtitle("Excess Deaths Cluster 2")

joined_clusters2 %>% filter(cluster == "3") %>%
ggplot(aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales = "free")+
ggtitle("Excess Deaths Cluster 3")

joined_clusters2 %>% filter(cluster == "4") %>%
ggplot(aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales = "free")+
ggtitle("Excess Deaths Cluster 4")

joined_clusters2 %>% filter(cluster == "5") %>%
ggplot(aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales = "free")+
ggtitle("Excess Deaths Cluster 5")

joined_clusters2 %>% filter(cluster == "6") %>%
ggplot(aes(date, excess_deaths)) +
geom_line(color="grey") +
theme_minimal() +
ylab("excess deaths") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales = "free")+
ggtitle("Excess Deaths Cluster 6")

merge excess deaths class to covid_no_ts
ex_class <- joined_clusters2 %>%
group_by(location) %>%
summarise(excess_death_class = unique(cluster))
covid_no_ts_all <- left_join(ex_class, covid_no_ts, by = "location")
clustering policy stringency alone
data management
stringency <- covid_general_ts %>% dplyr::select(location, date, stringency_index) %>%
filter(location %in% ex_class$location) %>%
mutate(year_month = substr(date,1,7),
time_unit = as.numeric(sub("-",".",year_month))) %>%
drop_na()
# look at earliest and latest time in all country
summary((stringency %>%
filter(!is.na(stringency_index)) %>%
group_by(location) %>%
summarise(latest_date = max(date),
earliest_date = min(date))))
## location latest_date earliest_date
## Length:67 Min. :2022-01-24 Min. :2020-01-01
## Class :character 1st Qu.:2022-02-14 1st Qu.:2020-02-01
## Mode :character Median :2022-02-20 Median :2020-02-25
## Mean :2022-02-16 Mean :2020-02-17
## 3rd Qu.:2022-02-21 3rd Qu.:2020-03-03
## Max. :2022-02-22 Max. :2020-03-18
### --> choose 2020-03-18 as earliest date and 2022-01-24 as latest
# compute stringency level by month (take the average)
month_stringency <- stringency %>%
group_by(location, year_month,time_unit) %>%
summarise(monthly_stringency = mean(stringency_index),
.groups = "keep") %>%
# filter out date after 2021/12
filter(time_unit <= 2021.12) %>%
arrange(time_unit) %>%
mutate(date = as.Date(as.yearmon(year_month,"%Y-%m")))
ggplot(month_stringency, aes(date, monthly_stringency,color=location)) +
geom_line(stat="identity") +
ylab("Monthly Policy Stringency Index") +
gghighlight(max(monthly_stringency) > 95,
max_highlight = 4,
use_direct_label = TRUE) +
theme_minimal() +
theme(legend.position = 'none')

fit dtw hierarchical clustering
# long to wide
stringency_per_country <- month_stringency[,c("monthly_stringency","time_unit","location")] %>%
spread(location, monthly_stringency) %>%
drop_na()
policy <- t(stringency_per_country[-1])
# normalize data
policy.norm <- BBmisc::normalize(policy, method="standardize")
policy_dist_norm <- dtw::dtwDist(policy.norm) # calculate dtw distance
policy_dist_norm <- as.dist(policy_dist_norm) # convert to dist object
policy_cluster_fit <- hclust(policy_dist_norm, method = "ward.D")
ggdendro::ggdendrogram(policy_cluster_fit, rotate=TRUE, theme_dendro = FALSE) + theme_minimal() + xlab("") + ylab("")

#assign the four clusters to the data using cutree()
clustered_policy <- as.data.frame(as.table(cutree(policy_cluster_fit, k=6)))
colnames(clustered_policy) <- c("location","cluster")
# how many countries per cluster
table(clustered_policy$cluster)
##
## 1 2 3 4 5 6
## 9 4 8 18 16 12
clustered_policy$location <- as.character(clustered_policy$location)
joined_clusters_policy <- month_stringency %>%
inner_join(clustered_policy, by = "location")
plot policy stringency clusters
joined_clusters_policy %>% filter(cluster == "1") %>%
ggplot(aes(date, monthly_stringency)) +
geom_line(color="grey") +
theme_minimal() +
ylab("policy stringency") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales="free")+
ggtitle("Policy Stringency Cluster 1")

joined_clusters_policy %>% filter(cluster == "2") %>%
ggplot(aes(date, monthly_stringency)) +
geom_line(color="grey") +
theme_minimal() +
ylab("policy stringency") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales="free")+
ggtitle("Policy Stringency Cluster 2")

joined_clusters_policy %>% filter(cluster == "3") %>%
ggplot(aes(date, monthly_stringency)) +
geom_line(color="grey") +
theme_minimal() +
ylab("policy stringency") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales="free")+
ggtitle("Policy Stringency Cluster 3")

joined_clusters_policy %>% filter(cluster == "4") %>%
ggplot(aes(date, monthly_stringency)) +
geom_line(color="grey") +
theme_minimal() +
ylab("policy stringency") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales="free")+
ggtitle("Policy Stringency Cluster 4")

joined_clusters_policy %>% filter(cluster == "5") %>%
ggplot(aes(date, monthly_stringency)) +
geom_line(color="grey") +
theme_minimal() +
ylab("policy stringency") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales="free")+
ggtitle("Policy Stringency Cluster 5")

joined_clusters_policy %>% filter(cluster == "1") %>%
ggplot(aes(date, monthly_stringency)) +
geom_line(color="grey") +
theme_minimal() +
ylab("policy stringency") + xlab("") +
geom_smooth(method="auto",color="red", se=F, size=0.5) +
facet_wrap(~location, scales="free")+
ggtitle("Policy Stringency Cluster 6")

merge policy to covid_no_ts
policy_class <- joined_clusters_policy %>%
group_by(location) %>%
summarise(stringency_class = unique(cluster))
covid_no_ts_all <- left_join(policy_class, covid_no_ts, by = "location")