In this practice, you will work with real and actual data on the infection with the virus COVID-19. You will work with the number of cases of confirmed, recovered and deaths for some example countries. This practice focus more on the statistical analysis and presentation of data.
For this practice, use the following files:
Which can be downloaded from the Github repository database on Corona
As for every new session of R, you should set your working directory and load the necessary libraries:
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("dplyr", "magrittr", "lubridate", "xts", "tmap", "rgdal","spData")
ipak(packages)
Like for every new session of R, you should set your working directory.
# Set the working directory ("wd")
setwd("YOURDRIVE\\yourpath\\yourfolder")
# read csv
countries_aggregated <- read.table("data\\countries_aggregated_csv.csv", header = TRUE, sep = ",")
reference <- read.table("data\\reference_csv.csv", header = TRUE, sep = ",", fill = TRUE, quote = "")
# check data
head(countries_aggregated)
## Date Country Confirmed Recovered Deaths
## 1 2020-01-22 Afghanistan 0 0 0
## 2 2020-01-23 Afghanistan 0 0 0
## 3 2020-01-24 Afghanistan 0 0 0
## 4 2020-01-25 Afghanistan 0 0 0
## 5 2020-01-26 Afghanistan 0 0 0
## 6 2020-01-27 Afghanistan 0 0 0
head(reference)
## UID iso2 iso3 code3 FIPS Admin2 Province_State Country_Region Lat
## 1 4 AF AFG 4 NA Afghanistan 33.93911
## 2 8 AL ALB 8 NA Albania 41.1533
## 3 12 DZ DZA 12 NA Algeria 28.0339
## 4 20 AD AND 20 NA Andorra 42.5063
## 5 24 AO AGO 24 NA Angola -11.2027
## 6 28 AG ATG 28 NA Antigua and Barbuda 17.0608
## Long_ Combined_Key Population
## 1 67.70995 Afghanistan 38928341
## 2 20.16830 Albania 2877800
## 3 1.65960 Algeria 43851043
## 4 1.52180 Andorra 77265
## 5 17.87390 Angola 32866268
## 6 -61.79640 Antigua and Barbuda 97928
Since the CSV provides daily data, depending on the analysis to perform, it makes sense to aggregate the data down to monthly values. Before doing this, we need to identify which dates correspond to the same months.
# check start and end date
# use lubridate to transform the date format to be able to determine min and max
countries_aggregated$Date = ymd(countries_aggregated$Date)
summarize(countries_aggregated, min = min(Date), max = max(Date))
## min max
## 1 2020-01-22 2020-09-03
For this practice, the mortality rate defined as [deaths / confirmed cases] will be calculated for Germany, Italy and South Korea, from March to early September 2020. Please feel free to choose other countries for your analysis.
# calculate mortality rate and add that to the df
countries_aggregated$Mortality_rate <- with(countries_aggregated, Deaths / Confirmed * 100) #creates new column named 'Mortality_rate'.
#'with' is a function that allows you to evaluate an expression in the context of a data frame.
countries_aggregated$Mortality_rate[is.na(countries_aggregated$Mortality_rate)] <- 0 # replace NaN results with 0
countries_aggregated$Date <- as.Date(countries_aggregated$Date, format = "%m/%d/%y") # convert ymd to Date to be able to plot it easier
# create subsets for each country
sub_Germany <- subset(countries_aggregated, Country == "Germany")
sub_Italy <- subset(countries_aggregated, Country == "Italy")
sub_SK <- subset(countries_aggregated, Country == "Korea, South")
#check the first lines of your results with 'head()'
head(countries_aggregated)
## Date Country Confirmed Recovered Deaths Mortality_rate
## 1 2020-01-22 Afghanistan 0 0 0 0
## 2 2020-01-23 Afghanistan 0 0 0 0
## 3 2020-01-24 Afghanistan 0 0 0 0
## 4 2020-01-25 Afghanistan 0 0 0 0
## 5 2020-01-26 Afghanistan 0 0 0 0
## 6 2020-01-27 Afghanistan 0 0 0 0
head(sub_SK)
## Date Country Confirmed Recovered Deaths Mortality_rate
## 20454 2020-01-22 Korea, South 1 0 0 0
## 20455 2020-01-23 Korea, South 1 0 0 0
## 20456 2020-01-24 Korea, South 2 0 0 0
## 20457 2020-01-25 Korea, South 2 0 0 0
## 20458 2020-01-26 Korea, South 3 0 0 0
## 20459 2020-01-27 Korea, South 4 0 0 0
# plot values
time <- countries_aggregated$Date
plot(sub_Germany$Date, sub_Germany$Mortality_rate, type = "l", col = "red", xlab = "Month", ylab = "Mortality Rate [%]", ylim = c(0, 20), main = "Development of Mortality rate in select countries", xaxt="n") # 'xaxt="n"' removes the x axis and allows us to define our own
lines(sub_Italy$Date, sub_Italy$Mortality_rate, col = "green") # lines adds an additional line graph into the plot
lines(sub_SK$Date, sub_SK$Mortality_rate, col = "blue")
axis.Date(side=1,at=seq(min(time),max(time),by="month"),format="%m") # defines new x axis with each tick being a month, it is required to have the date in Date format for that
grid(nx = NA, ny = NULL) # add a grid for better readability of the mortality rate
legend(x = "topleft", legend = c("Germany","Italy","South Korea"),
lty=c(1,1,1), lwd=c(2.5,2.5,2.5),
col=c("red", "green", "blue"))
Figure 1. Development of mortality rate in select countries
par(mfrow=c(3,1))
# Germany
plot(sub_Germany$Date, sub_Germany$Confirmed, type = "l", col = "red", xlab = "Month", ylab = "Number of Cases",ylim = c(0,300000), main = "Development of Confirmed and Recovered cases in Germany", xaxt="n")
lines(sub_Germany$Date, sub_Germany$Recovered, col = "blue") # lines adds an additional line graph into the plot
axis.Date(side=1,at=seq(min(time),max(time),by="month"),format="%m") # defines new x axis with each tick being a month, it is required to have the date in Date format for that
grid(nx = NA, ny = NULL) # add a grid for better readability of the mortality rate
legend(x = "topleft", legend = c("Confirmed","Recovered"),
lty=c(1,1), lwd=c(2.5,2.5),
col=c("red", "blue"))
# Italy
plot(sub_Italy$Date, sub_Italy$Confirmed, type = "l", col = "red", xlab = "Month", ylab = "Number of Cases",ylim = c(0,300000), main = "Development of Confirmed and Recovered cases in Italy", xaxt="n")
lines(sub_Italy$Date, sub_Italy$Recovered, col = "blue") # lines adds an additional line graph into the plot
axis.Date(side=1,at=seq(min(time),max(time),by="month"),format="%m") # defines new x axis with each tick being a month, it is required to have the date in Date format for that
grid(nx = NA, ny = NULL) # add a grid for better readability of the mortality rate
legend(x = "topleft", legend = c("Confirmed","Recovered"),
lty=c(1,1), lwd=c(2.5,2.5),
col=c("red", "blue"))
# South Korea
plot(sub_SK$Date, sub_SK$Confirmed, type = "l", col = "red", xlab = "Month", ylab = "Number of Cases",ylim = c(0,300000), main = "Development of Confirmed and Recovered cases in South Korea", xaxt="n")
lines(sub_SK$Date, sub_SK$Recovered, col = "blue") # lines adds an additional line graph into the plot
axis.Date(side=1,at=seq(min(time),max(time),by="month"),format="%m") # defines new x axis with each tick being a month, it is required to have the date in Date format for that
grid(nx = NA, ny = NULL) # add a grid for better readability of the mortality rate
legend(x = "topleft", legend = c("Confirmed","Recovered"),
lty=c(1,1), lwd=c(2.5,2.5),
col=c("red", "blue"))
# alternative plot
plot(sub_Germany$Date, sub_Germany$Confirmed, type = "l",lty = 1,lwd = 1.75, col = "darkorange2", xlab = "Month", ylab = "Number of Cases",ylim = c(0,300000),yaxt="n", main = "Development of Confirmed and Recovered cases in select countries", xaxt="n")
lines(sub_Germany$Date, sub_Germany$Recovered, col = "darkorange2", lty = 2,lwd = 1.75) # lty changes the line type, lwd the line width
lines(sub_Italy$Date, sub_Italy$Confirmed, lty = 1, col = "chartreuse3",lwd = 1.75)
lines(sub_Italy$Date, sub_Italy$Recovered,lty = 2, col = "chartreuse3",lwd = 1.75)
lines(sub_SK$Date, sub_SK$Confirmed, type = "l", col = "blueviolet", lty = 1,lwd = 1.75)
lines(sub_SK$Date, sub_SK$Recovered, col = "blueviolet", lty = 2,lwd = 1.75)
axis.Date(side=1,at=seq(min(time),max(time),by="month"),format="%m")
axis(2, seq(0,300000,10000), las = 2, cex.axis = 0.6) # via axis and seq, we can set how many ticks are displayed on the y-axis, cex.axis controls the font size of the tick labels, las rotates the tick labels
grid(NA, NULL)
abline(h = c(10000,20000,30000,40000), lty = 3, col = "grey") # add additional horizontal grid lines
legend(x = "topleft", legend = c("Germany","Italy","South Korea","Confirmed","Recovered"),
lty=c(1,1,1,1,2), lwd=c(2.5,2.5,2.5,2.5,2.5),
col=c("darkorange2","chartreuse3","blueviolet","black", "black"))
ger_pop <- subset(reference, Country_Region == "Germany", select = Population)
ger_pop <- ger_pop[1,1]
ger_pop
## [1] "83783945"
# as dataframe
sub_Germany <- sub_Germany %>% mutate(Month = format(Date, "%m"))
sum_Germany <- aggregate(list(sub_Germany["Deaths"], sub_Germany["Confirmed"], sub_Germany["Recovered"]), by=sub_Germany["Month"], sum) # aggregate groups values in the first given list, by the second given list(s), the values are aggregated by the function specified (in this case they are summed up)
sum_Germany <- cbind(sum_Germany, aggregate(round(sub_Germany["Mortality_rate"],3), by=list(sub_Germany$Month), max))
sum_Germany <- sum_Germany[-5] # remove the additional month column added by the second aggregate
sum_Germany
## Month Deaths Confirmed Recovered Mortality_rate
## 1 01 0 18 0 0.000
## 2 02 0 561 174 0.000
## 3 03 3876 588179 68067 1.079
## 4 04 115270 3942925 2191330 4.063
## 5 05 242992 5427815 4664012 4.656
## 6 06 264281 5670762 5179784 4.699
## 7 07 281460 6265079 5770005 4.592
## 8 08 286250 7024733 6291811 4.338
## 9 09 27951 742266 659832 3.783
# via summarize
sub_Germany %>% mutate(Month = format(Date, "%m")) %>% group_by(Month) %>% summarize(Deaths = sum(Deaths), Confirmed = sum(Confirmed), Recovered = sum(Recovered), Mortality_rate = max(round(Mortality_rate,3)))
## # A tibble: 9 x 5
## Month Deaths Confirmed Recovered Mortality_rate
## <chr> <int> <int> <int> <dbl>
## 1 01 0 18 0 0
## 2 02 0 561 174 0
## 3 03 3876 588179 68067 1.08
## 4 04 115270 3942925 2191330 4.06
## 5 05 242992 5427815 4664012 4.66
## 6 06 264281 5670762 5179784 4.70
## 7 07 281460 6265079 5770005 4.59
## 8 08 286250 7024733 6291811 4.34
## 9 09 27951 742266 659832 3.78
# read and load vector file that has a subset of Central Europe
shp_eu <- readOGR("data\\shape\\ger_ita.shp")
## OGR data source with driver: ESRI Shapefile
## Source: "W:\Ulloa\SHK\Lehre\pandemic_analysis\data\shape\ger_ita.shp", layer: "ger_ita"
## with 34 features
## It has 2 fields
sub_shp <- subset(shp_eu, CNTRY_NAME == "Germany" | CNTRY_NAME == "Italy") # create a subset of only your chosen countries
tmap_mode("plot")
sub_Italy <- sub_Italy %>% mutate(Month = format(Date, "%m"))
sum_Italy <- aggregate(list(sub_Italy["Deaths"], sub_Italy["Confirmed"], sub_Italy["Recovered"]), by=sub_Italy["Month"], sum)
sum_Italy <- cbind(sum_Italy, aggregate(round(sub_Italy["Mortality_rate"],3), by=list(sub_Italy$Month), max))
sum_Italy <- sum_Italy[-5]
#copy shape for each month to display
sub_shp_03 <- sub_shp
sub_shp_05 <- sub_shp
sub_shp_08 <- sub_shp
#add Mortality Rate data to the feature
sub_shp_03$Mortality_rate <- c(sum_Germany[3,5],sum_Italy[3,5])
sub_shp_05$Mortality_rate <- c(sum_Germany[5,5],sum_Italy[5,5])
sub_shp_08$Mortality_rate <- c(sum_Germany[8,5],sum_Italy[8,5])
#for the first layer, add the general shape just with borders and filled for better visibility
map_base <- tm_shape(shp_eu) + tm_polygons()
#add additional layers
#'tm_fill' fills the in 'tm_shape' specified shape with the in 'col' specified color. you can also give a variable of the shape, this way each class of variable will have its own color. (use 'legend.show = FALSE' to prevent every displayed country to show up in the legend)
#use 'tm_add_legend' to create your own legend
#'tm_bubbles' creates bubbles according to the given variable
#'tm_text' displays the value of the selected variable as label. use 'ymod' and 'xmod' to alter the position of the text
#use 'tm_layout' to add title, change position of the legend, text color, etc.
map_03 <- map_base + tm_shape(sub_shp_03) + tm_fill("CNTRY_NAME", legend.show = FALSE) + tm_bubbles("Mortality_rate", col = "black", scale = 2, border.col = "grey20") + tm_text("Mortality_rate", size = 0.7, col = "grey20", ymod = 1.5, xmod = -0.5) + tm_layout(title = "March")
map_05 <- map_base + tm_shape(sub_shp_05) + tm_fill(col = "firebrick3") + tm_bubbles("Mortality_rate", col = "black", scale = 2, border.col = "grey20") + tm_text("Mortality_rate", size = 0.7, col = "grey20", ymod = 1.5, xmod = -0.5) + tm_layout(title = "May")
map_08 <- map_base + tm_shape(sub_shp_08) + tm_fill(col = "firebrick3") + tm_bubbles("Mortality_rate", col = "black", scale = 2, border.col = "grey20") + tm_text("Mortality_rate", size = 0.7, col = "grey20", ymod = 1.5, xmod = -0.5) + tm_layout(title = "August")
#use arrange to display all created maps simultaneously
#you can also use 'tm_facets' if your data has a variable by which you want to split it. this will create a map for each different value. with 'tm_facets' it is possible to have one legend shared by all maps
tmap_arrange(map_03, map_05, map_08)