set working directory

Load relevant pacakges for data Cleadning

library(tidyverse,warn.conflicts=F, quietly=T)
library(rstan,warn.conflicts=F, quietly=T)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

Read datasets

#these libraries need to be loaded
library(utils)
#read the Dataset sheet into “R”. 
#The dataset will be called "covidata".
covidata <- read.csv("https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/csv", na.strings = "", fileEncoding = "UTF-8-BOM")   #takes some time!
#read the Dataset sheet of countries' responses into “R”. 
#The dataset will be called "resp".
resp <- read.csv("Code Build/response_graphs_data_2022-08-25.csv")

Data Cleaning

Step 1: set as date

covidata$dateRep=as.Date(covidata$dateRep,'%d/%m/%Y')
resp$date_start=as.Date(resp$date_start,"%Y/%m/%d")
resp$date_end=as.Date(resp$date_end,"%Y/%m/%d",optional=TRUE)
resp=na.omit(resp)

Step 2:duplicate response measures by day

respbydate<-resp %>% 
    rowwise() %>% 
    do(data.frame(DATE=seq(.$date_start, .$date_end, by='day'),
       Response_measure= rep(.$Response_measure, .$date_start-.$date_start+1),
       Country=rep(.$Country, .$date_start-.$date_start+1)))   
respbydate$ID=paste(respbydate$Country,respbydate$DATE)
respbydate<-respbydate%>%spread(Response_measure, 1)

Trouble-shooting: if the reviewer got error when running these lines, please check the reading process of the data. We found that using xlsx downloaded from Canvas or using “csv2” package may failed to run these lines

Data Merge

covidata$ID=paste(covidata$countriesAndTerritories,covidata$dateRep)
mgdata<-merge(covidata,respbydate,by.x="ID",by.y="ID",all=TRUE)
mgdata_<-apply(mgdata[,c(14:79)], 2, function(x){
  x[!is.na(x)]=1;x
})
mgdata_<-apply(mgdata_,2, function(x){
  x[is.na(x)]=0;x
})
mgdata=cbind (mgdata[,c(1:13)],mgdata_)
mgdata$Country <- ifelse(is.na(mgdata$Country), mgdata$countriesAndTerritories,mgdata$Country)

Categorize responses

for (x in seq(14,79,by=1)){
  mgdata[,x]=as.numeric(mgdata[,x])
}
grep("Work|work",colnames(mgdata))
mgdata$wk <- rowSums(mgdata[,c(14, 76, 78)])
grep("Mask",colnames(mgdata))
mgdata$mask <- rowSums(mgdata[,c(40 ,41 ,42, 43)])
grep("Quarantine|Hotel",colnames(mgdata))
mgdata$trav <- rowSums(mgdata[,c(34, 35, 62 ,63)])
grep("StayHome",colnames(mgdata))
mgdata$home <- rowSums(mgdata[,c(64, 65, 70, 71, 72, 73, 74,75)])
mgdata$home<-ifelse(mgdata$home!=0,1,mgdata$home)
grep("Ban|Mass|Indoor|Outdoor",colnames(mgdata))
mgdata$pbg <- rowSums(mgdata[,c(16, 36, 38, 48, 50,54,  56)])
mgdata$pbg<-ifelse(mgdata$pbg!=0,1,mgdata$pbg)
grep("Private|SocialCircle",colnames(mgdata))
mgdata$prg <- rowSums(mgdata[,c(60, 61, 68 ,69)])
grep("^Clos",colnames(mgdata))
mgdata$sch <- rowSums(mgdata[,c(18, 20, 22, 24, 26)])
grep("PublicTransport",colnames(mgdata))
mgdata$pbtf <- rowSums(mgdata[,c(28,29)])
grep("Entertainment|Gyms|Shops|Worship|Cafes",colnames(mgdata))
mgdata$cps <- rowSums(mgdata[,c(30, 32,52, 58, 66)])
mgdata$pysum <- rowSums(mgdata[,c(80:88)])

Data Cleaning

Step 3-Delete outliers

library(dplyr) 
mgdata<-filter(mgdata, mgdata$cases>=0)
mgdata<-filter(mgdata, mgdata$deaths>=0)

Step 4-Clear our environment

remove(mgdata_)
remove(covidata)
remove(resp)
remove(respbydate)
gc() #Free unused memory

##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1498523 80.1    4193250 224.0  3036041 162.2
## Vcells 5422381 41.4   23420751 178.7 23408072 178.6

Visuaslization

ggplot(data=mgdata,aes(x = dateRep, y = cases, color=Country))+geom_line()+labs(title = "Newly Reported COVID-19 cases in Europe by day",
       x = "Date of Report",
       y = "Country")+theme(text = element_text(size = 20))

mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=pysum))+
  scale_colour_gradient(low = "light blue", high = "red")+labs(title = "Number of Policies implemented in Europe by day",
       x = "Date",
       y = "Country")+theme(text = element_text(size = 20))

We also use these code to view the distribution of policies in countries over time. We adjusted our deininations of policies after failing to see enough differences of policies adopted in countries at the same time period.

wktl<-mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=wk))+
  scale_colour_gradient(low = "light blue", high = "blue", guide = "none")
masktl<-mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=mask))+
  scale_colour_gradient(low = "light blue", high = "blue", guide = "none")
travtl<-mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=trav))+
  scale_colour_gradient(low = "light blue", high = "blue", guide = "none")
hometl<-mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=home))+
  scale_colour_gradient(low = "light blue", high = "blue", guide = "none")
pbgtl<-mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=pbg))+
  scale_colour_gradient(low = "light blue", high = "blue", guide = "none")
prgtl<-mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=prg))+
  scale_colour_gradient(low = "light blue", high = "blue", guide = "none")
schtl<-mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=sch))+
  scale_colour_gradient(low = "light blue", high = "blue", guide = "none")
pbtftl<-mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=pbtf))+
  scale_colour_gradient(low = "light blue", high = "blue", guide = "none")
cpstl<-mgdata%>%ggplot()+geom_line(aes(x = dateRep, y = Country, color=cps))+
  scale_colour_gradient(low = "light blue", high = "blue", guide = "none")

GLM Analysis

options("scipen"=100, "digits"=4)   #remove scientific notation
#Package required"glm"
mgdata$cases_lead=lead(mgdata$cases,7L)
mgdata$deaths_lead=lead(mgdata$deaths,7L)
fm1<-cases_lead~ wk+mask+trav+home+pbg+prg+sch+pbtf+cps + popData2020
fm2<-cases_lead~ wk+mask+trav+home+pbg+prg+sch+pbtf+cps + log(popData2020)
glm1 <- glm(fm1, data = na.omit(mgdata), family = poisson(link="log"))
glm2 <- glm(fm2, data = na.omit(mgdata), family = poisson(link="log"))
summary(glm1)

## 
## Call:
## glm(formula = fm1, family = poisson(link = "log"), data = na.omit(mgdata))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -410.4   -64.7   -42.1    -6.2  1698.8  
## 
## Coefficients:
##                      Estimate        Std. Error z value            Pr(>|z|)    
## (Intercept)  7.68827375765606  0.00018857203887   40771 <0.0000000000000002 ***
## wk          -0.03431230805708  0.00015086790492    -227 <0.0000000000000002 ***
## mask         0.43724186060443  0.00021319159810    2051 <0.0000000000000002 ***
## trav        -0.24506386580498  0.00014537633111   -1686 <0.0000000000000002 ***
## home         0.07420345938909  0.00018831020599     394 <0.0000000000000002 ***
## pbg         -0.58467124593211  0.00019397380998   -3014 <0.0000000000000002 ***
## prg          0.28982071006519  0.00018342127432    1580 <0.0000000000000002 ***
## sch         -0.25717285466034  0.00013270283216   -1938 <0.0000000000000002 ***
## pbtf         0.15481072938831  0.00023305461706     664 <0.0000000000000002 ***
## cps         -0.05859722754361  0.00010185468799    -575 <0.0000000000000002 ***
## popData2020  0.00000003737789  0.00000000000254   14690 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 594917927  on 28305  degrees of freedom
## Residual deviance: 318927391  on 28295  degrees of freedom
## AIC: 319154561
## 
## Number of Fisher Scoring iterations: 6

summary(glm2)  #with smaller AIC, adopted

## 
## Call:
## glm(formula = fm2, family = poisson(link = "log"), data = na.omit(mgdata))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -336.5   -60.1   -23.8     3.3  1581.3  
## 
## Coefficients:
##                    Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)      -7.5194546  0.0013252   -5674 <0.0000000000000002 ***
## wk               -0.2106104  0.0001451   -1452 <0.0000000000000002 ***
## mask              0.3952246  0.0002184    1810 <0.0000000000000002 ***
## trav             -0.1203145  0.0001528    -787 <0.0000000000000002 ***
## home             -0.0928037  0.0001914    -485 <0.0000000000000002 ***
## pbg              -0.6001896  0.0001994   -3010 <0.0000000000000002 ***
## prg               0.3095710  0.0001814    1707 <0.0000000000000002 ***
## sch              -0.2718481  0.0001294   -2101 <0.0000000000000002 ***
## pbtf              0.3264349  0.0002292    1425 <0.0000000000000002 ***
## cps              -0.0253386  0.0001052    -241 <0.0000000000000002 ***
## log(popData2020)  0.9864175  0.0000769   12826 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 594917927  on 28305  degrees of freedom
## Residual deviance: 293368429  on 28295  degrees of freedom
## AIC: 293595599
## 
## Number of Fisher Scoring iterations: 6

anova(glm1,glm2,test="Chisq")  #Cannot understand the result

fm3<-deaths_lead~ wk+mask+trav+home+pbg+prg+sch+pbtf+cps + log(popData2020)
glm3 <- glm(fm3, data = na.omit(mgdata), family = poisson(link="log"))
summary(glm3)

## 
## Call:
## glm(formula = fm3, family = poisson(link = "log"), data = na.omit(mgdata))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -28.1    -4.7    -1.9     0.8   388.6  
## 
## Coefficients:
##                    Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)      -12.209270   0.016772  -727.9 <0.0000000000000002 ***
## wk                 0.025921   0.001323    19.6 <0.0000000000000002 ***
## mask               0.409350   0.002685   152.4 <0.0000000000000002 ***
## trav               0.032495   0.001502    21.6 <0.0000000000000002 ***
## home               0.265870   0.002438   109.1 <0.0000000000000002 ***
## pbg               -0.077288   0.002380   -32.5 <0.0000000000000002 ***
## prg                0.183327   0.002082    88.1 <0.0000000000000002 ***
## sch               -0.014353   0.000864   -16.6 <0.0000000000000002 ***
## pbtf               0.145629   0.002491    58.5 <0.0000000000000002 ***
## cps                0.274627   0.000862   318.4 <0.0000000000000002 ***
## log(popData2020)   0.914457   0.000973   940.0 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3334090  on 28305  degrees of freedom
## Residual deviance: 1453286  on 28295  degrees of freedom
## AIC: 1547197
## 
## Number of Fisher Scoring iterations: 6

EPH-505 CodeBuild#2 Inferring the effectiveness of government interventions against COVID-19

Chenyi Huang, Linshuoshuo Lyu

2022-11-1

set working directory

Load relevant pacakges for data Cleadning

Read datasets

Data Cleaning

Step 1: set as date

Step 2:duplicate response measures by day

Data Merge

Categorize responses

Data Cleaning

Step 3-Delete outliers

Step 4-Clear our environment

Visuaslization

GLM Analysis