The dataset covers 163 countries and almost 3 full months from 2020 till April 17, which is enough data to get some clues about the pandemic. Let’s see a few plots of the worldwide tendency to see if we can extract some insights:
You can point out different country to see more detailed information about ConfirmedCases
maps<-train%>%
filter(Date==max(Date))%>%group_by(Country_Region)%>%summarize(ConfirmedCases=sum(ConfirmedCases),Fatalities=sum(Fatalities))%>%
mutate(iso3=countrycode(Country_Region,origin='country.name',destination="iso3c"))
highchart() %>%
hc_add_series_map(worldgeojson, maps, value = 'ConfirmedCases', joinBy = 'iso3') %>%
hc_title(text = 'COVID-19 worldwide total confirmedcases in April 17') %>%
hc_colorAxis(minColor = "#3366ff", maxColor = "#ff0000") %>%
hc_tooltip(useHTML = TRUE, headerFormat = "", pointFormat = "{point.Country_Region}: {point.ConfirmedCases} Cases")
Obviously, situation in United States is worst in whole world and some Europe countries like Itality, Spain, United Kingdom are also undergoing worse threat of pandemic.
From the charts, it is clearly the exponential growth of the COVID-19 is still in high speed and current situation is still worse that we can see the slope in the last days of datasets was still high enough. And in the near future, the confirmed cases may be reach 4 million level. And unfortunately, the Fatalities cases are in a rapid increase. The pandemic is still in a worse situation that none of us should relax vigilance.
Since China was the initial infected country, the COVID-19 performance is different from the rest of the world. The medical system was not prepared for the pandemic, in fact no one was aware of the virus until several cases were reported. However, China government took strong and intense measures in a considerable short period of time and, while the virus is widely spread, they have been able to control the situation of the infections. While the terrible situation in China still didn’t warn other countries, some countries missed the best time to control the infections and it resulted in more worse situation in these countries. So we display some plot to compare the COVID-19 situation in different countries which are experiencing larger increase.
country<-function(x){
data<-train%>%
filter(Country_Region==paste0(x,''))%>%
group_by(Date) %>% summarize(ConfirmedCases=sum(ConfirmedCases),Fatalities=sum(Fatalities))%>%mutate(Country_Region=x)
return(data)
}
CN=country('China')
US=country('US')
Italy=country('Italy')
UK=country('United Kingdom')
Spain=country('Spain')
df<-rbind(CN,US,UK,Italy,UK,Spain)
options(scipen = 9)
p<-ggplot(df,aes(x=Date,y=ConfirmedCases,group=Country_Region,color=Country_Region))+
geom_line(aes(linetype=Country_Region))+
ggtitle('Confirmed Cases in different country region tendency over times')
p+geom_point()+transition_reveal(Date)
q<-ggplot(df,aes(x=Date,y=Fatalities,group=Country_Region,color=Country_Region))+
geom_line(aes(linetype=Country_Region))+
ggtitle('Fatalities Cases in different country region tendency over times')
q+geom_point()+transition_reveal(Date)
mapCN<-train%>%
filter(Country_Region=='China'&Date==max(Date))
hcmap(map="countries/cn/custom/cn-all-sar",data=mapCN,
name='Province_State',value='ConfirmedCases',joinBy=c("woe-name","Province_State"),borderColor = "#FAFAFA", borderWidth = 0.1)%>%
hc_colorAxis(mincolor="#FEE5D9",maxcolor="#A50F15",type='logarithmic')%>%
hc_title(text = 'COVID-19 total confirmedcases in China April 17')%>%
hc_tooltip(useHTML = TRUE, headerFormat = "", pointFormat = "{point.Province_State}: {point.ConfirmedCases} Cases")
mapUS<-train%>%
filter(Country_Region=='US'&Date==max(Date))
hcmap(map="countries/us/us-all",data=mapUS,
name='Province_State',value='ConfirmedCases',joinBy=c("woe-name","Province_State"),borderColor = "#FAFAFA", borderWidth = 0.1)%>%
hc_colorAxis(mincolor="#FEE5D9",maxcolor="#A50F15",type='logarithmic')%>%
hc_title(text = 'COVID-19 total confirmedcases in China April 17')%>%
hc_tooltip(useHTML = TRUE, headerFormat = "", pointFormat = "{point.Province_State}: {point.ConfirmedCases} Cases")
mapUS%>%arrange(desc(ConfirmedCases))%>%
top_n(6,ConfirmedCases)%>%gather(Status,Num,-c("Id","Province_State","Country_Region","Date"))%>%
hchart(type='column',hcaes(x='Province_State',y=Num,group="Status"))%>%hc_title(text = "COVID-19 cases in top 6 province state in United States", style = list(fontWeight = "bold"), align = "center")
At the begining, complete the necessary data preprocess
train[which(is.na(train$Province_State)),'Province_State']=train[which(is.na(train$Province_State)),'Country_Region']
test[which(is.na(test$Province_State)),'Province_State']=test[which(is.na(test$Province_State)),'Country_Region']
train$Days<-as.numeric(train$Date-min(train$Date))
test$Days<-as.numeric(test$Date-min(train$Date))
train$regionID<-paste0(train$Country_Region,'-',train$Province_State)
test$regionID<-paste0(test$Country_Region,'-',test$Province_State)
str(train)
#> tibble [27,231 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> $ Id : num [1:27231] 1 2 3 4 5 6 7 8 9 10 ...
#> $ Province_State: chr [1:27231] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
#> $ Country_Region: chr [1:27231] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
#> $ Date : Date[1:27231], format: "2020-01-22" "2020-01-23" ...
#> $ ConfirmedCases: num [1:27231] 0 0 0 0 0 0 0 0 0 0 ...
#> $ Fatalities : num [1:27231] 0 0 0 0 0 0 0 0 0 0 ...
#> $ Days : num [1:27231] 0 1 2 3 4 5 6 7 8 9 ...
#> $ regionID : chr [1:27231] "Afghanistan-Afghanistan" "Afghanistan-Afghanistan" "Afghanistan-Afghanistan" "Afghanistan-Afghanistan" ...
#> - attr(*, "spec")=
#> .. cols(
#> .. Id = col_double(),
#> .. Province_State = col_character(),
#> .. Country_Region = col_character(),
#> .. Date = col_date(format = ""),
#> .. ConfirmedCases = col_double(),
#> .. Fatalities = col_double()
#> .. )
the model is applied and evaluated on worldwide data provided by the Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE) by 04/17.
#forecasting
LoessConfirmedCases<-list()
LoessFatalities<-list()
df<-list()
for (region in unique(train$regionID)) {
traindf<-train[train$regionID==region, ]
rownames(traindf) <- NULL
df[[region]] <- traindf
data<-df[[region]]
LoessConfirmedCases[[region]]<-loess(ConfirmedCases~Days, data=data,span = 0.4,control=loess.control(surface = "direct"))
LoessFatalities[[region]] <- loess(Fatalities ~ Days, data = data, span=0.4,control=loess.control(surface = "direct"))
}
trainpred<-data.frame()
for (region in unique(train$regionID)){
pred<-train[train$regionID==region, ]
pred$predictConfirmedCases<-round(predict(LoessConfirmedCases[[region]],pred),0)
pred$predictConfirmedCases[pred$predictConfirmedCases<0]<-0
pred$predictFatalities<-round(predict(LoessFatalities[[region]],pred),0)
pred$predictFatalities[pred$predictFatalities<0]<-0
trainpred<-rbind(trainpred,pred)
}
trainpred<-trainpred%>%select(Days,regionID,ConfirmedCases,predictConfirmedCases,Fatalities,predictFatalities)
Take US for an example, I want to visualize difference between the raw data and my prediction.
USpred<-trainpred%>%filter(str_detect(regionID, "US"))%>%
group_by(Days)%>%
summarize(ConfirmedCases=sum(ConfirmedCases),predictConfirmedCases=sum(predictConfirmedCases),
Fatalities=sum(Fatalities),predictFatalities=sum(predictFatalities))
USpred%>%
gather(key='measure',value='Num',-Days)%>%
hchart("line",hcaes(x=Days,y=Num,group=measure),color = c("#e5b13a", "#4bd5ee", "#4AA942", "#FAFAFA"))%>%
hc_title(
text = "Predict VS Actual ConfirmedCases/Fatalities in US",
useHTML = TRUE) %>%
hc_tooltip(table = TRUE, sort = TRUE) %>%
hc_add_theme(
hc_theme_flatdark(
chart = list(
backgroundColor = "transparent",
divBackgroundImage = "http://www.wired.com/images_blogs/underwire/2013/02/xwing-bg.gif"
)
)
)%>%
hc_xAxis(min=42)%>%
hc_yAxis(type = "logarithmic")
##Forecasting for later future I apply the model established above to analyze the future in different country and region.
testpred<-data.frame()
for (region in unique(test$regionID)){
predtest<-test[test$regionID==region, ]
predtest$predictConfirmedCases<-round(predict(LoessConfirmedCases[[region]],predtest),0)
predtest$predictConfirmedCases[predtest$predictConfirmedCases<0]<-0
predtest$predictFatalities<-round(predict(LoessFatalities[[region]],predtest),0)
predtest$predictFatalities[predtest$predictFatalities<0]<-0
testpred<-rbind(testpred,predtest)
}
testpred<-testpred%>%
select(ForecastId,Province_State,Country_Region,Date,predictConfirmedCases,predictFatalities)
I also visualized the prediction in US for several weeks.
USpred<-testpred%>%filter(Country_Region=='US')%>%
group_by(Date)%>%
summarise(ConfirmedCases=sum(predictConfirmedCases),Fatalities=sum(predictFatalities))
US<-select(US,-Country_Region)
US_actual_pred<-rbind(US,USpred)
US_actual_pred$ID<-c('predicted')
US_actual_pred$ID[1:87]<-c('Raw')
US_actual_pred%>%
gather(key='measure',value='Num',-Date,-ID)%>%mutate(measureID=paste0(ID,'-',measure))%>%
hchart("line",hcaes(x=Date,y=Num,group=measureID),color = c("#e5b13a", "#4bd5ee", "#4AA942", "#FAFAFA"))%>%
hc_title(
text = "Predict VS Actual ConfirmedCases/Fatalities in US",
useHTML = TRUE) %>%
hc_tooltip(table = TRUE, sort = TRUE) %>%
hc_add_theme(
hc_theme_flatdark(
chart = list(
backgroundColor = "transparent",
divBackgroundImage = "http://www.wired.com/images_blogs/underwire/2013/02/xwing-bg.gif"
)
)
)#%>%
#hc_yAxis(type = "logarithmic")