A small exploratory analysis project based on the COVID-19 situation in India for the year 2020.
library(COVID19)
## Warning: package 'COVID19' was built under R version 3.6.3
library(ggplot2)
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
In the following commands we fetch the data for INDIA & DELHI from the COVID19 package.
# National (level 1) COVID-19 time series for India from the COVID19 package.
IND <- covid19("India", level = 1)
## We have invested a lot of time and effort in creating COVID-19 Data Hub, please cite the following when using it:
##
## Guidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open
## Source Software 5(51):2376, doi: 10.21105/joss.02376.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {COVID-19 Data Hub},
## year = {2020},
## doi = {10.21105/joss.02376},
## author = {Emanuele Guidotti and David Ardia},
## journal = {Journal of Open Source Software},
## volume = {5},
## number = {51},
## pages = {2376},
## }
##
## To retrieve citation and metadata of the data sources see ?covid19cite. To hide this message use 'verbose = FALSE'.
# Size of the national data set (rows x columns).
dim(IND)
## [1] 469 36
# View() opens a spreadsheet-style viewer window, which is only useful (and
# only reliably available) in an interactive session.
if (interactive()) {
  View(IND)
}
# Same request at level 2, i.e. one series per administrative area of India.
IND2 <- covid19("India", level = 2)
## We have invested a lot of time and effort in creating COVID-19 Data Hub, please cite the following when using it:
##
## Guidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open
## Source Software 5(51):2376, doi: 10.21105/joss.02376.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {COVID-19 Data Hub},
## year = {2020},
## doi = {10.21105/joss.02376},
## author = {Emanuele Guidotti and David Ardia},
## journal = {Journal of Open Source Software},
## volume = {5},
## number = {51},
## pages = {2376},
## }
##
## To retrieve citation and metadata of the data sources see ?covid19cite. To hide this message use 'verbose = FALSE'.
# View() opens a spreadsheet-style viewer window, which is only useful (and
# only reliably available) in an interactive session.
if (interactive()) {
  View(IND2)
}
# Keep only the Delhi records. which() drops NA comparisons, so records with a
# missing area name cannot leak in as all-NA rows.
DL <- IND2[which(IND2$administrative_area_level_2 == "Delhi"), ]
In the following commands we restructure the data on the basis of months and year.
# Tag every record with its calendar month (as a factor) and its year.
IND <- IND %>%
  mutate(Month = factor(month(date)), Year = year(date))
str(IND$Month)
## Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
# Restrict to the 2020 records, then break them into one data frame per month.
IND_2020 <- IND[IND$Year == 2020, ]
IND_MONTH <- split(IND_2020, IND_2020$Month)
Using this data we can do analysis on the Monthly basis.
# One data frame per calendar month of 2020, taken from the monthly split.
# `[[` with the month number as a string performs an exact-name lookup.
Jan <- IND_MONTH[["1"]]
Feb <- IND_MONTH[["2"]]
Mar <- IND_MONTH[["3"]]
Apr <- IND_MONTH[["4"]]
May <- IND_MONTH[["5"]]
Jun <- IND_MONTH[["6"]]
Jul <- IND_MONTH[["7"]]
Aug <- IND_MONTH[["8"]]
Sep <- IND_MONTH[["9"]]
Oct <- IND_MONTH[["10"]]
# The Month factor has 12 levels, so November and December are available too.
Nov <- IND_MONTH[["11"]]
Dec <- IND_MONTH[["12"]]
In this overall analysis we have plotted the following graphs:
Through the following commands we have plotted the number of confirmed cases per day.
# Day-over-day change of the running total in `confirmed`.
# Entry i holds confirmed[i + 1] - confirmed[i]; the last record has no
# successor, so a trailing 0 keeps one value per record.
CTperday <- c(diff(IND$confirmed), 0)
ct <- data.frame(Days = seq_along(IND$confirmed), NC = CTperday)
# Bar per record, coloured by the size of the daily increase.
ggplot(ct, aes(x = Days, y = NC, fill = NC)) +
  geom_col() +
  labs(x = "Days", y = "No. of Cases Per Day")
Through the following commands we have plotted the number of deaths per day.
# Day-over-day change of the running total in `deaths`.
# Entry i holds deaths[i + 1] - deaths[i]; the last record has no successor,
# so a trailing 0 keeps one value per record.
DTperday <- c(diff(IND$deaths), 0)
dt <- data.frame(Days = seq_along(IND$deaths), ND = DTperday)
# Bar per record, coloured by the size of the daily increase.
ggplot(dt, aes(x = Days, y = ND, fill = ND)) +
  geom_col() +
  labs(x = "Days", y = "No. of Deaths Per Day")
# Overlay of daily deaths (bars) and daily confirmed cases (red line).
# Deaths are multiplied by `death_scale` so both series fit the primary axis;
# the secondary axis divides by the same factor to read off the death counts.
death_scale <- 20
ggplot(dt, aes(x = Days, y = death_scale * ND)) +
  geom_col() +
  xlab("Days") +
  ylab("No. of Cases Per Day") +
  # The line layer gets its own data frame instead of `ct$...` inside aes(),
  # so the mapping no longer relies on `ct` and `dt` having matching rows.
  geom_line(data = ct, aes(x = Days, y = NC), size = 1, col = "red") +
  scale_y_continuous(
    sec.axis = sec_axis(~ . / death_scale, name = "No. of Deaths Per Day ")
  )
# Net change of a running-total column inside each month of 2020: the last
# recorded value of the month minus its first recorded value.
# NOTE(review): the increase reported on the first day of a month (relative to
# the last day of the previous month) is not counted - confirm this is intended.
monthly_change <- function(column) {
  # vapply() guarantees one numeric value per month (sapply() would silently
  # return a list if a month had no records).
  vapply(
    IND_MONTH,
    function(month_data) {
      values <- month_data[[column]]
      values[length(values)] - values[1]
    },
    numeric(1)
  )
}
conf <- monthly_change("confirmed")
dea <- monthly_change("deaths")
rec <- monthly_change("recovered")
test <- monthly_change("tests")
# Month labels are derived from the names of the split (the month numbers), so
# they stay correct even if the first month present is not January.
month_labels <- month.abb[as.integer(names(IND_MONTH))]
Month <- factor(month_labels, levels = month_labels)
df <- data.frame(
  Month,
  Confirmed_Cases = conf,
  Recovered_Cases = rec,
  Deaths = dea,
  Tests = test
)
# Death_Per: deaths as a percentage of confirmed cases.
# Positivity_Per: confirmed cases as a percentage of tests; a month with zero
# recorded tests divides by zero and is reported as Inf.
df <- mutate(
  df,
  Death_Per = (Deaths / Confirmed_Cases) * 100,
  Positivity_Per = (Confirmed_Cases / Tests) * 100
)
df
## Month Confirmed_Cases Recovered_Cases Deaths Tests Death_Per
## 1 Jan 1 0 0 0 0.000000
## 2 Feb 2 3 0 0 0.000000
## 3 Mar 1632 147 47 27688 2.879902
## 4 Apr 32807 8890 1101 802513 3.355991
## 5 May 153387 81841 4175 2834373 2.721873
## 6 Jun 387423 252096 11804 4771447 3.046799
## 7 Jul 1091844 735634 18717 10006385 1.714256
## 8 Aug 1935768 1690464 28025 22949255 1.447746
## 9 Sep 2544155 2370478 31785 30871895 1.249334
## 10 Oct 1791257 2140702 22338 33176283 1.247057
## 11 Nov 1233933 1345690 15018 30492673 1.217084
## 12 Dec 786582 949929 10858 30699976 1.380403
## Positivity_Per
## 1 Inf
## 2 Inf
## 3 5.894250
## 4 4.088033
## 5 5.411673
## 6 8.119612
## 7 10.911473
## 8 8.434993
## 9 8.241007
## 10 5.399209
## 11 4.046654
## 12 2.562158
Through the following commands we have plotted the breakdown of number of confirmed cases per month.
# Monthly confirmed-case totals: one bar per month with its value as a label.
ggplot(df, aes(x = Month, y = Confirmed_Cases, fill = Month)) +
  geom_col() +
  geom_label(aes(label = Confirmed_Cases), color = "white") +
  labs(x = "Month", y = "Confirmed Cases Per Month")
Through the following commands we have plotted the breakdown of number of recovered cases per month.
# Monthly recovered-case totals: one bar per month with its value as a label.
ggplot(df, aes(x = Month, y = Recovered_Cases, fill = Month)) +
  geom_col() +
  geom_label(aes(label = Recovered_Cases), color = "white") +
  labs(x = "Month", y = "Recovered Cases Per Month")
Through the following commands we have plotted the breakdown of number of deaths per month.
# Monthly death totals: one bar per month with its value as a label.
ggplot(df, aes(x = Month, y = Deaths, fill = Month)) +
  geom_col() +
  geom_label(aes(label = Deaths), color = "white") +
  labs(x = "Month", y = "Deaths Per Month")
Through the following commands we have plotted the breakdown of number of tests per month.
# Monthly test totals: one bar per month with its value as a label.
# The `Tests` column of `df` is mapped (not the free-standing `test` vector),
# so the plot depends only on the data frame it is given.
ggplot(df, aes(x = Month, y = Tests, fill = Month)) +
  geom_col() +
  geom_label(aes(label = Tests), color = "white") +
  labs(x = "Month", y = "Tests Per Month")
# Tag each Delhi record with its calendar month (as a factor) and its year.
DL <- mutate(DL, Month = factor(month(date)), Year = year(date))
# View() opens a spreadsheet-style viewer window, which is only useful (and
# only reliably available) in an interactive session.
if (interactive()) {
  View(DL)
}
# Day-over-day change of Delhi's running total in `confirmed`.
# Entry i holds confirmed[i + 1] - confirmed[i]; the last record has no
# successor, so a trailing 0 keeps one value per record.
DLCperday <- c(diff(DL$confirmed), 0)
dc <- data.frame(Days = seq_along(DL$confirmed), No_Cases = DLCperday)
ggplot(dc, aes(x = Days, y = No_Cases, fill = No_Cases)) +
  geom_col() +
  labs(x = "Days", y = "No. of Cases Per Day")
# Day-over-day change of Delhi's running total in `deaths`.
# Entry i holds deaths[i + 1] - deaths[i]; the last record has no successor,
# so a trailing 0 keeps one value per record.
DLDperday <- c(diff(DL$deaths), 0)
# Note: `dc` is reassigned here and now holds the deaths series.
dc <- data.frame(Days = seq_along(DL$deaths), No_Deaths = DLDperday)
ggplot(dc, aes(x = Days, y = No_Deaths, fill = No_Deaths)) +
  geom_col() +
  labs(x = "Days", y = "No. of Deaths Per Day")
# Restrict the Delhi records to 2020 and break them into one data frame per
# month.
DL_2020 <- DL[DL$Year == 2020, ]
DL_Month <- split(DL_2020, DL_2020$Month)
# Net change of a running-total column inside each month: the last recorded
# value of the month minus its first recorded value.
# NOTE(review): the increase reported on the first day of a month (relative to
# the last day of the previous month) is not counted - confirm this is intended.
delhi_monthly_change <- function(column) {
  # vapply() guarantees one numeric value per month (sapply() would silently
  # return a list if a month had no records).
  vapply(
    DL_Month,
    function(month_data) {
      values <- month_data[[column]]
      values[length(values)] - values[1]
    },
    numeric(1)
  )
}
dlconf <- delhi_monthly_change("confirmed")
dldea <- delhi_monthly_change("deaths")
dlrec <- delhi_monthly_change("recovered")
# Month labels are taken from Delhi's own split, so the table stays consistent
# even if Delhi's months differ from those in the national data.
dl_month_labels <- month.abb[as.integer(names(DL_Month))]
dldf <- data.frame(
  Month = factor(dl_month_labels, levels = dl_month_labels),
  Confirmed_Cases = dlconf,
  Recovered_Cases = dlrec,
  Deaths = dldea
)
dldf
## Month Confirmed_Cases Recovered_Cases Deaths
## 1 Jan 0 0 0
## 2 Feb 0 0 0
## 3 Mar 120 6 2
## 4 Apr 3363 1088 57
## 5 May 16106 7311 412
## 6 Jun 66526 49602 2219
## 7 Jul 45796 60938 1160
## 8 Aug 38032 33547 455
## 9 Sep 102655 90718 899
## 10 Oct 103954 96863 1110
## 11 Nov 178004 176680 2612
## 12 Dec 50989 75971 1276
# Delhi's monthly confirmed-case totals: one labelled bar per month.
ggplot(dldf, aes(x = Month, y = Confirmed_Cases, fill = Month)) +
  geom_col() +
  geom_label(aes(label = Confirmed_Cases), color = "white") +
  labs(x = "Month", y = "Confirmed Cases Per Month in Delhi")
# Delhi's monthly death totals: one labelled bar per month.
# The axis title names Delhi so this figure cannot be confused with the
# national "Deaths Per Month" chart.
ggplot(dldf, aes(x = Month, y = Deaths, fill = Month)) +
  geom_col() +
  geom_label(aes(label = Deaths), color = "white") +
  labs(x = "Month", y = "Deaths Per Month in Delhi")
There is a stark contrast between the two ‘No. of Cases Per Day’ graphs. India’s graph shows that the first national peak arrived around the 250th day (September) after the onset of the first case, and that the nation was heading towards its second peak around the 400th day (April). Delhi, by contrast, was already in the middle of its second wave around the 250th day (September), and by the 400th day (April) it was heading towards its fourth peak.
Also, the peak of the second wave is far higher than that of the first.