Project Details

A small exploratory analysis project based on the COVID-19 situation in India for the year 2020.

Loading Required Packages for Analysis

library(COVID19)
## Warning: package 'COVID19' was built under R version 3.6.3
library(ggplot2)
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Fetching Data from COVID19 package for INDIA & DELHI

In the following commands we fetch the data for INDIA & DELHI from the COVID19 package.

# Download the country-level (level = 1) COVID-19 series for India.
IND <- covid19("India", level = 1)
## We have invested a lot of time and effort in creating COVID-19 Data Hub, please cite the following when using it:
## 
##   Guidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open
##   Source Software 5(51):2376, doi: 10.21105/joss.02376.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {COVID-19 Data Hub},
##     year = {2020},
##     doi = {10.21105/joss.02376},
##     author = {Emanuele Guidotti and David Ardia},
##     journal = {Journal of Open Source Software},
##     volume = {5},
##     number = {51},
##     pages = {2376},
##   }
## 
## To retrieve citation and metadata of the data sources see ?covid19cite. To hide this message use 'verbose = FALSE'.
# Report the size of the country-level table (rows, columns).
dim(IND)
## [1] 469  36
# View() opens a GUI data viewer, which is only meaningful in an interactive
# session; skip it when the script is knitted or run through Rscript.
if (interactive()) {
  View(IND)
}

# Download the level = 2 (sub-national) COVID-19 series for India.
IND2 <- covid19("India", level = 2)
## We have invested a lot of time and effort in creating COVID-19 Data Hub, please cite the following when using it:
## 
##   Guidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open
##   Source Software 5(51):2376, doi: 10.21105/joss.02376.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {COVID-19 Data Hub},
##     year = {2020},
##     doi = {10.21105/joss.02376},
##     author = {Emanuele Guidotti and David Ardia},
##     journal = {Journal of Open Source Software},
##     volume = {5},
##     number = {51},
##     pages = {2376},
##   }
## 
## To retrieve citation and metadata of the data sources see ?covid19cite. To hide this message use 'verbose = FALSE'.
# View() opens a GUI data viewer, which is only meaningful in an interactive
# session; skip it when the script is knitted or run through Rscript.
if (interactive()) {
  View(IND2)
}
# Keep only the rows for Delhi. `%in%` never returns NA, so records whose
# area name is missing are dropped instead of becoming all-NA rows, which is
# what logical subsetting with `==` would produce.
DL <- IND2[IND2$administrative_area_level_2 %in% "Delhi", ]

Structuring Data

In the following commands we restructure the data on the basis of months and year.

# Derive the calendar month (as a factor) and the year from each record's date.
IND <- IND %>%
  mutate(Month = factor(month(date)), Year = year(date))
# Show the structure of the new month factor.
str(IND[["Month"]])
##  Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...

Fetching 2020 Data

# Keep the records dated in 2020, then break them into one table per month
# (list elements are named by the month factor's levels).
IND_2020 <- IND[IND[["Year"]] == 2020, ]
IND_MONTH <- split(x = IND_2020, f = IND_2020[["Month"]])

Monthly Data

Using this data we can do analysis on a monthly basis.

# One table per calendar month of 2020, looked up by the month number used as
# the list name. `[[` matches names exactly (no partial matching as with `$`)
# and returns NULL for a month that is not present in the list.
# November and December are included so that all twelve months are covered.
Jan <- IND_MONTH[["1"]]
Feb <- IND_MONTH[["2"]]
Mar <- IND_MONTH[["3"]]
Apr <- IND_MONTH[["4"]]
May <- IND_MONTH[["5"]]
Jun <- IND_MONTH[["6"]]
Jul <- IND_MONTH[["7"]]
Aug <- IND_MONTH[["8"]]
Sep <- IND_MONTH[["9"]]
Oct <- IND_MONTH[["10"]]
Nov <- IND_MONTH[["11"]]
Dec <- IND_MONTH[["12"]]

Overall Analysis (From 1st January 2020 - 14th April 2021)

In this overall analysis we have plotted the following graphs:

  1. Number of Confirmed Cases Per Day.
  2. Number of Deaths Per Day.
  3. Comparison of Deaths Per Day and Confirmed Cases Per Day.
  4. Number of Confirmed Cases Per Month.
  5. Number of Recovered Cases Per Month.
  6. Number of Deaths Per Month.
  7. Number of Tests Per Month.
  8. Delhi Analysis.

1. Number of Confirmed Cases Per Day

Through the following commands we have plotted the number of confirmed cases per day.

# Daily new confirmed cases, derived from the cumulative `confirmed` series.
# diff() gives the change from the previous record, so each increment is
# stored against the day it was reported; the first day has no predecessor
# and is set to 0. This avoids attributing each increment to the preceding
# day and leaving a spurious 0 on the final day.
# Indexing with seq_along() keeps the result empty when the series is empty.
CTperday <- c(0, diff(IND$confirmed))[seq_along(IND$confirmed)]
ct <- data.frame(Days = seq_along(CTperday), NC = CTperday)

# Bar chart of new cases per day, shaded by the daily count.
ggplot(ct, aes(x = Days, y = NC, fill = NC)) +
  geom_bar(stat = "identity") +
  xlab("Days") +
  ylab("No. of Cases Per Day")

2. Number of Deaths Per Day

Through the following commands we have plotted the number of deaths per day.

# Daily new deaths, derived from the cumulative `deaths` series.
# diff() gives the change from the previous record, so each increment is
# stored against the day it was reported; the first day has no predecessor
# and is set to 0. This avoids attributing each increment to the preceding
# day and leaving a spurious 0 on the final day.
# Indexing with seq_along() keeps the result empty when the series is empty.
DTperday <- c(0, diff(IND$deaths))[seq_along(IND$deaths)]
dt <- data.frame(Days = seq_along(DTperday), ND = DTperday)

# Bar chart of deaths per day, shaded by the daily count.
ggplot(dt, aes(x = Days, y = ND, fill = ND)) +
  geom_bar(stat = "identity") +
  xlab("Days") +
  ylab("No. of Deaths Per Day")

3. Comparison of Deaths Per Day and Confirmed Cases Per Day.

# Daily deaths are much smaller than daily cases, so the bars are multiplied
# by this factor to share the primary axis; the secondary axis divides by the
# same factor so that it reads in actual deaths.
death_scale <- 20

ggplot(dt, aes(x = Days, y = death_scale * ND)) +
  geom_bar(stat = "identity") +
  xlab("Days") +
  ylab("No. of Cases Per Day") +
  # The case series comes from a different data frame, so it is supplied via
  # `data` and mapped by column name instead of using `ct$...` inside aes().
  geom_line(data = ct, aes(x = Days, y = NC), size = 1, col = "red") +
  scale_y_continuous(
    sec.axis = sec_axis(~ . / death_scale, name = "No. of Deaths Per Day ")
  )

Month-Wise Analysis

# Increase of a cumulative column during each month.
#   months: list of per-month tables in calendar order (as produced by split()).
#   column: name of a cumulative column, e.g. "confirmed".
# Returns a numeric vector named like `months`.
# The increase is the month's closing value minus the previous month's closing
# value. Subtracting the month's own first value instead would drop whatever
# was reported on the first day of every month.
# Assumes the cumulative count before the first month in the list is 0.
# A month without rows yields NA for itself and for the following month.
monthly_increase <- function(months, column) {
  closing <- vapply(
    months,
    function(rows) {
      values <- rows[[column]]
      if (length(values) == 0) NA_real_ else as.numeric(values[length(values)])
    },
    numeric(1)
  )
  diff(c(0, closing))
}

conf <- monthly_increase(IND_MONTH, "confirmed")
dea <- monthly_increase(IND_MONTH, "deaths")
rec <- monthly_increase(IND_MONTH, "recovered")
test <- monthly_increase(IND_MONTH, "tests")

# Month labels (Jan, Feb, ...) in calendar order, one per month in the list.
Month <- factor(
  month.abb[seq_along(IND_MONTH)],
  levels = month.abb[seq_along(IND_MONTH)]
)
df <- data.frame(
  Month,
  Confirmed_Cases = conf,
  Recovered_Cases = rec,
  Deaths = dea,
  Tests = test
)
# Percentages are left as NA when the denominator is zero, instead of
# reporting Inf/NaN for months with no tests or no confirmed cases.
df <- mutate(
  df,
  Death_Per = ifelse(Confirmed_Cases > 0, Deaths / Confirmed_Cases * 100, NA_real_),
  Positivity_Per = ifelse(Tests > 0, Confirmed_Cases / Tests * 100, NA_real_)
)
# NOTE: re-knit the report to refresh any previously recorded copy of this table.
df
##    Month Confirmed_Cases Recovered_Cases Deaths    Tests Death_Per
## 1    Jan               1               0      0        0  0.000000
## 2    Feb               2               3      0        0  0.000000
## 3    Mar            1632             147     47    27688  2.879902
## 4    Apr           32807            8890   1101   802513  3.355991
## 5    May          153387           81841   4175  2834373  2.721873
## 6    Jun          387423          252096  11804  4771447  3.046799
## 7    Jul         1091844          735634  18717 10006385  1.714256
## 8    Aug         1935768         1690464  28025 22949255  1.447746
## 9    Sep         2544155         2370478  31785 30871895  1.249334
## 10   Oct         1791257         2140702  22338 33176283  1.247057
## 11   Nov         1233933         1345690  15018 30492673  1.217084
## 12   Dec          786582          949929  10858 30699976  1.380403
##    Positivity_Per
## 1             Inf
## 2             Inf
## 3        5.894250
## 4        4.088033
## 5        5.411673
## 6        8.119612
## 7       10.911473
## 8        8.434993
## 9        8.241007
## 10       5.399209
## 11       4.046654
## 12       2.562158

4. Number of Confirmed Cases Per Month

Through the following commands we have plotted the breakdown of number of confirmed cases per month.

  # Monthly confirmed cases as coloured bars, each annotated with its value.
  ggplot(
    data = df,
    mapping = aes(
      x = Month, y = Confirmed_Cases, label = Confirmed_Cases, fill = Month
    )
  ) +
    geom_col() +
    labs(x = "Month", y = "Confirmed Cases Per Month") +
    geom_label(colour = "white")

5. Number of Recovered Cases Per Month

Through the following commands we have plotted the breakdown of number of recovered cases per month.

  # Monthly recovered cases as coloured bars, each annotated with its value.
  ggplot(
    data = df,
    mapping = aes(
      x = Month, y = Recovered_Cases, label = Recovered_Cases, fill = Month
    )
  ) +
    geom_col() +
    labs(x = "Month", y = "Recovered Cases Per Month") +
    geom_label(colour = "white")

6. Number of Deaths Per Month

Through the following commands we have plotted the breakdown of number of deaths per month.

  # Monthly deaths as coloured bars, each annotated with its value.
  ggplot(
    data = df,
    mapping = aes(x = Month, y = Deaths, label = Deaths, fill = Month)
  ) +
    geom_col() +
    labs(x = "Month", y = "Deaths Per Month") +
    geom_label(colour = "white")

7. Number of Tests Per Month

Through the following commands we have plotted the breakdown of number of tests per month.

  # Monthly tests as coloured bars, each annotated with its value.
  # The aesthetics map the `Tests` column of `df`. The bare name `test` is not
  # a column of `df` and only resolved through a same-named global vector.
  ggplot(df, aes(x = Month, y = Tests, label = Tests, fill = Month)) +
    geom_bar(stat = "identity") +
    xlab("Month") +
    ylab("Tests Per Month") +
    geom_label(color = "white")

8. Analyzing Delhi

# Derive the calendar month (as a factor) and the year from each record's date.
DL <- mutate(DL, Month = factor(month(date)), Year = year(date))
# View() opens a GUI data viewer, which is only meaningful in an interactive
# session; skip it when the script is knitted or run through Rscript.
if (interactive()) {
  View(DL)
}

Overall Analysis (From 1st January 2020 - 14th April 2021)

# Daily new confirmed cases in Delhi, derived from the cumulative series.
# diff() gives the change from the previous record, so each increment is
# stored against the day it was reported; the first day has no predecessor
# and is set to 0. This avoids attributing each increment to the preceding
# day and leaving a spurious 0 on the final day.
# Indexing with seq_along() keeps the result empty when the series is empty.
DLCperday <- c(0, diff(DL$confirmed))[seq_along(DL$confirmed)]
dc <- data.frame(Days = seq_along(DLCperday), No_Cases = DLCperday)

# Bar chart of new cases per day, shaded by the daily count.
ggplot(dc, aes(Days, No_Cases, fill = No_Cases)) +
  geom_bar(stat = "identity") +
  xlab("Days") +
  ylab("No. of Cases Per Day")

# Daily new deaths in Delhi, derived from the cumulative `deaths` series.
# diff() gives the change from the previous record, so each increment is
# stored against the day it was reported; the first day has no predecessor
# and is set to 0. This avoids attributing each increment to the preceding
# day and leaving a spurious 0 on the final day.
# Indexing with seq_along() keeps the result empty when the series is empty.
DLDperday <- c(0, diff(DL$deaths))[seq_along(DL$deaths)]
dc <- data.frame(Days = seq_along(DLDperday), No_Deaths = DLDperday)

# Bar chart of deaths per day, shaded by the daily count.
ggplot(dc, aes(Days, No_Deaths, fill = No_Deaths)) +
  geom_bar(stat = "identity") +
  xlab("Days") +
  ylab("No. of Deaths Per Day")

# Keep Delhi's records dated in 2020, then break them into one table per month
# (list elements are named by the month factor's levels).
DL_2020 <- DL[DL[["Year"]] == 2020, ]
DL_Month <- split(x = DL_2020, f = DL_2020[["Month"]])

# Increase of a cumulative column during each month.
#   months: list of per-month tables in calendar order (as produced by split()).
#   column: name of a cumulative column, e.g. "confirmed".
# Returns a numeric vector named like `months`.
# The increase is the month's closing value minus the previous month's closing
# value. Subtracting the month's own first value instead would drop whatever
# was reported on the first day of every month.
# Assumes the cumulative count before the first month in the list is 0.
# A month without rows yields NA for itself and for the following month.
monthly_increase <- function(months, column) {
  closing <- vapply(
    months,
    function(rows) {
      values <- rows[[column]]
      if (length(values) == 0) NA_real_ else as.numeric(values[length(values)])
    },
    numeric(1)
  )
  diff(c(0, closing))
}

dlconf <- monthly_increase(DL_Month, "confirmed")
dldea <- monthly_increase(DL_Month, "deaths")
dlrec <- monthly_increase(DL_Month, "recovered")

# Month labels are built from Delhi's own list of months, so the table does
# not depend on a `Month` vector prepared for a different data set.
dl_month_labels <- factor(
  month.abb[seq_along(DL_Month)],
  levels = month.abb[seq_along(DL_Month)]
)
dldf <- data.frame(
  Month = dl_month_labels,
  Confirmed_Cases = dlconf,
  Recovered_Cases = dlrec,
  Deaths = dldea
)
# NOTE: re-knit the report to refresh any previously recorded copy of this table.
dldf
##    Month Confirmed_Cases Recovered_Cases Deaths
## 1    Jan               0               0      0
## 2    Feb               0               0      0
## 3    Mar             120               6      2
## 4    Apr            3363            1088     57
## 5    May           16106            7311    412
## 6    Jun           66526           49602   2219
## 7    Jul           45796           60938   1160
## 8    Aug           38032           33547    455
## 9    Sep          102655           90718    899
## 10   Oct          103954           96863   1110
## 11   Nov          178004          176680   2612
## 12   Dec           50989           75971   1276
# Delhi's monthly confirmed cases as coloured bars, each annotated with its value.
ggplot(
  data = dldf,
  mapping = aes(
    x = Month, y = Confirmed_Cases, label = Confirmed_Cases, fill = Month
  )
) +
  geom_col() +
  labs(x = "Month", y = "Confirmed Cases Per Month in Delhi") +
  geom_label(colour = "white")

# Delhi's monthly deaths as coloured bars, each annotated with its value.
ggplot(
  data = dldf,
  mapping = aes(x = Month, y = Deaths, label = Deaths, fill = Month)
) +
  geom_col() +
  labs(x = "Month", y = "Deaths Per Month") +
  geom_label(colour = "white")

Summary

  1. There is a stark contrast between the ‘No. of Cases Per Day’ graphs. In India’s graph, the first peak came around the 250th day (September) after the first case, and the nation was heading towards its second peak around the 400th day (April). Delhi, by contrast, was in the middle of its second wave around the 250th day (September) and was heading towards its fourth peak by the 400th day (April).

  2. Also, the peak of the second wave is far higher than that of the first.