Coronavirus disease 2019 (COVID-19) is a contagious disease caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The first known case was identified in Wuhan, China, in December 2019. The disease has since spread worldwide, leading to an ongoing pandemic.

Symptoms of COVID-19 are variable, but often include fever, cough, headache, fatigue, breathing difficulties, and loss of smell and taste. Symptoms may begin one to fourteen days after exposure to the virus. At least a third of people who are infected do not develop noticeable symptoms.

Let's study and visualize the COVID-19 dataset.

This data is sourced from governments and from national and subnational agencies across the world — a full list of data sources for each country is published on the Johns Hopkins GitHub site.

Load the required packages

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.4
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Import the dataset

# Read the OWID COVID-19 export from the working directory.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned; `stringsAsFactors = FALSE` keeps text columns as character
# even on R versions older than 4.0.
owid_dataset <- read.csv(
  "owid-covid-data.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Print a compact column-by-column overview of the imported data
glimpse(owid_dataset)
## Rows: 105,010
## Columns: 60
## $ iso_code                              <chr> "AFG", "AFG", "AFG", "AFG", "...
## $ continent                             <chr> "Asia", "Asia", "Asia", "Asia...
## $ location                              <chr> "Afghanistan", "Afghanistan",...
## $ date                                  <chr> "2020-02-24", "2020-02-25", "...
## $ total_cases                           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 4,...
## $ new_cases                             <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 2,...
## $ new_cases_smoothed                    <dbl> NA, NA, NA, NA, NA, 0.143, 0....
## $ total_deaths                          <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_deaths                            <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_deaths_smoothed                   <dbl> NA, NA, NA, NA, NA, 0, 0, 0, ...
## $ total_cases_per_million               <dbl> 0.026, 0.026, 0.026, 0.026, 0...
## $ new_cases_per_million                 <dbl> 0.026, 0.000, 0.000, 0.000, 0...
## $ new_cases_smoothed_per_million        <dbl> NA, NA, NA, NA, NA, 0.004, 0....
## $ total_deaths_per_million              <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_deaths_per_million                <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_deaths_smoothed_per_million       <dbl> NA, NA, NA, NA, NA, 0, 0, 0, ...
## $ reproduction_rate                     <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ icu_patients                          <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ icu_patients_per_million              <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ hosp_patients                         <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ hosp_patients_per_million             <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ weekly_icu_admissions                 <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ weekly_icu_admissions_per_million     <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ weekly_hosp_admissions                <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ weekly_hosp_admissions_per_million    <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_tests                             <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ total_tests                           <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ total_tests_per_thousand              <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_tests_per_thousand                <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_tests_smoothed                    <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_tests_smoothed_per_thousand       <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ positive_rate                         <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ tests_per_case                        <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ tests_units                           <chr> "", "", "", "", "", "", "", "...
## $ total_vaccinations                    <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ people_vaccinated                     <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ people_fully_vaccinated               <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_vaccinations                      <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_vaccinations_smoothed             <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ total_vaccinations_per_hundred        <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ people_vaccinated_per_hundred         <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ people_fully_vaccinated_per_hundred   <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_vaccinations_smoothed_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ stringency_index                      <dbl> 8.33, 8.33, 8.33, 8.33, 8.33,...
## $ population                            <dbl> 38928341, 38928341, 38928341,...
## $ population_density                    <dbl> 54.422, 54.422, 54.422, 54.42...
## $ median_age                            <dbl> 18.6, 18.6, 18.6, 18.6, 18.6,...
## $ aged_65_older                         <dbl> 2.581, 2.581, 2.581, 2.581, 2...
## $ aged_70_older                         <dbl> 1.337, 1.337, 1.337, 1.337, 1...
## $ gdp_per_capita                        <dbl> 1803.987, 1803.987, 1803.987,...
## $ extreme_poverty                       <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ cardiovasc_death_rate                 <dbl> 597.029, 597.029, 597.029, 59...
## $ diabetes_prevalence                   <dbl> 9.59, 9.59, 9.59, 9.59, 9.59,...
## $ female_smokers                        <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ male_smokers                          <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ handwashing_facilities                <dbl> 37.746, 37.746, 37.746, 37.74...
## $ hospital_beds_per_thousand            <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,...
## $ life_expectancy                       <dbl> 64.83, 64.83, 64.83, 64.83, 6...
## $ human_development_index               <dbl> 0.511, 0.511, 0.511, 0.511, 0...
## $ excess_mortality                      <dbl> NA, NA, NA, NA, NA, NA, NA, N...

Clean the data

# Keep only the columns used in the analysis below
owid_dataset <- owid_dataset[, c(
  "continent", "location", "date",
  "total_cases", "new_cases", "total_deaths", "new_deaths"
)]

# Remove the aggregate rows (world, continents, European Union) and rows with
# an empty location, so the remaining rows are per-country records.
# "Australia" is intentionally not listed here: it is a country, not an
# aggregate, and excluding it would silently drop its data from every
# per-country and per-day total computed later.
aggregate_locations <- c(
  "World", "Asia", "Europe", "Africa", "North America", "South America",
  "Oceania", "European Union", ""
)
owid_dataset <- owid_dataset %>%
  filter(!is.na(location), !location %in% aggregate_locations)

glimpse(owid_dataset)
## Rows: 100,113
## Columns: 7
## $ continent    <chr> "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia"...
## $ location     <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanis...
## $ date         <chr> "2020-02-24", "2020-02-25", "2020-02-26", "2020-02-27"...
## $ total_cases  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 4, 4, 4, 5, 7, 8, 11, 12...
## $ new_cases    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 2, 1, 3, 1, ...
## $ total_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ new_deaths   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...

Total confirmed cases by date

# Total new confirmed cases per day, summed over all locations.
# The rows are collapsed to one value per date before plotting; piping the
# un-summarised, date-grouped rows into plot_ly() would draw every location's
# value for a day instead of the daily total.
confirmed_cases_by_date <- owid_dataset %>%
  select(date, new_cases) %>%
  na.omit() %>%
  group_by(date) %>%
  # abs() keeps the existing handling of negative entries (magnitude is used)
  summarise(new_cases = sum(abs(new_cases))) %>%
  ungroup() %>%
  # as.Date() yields a real date axis whether `date` is text or Date here
  mutate(date = as.Date(date)) %>%
  plot_ly(
    x = ~date, y = ~new_cases, name = "cases",
    type = "scatter", mode = "lines",
    line = list(color = "rgb(100, 80, 90)")
  )
confirmed_cases_by_date

Total confirmed deaths by date

# Total new confirmed deaths per day, summed over all locations.
# The rows are collapsed to one value per date before plotting; piping the
# un-summarised, date-grouped rows into plot_ly() would draw every location's
# value for a day instead of the daily total.
confirmed_deaths_by_date <- owid_dataset %>%
  select(date, new_deaths) %>%
  na.omit() %>%
  group_by(date) %>%
  # abs() keeps the existing handling of negative entries (magnitude is used)
  summarise(new_deaths = sum(abs(new_deaths))) %>%
  ungroup() %>%
  # as.Date() yields a real date axis whether `date` is text or Date here
  mutate(date = as.Date(date)) %>%
  plot_ly(
    x = ~date, y = ~new_deaths, name = "deaths",
    type = "scatter", mode = "lines",
    line = list(color = "rgb(200, 150, 90)")
  )
confirmed_deaths_by_date

Set date into date format

# Replace missing values with 0, but only in the numeric columns, so that a
# numeric 0 is never written into a text or date column.
numeric_cols <- vapply(owid_dataset, is.numeric, logical(1))
owid_dataset[numeric_cols][is.na(owid_dataset[numeric_cols])] <- 0

# Parse the year-month-day strings into Date values
owid_dataset$date <- ymd(owid_dataset$date)

Add Trend line for new cases by date plot

# New cases per day summed over all locations (negative entries are counted
# by magnitude via abs()), plotted with a dashed linear trend line.
new_cases_by_date <- owid_dataset %>%
  group_by(date) %>%
  summarise(new_cases = sum(abs(new_cases), na.rm = TRUE)) %>%
  ungroup()

p1 <- ggplot(new_cases_by_date, aes(x = date, y = new_cases)) +
  geom_line(col = "#6633CC") +
  geom_smooth(method = "lm", se = FALSE, col = "black", linetype = "dashed") +
  # The series is the per-day sum of new cases, not a running total
  ylab("Daily new confirmed cases") +
  # Show plain numbers on the axis instead of scientific notation
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
ggplotly(p1)
## `geom_smooth()` using formula 'y ~ x'

From the above graph we can see that the number of confirmed cases per day increases roughly linearly until 21 October 2020. After that, there is a sudden change in the new-case count from October 2020 to January 2021, followed by a decrease in February and March 2021 and a sudden increase in new cases per day in April and May 2021.

Trend line for new deaths by date plot

# New deaths per day summed over all locations (negative entries are counted
# by magnitude via abs()), plotted with a dashed linear trend line.
new_deaths_by_date <- owid_dataset %>%
  group_by(date) %>%
  summarise(new_deaths = sum(abs(new_deaths), na.rm = TRUE)) %>%
  ungroup()

p2 <- ggplot(new_deaths_by_date, aes(x = date, y = new_deaths)) +
  geom_line(col = "#006666") +
  geom_smooth(method = "lm", se = FALSE, col = "black", linetype = "dashed") +
  # The series is the per-day sum of new deaths, not a running total
  ylab("Daily new confirmed deaths")
ggplotly(p2)
## `geom_smooth()` using formula 'y ~ x'

This graph is similar to the confirmed-cases graph, because when the count of new cases increases, the number of deaths also increases.

Top 5 countries by confirmed cases

# Five locations with the largest cumulative case count.
# The per-location maximum of `total_cases` is used as that location's total;
# top_n() is given no weighting column, so it ranks by the last column, `cases`.
top_5_max_cases_filed <- owid_dataset %>%
  select(location, total_cases) %>%
  na.omit() %>%
  group_by(location) %>%
  summarise(cases = max(total_cases)) %>%
  top_n(n = 5) %>%
  arrange(desc(cases))
## Selecting by cases
# Lollipop chart of the top-5 case counts: a stem from 0 up to the value,
# a large dot at the top, and the count printed over the dot.
ggplot(top_5_max_cases_filed, aes(x = location, y = cases)) +
  # x is inherited from the plot-level mapping; only the stem ends are set here
  geom_segment(aes(xend = location, y = 0, yend = cases)) +
  geom_point(colour = 2, size = 25) +
  # Show plain numbers on the axis instead of scientific notation
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  geom_text(aes(label = cases))

Top 5 countries by confirmed cases: United States, India, Brazil, France, Russia

Graph of top 5 maximum cases filed countries by date

# Names of the five locations with the most cases
top_5_max_cases_filed_location <- top_5_max_cases_filed[["location"]]

# Daily new cases (as absolute values) for those locations, one coloured line
# per location, rendered as an interactive plotly chart.
top_cases_graph <- owid_dataset %>%
  filter(location %in% top_5_max_cases_filed_location) %>%
  select(date, new_cases, location) %>%
  mutate(cases = abs(new_cases)) %>%
  group_by(date, location) %>%
  ggplot(aes(x = date, y = cases, color = location)) +
  geom_line()
ggplotly(top_cases_graph)

Top 5 countries by confirmed deaths

# Five locations with the largest cumulative death count.
# The per-location maximum of `total_deaths` is used as that location's total;
# top_n() is given no weighting column, so it ranks by the last column, `deaths`.
top_5_max_deaths_filed <- owid_dataset %>%
  select(location, total_deaths) %>%
  na.omit() %>%
  group_by(location) %>%
  summarise(deaths = max(total_deaths)) %>%
  top_n(n = 5) %>%
  arrange(desc(deaths))
## Selecting by deaths
# Lollipop chart of the top-5 death counts: a stem from 0 up to the value,
# a large dot at the top, and the count printed over the dot.
ggplot(top_5_max_deaths_filed, aes(x = location, y = deaths)) +
  # x is inherited from the plot-level mapping; only the stem ends are set here
  geom_segment(aes(xend = location, y = 0, yend = deaths)) +
  geom_point(colour = 7, size = 25) +
  # Show plain numbers on the axis instead of scientific notation
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  geom_text(aes(label = deaths))

Top 5 countries by confirmed deaths: United States, Brazil, India, Mexico, Peru

Graph of top 5 maximum deaths filed countries by date

# Names of the five locations with the most deaths
top_5_max_deaths_filed_location <- top_5_max_deaths_filed[["location"]]

# Daily new deaths (as absolute values) for those locations, one coloured line
# per location, rendered as an interactive plotly chart.
top_deaths_graph <- owid_dataset %>%
  filter(location %in% top_5_max_deaths_filed_location) %>%
  select(date, new_deaths, location) %>%
  mutate(deaths = abs(new_deaths)) %>%
  ggplot(aes(x = date, y = deaths, color = location)) +
  geom_line()
ggplotly(top_deaths_graph)