Coronavirus disease 2019 (COVID-19) is a contagious disease caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The first known case was identified in Wuhan, China, in December 2019. The disease has since spread worldwide, leading to an ongoing pandemic.
Symptoms of COVID-19 are variable, but often include fever, cough, headache, fatigue, breathing difficulties, and loss of smell and taste. Symptoms may begin one to fourteen days after exposure to the virus. At least a third of people who are infected do not develop noticeable symptoms.
Let's study and visualize the COVID-19 dataset.
This data is sourced from governments and from national and subnational agencies across the world — a full list of data sources for each country is published on the Johns Hopkins GitHub site.
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.4
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Read the COVID-19 CSV export. The path is relative, so the file must sit in
# the working directory; the first row supplies the column names.
owid_dataset <- read.csv("owid-covid-data.csv", header = TRUE)
# Print a compact overview of every column and its type.
glimpse(owid_dataset)
## Rows: 105,010
## Columns: 60
## $ iso_code <chr> "AFG", "AFG", "AFG", "AFG", "...
## $ continent <chr> "Asia", "Asia", "Asia", "Asia...
## $ location <chr> "Afghanistan", "Afghanistan",...
## $ date <chr> "2020-02-24", "2020-02-25", "...
## $ total_cases <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 4,...
## $ new_cases <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 2,...
## $ new_cases_smoothed <dbl> NA, NA, NA, NA, NA, 0.143, 0....
## $ total_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_deaths_smoothed <dbl> NA, NA, NA, NA, NA, 0, 0, 0, ...
## $ total_cases_per_million <dbl> 0.026, 0.026, 0.026, 0.026, 0...
## $ new_cases_per_million <dbl> 0.026, 0.000, 0.000, 0.000, 0...
## $ new_cases_smoothed_per_million <dbl> NA, NA, NA, NA, NA, 0.004, 0....
## $ total_deaths_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_deaths_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_deaths_smoothed_per_million <dbl> NA, NA, NA, NA, NA, 0, 0, 0, ...
## $ reproduction_rate <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ icu_patients <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ icu_patients_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ hosp_patients <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ hosp_patients_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ weekly_icu_admissions <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ weekly_icu_admissions_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ weekly_hosp_admissions <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ weekly_hosp_admissions_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_tests <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ total_tests <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ total_tests_per_thousand <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_tests_per_thousand <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_tests_smoothed <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_tests_smoothed_per_thousand <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ positive_rate <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ tests_per_case <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ tests_units <chr> "", "", "", "", "", "", "", "...
## $ total_vaccinations <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ people_vaccinated <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ people_fully_vaccinated <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_vaccinations <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_vaccinations_smoothed <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ total_vaccinations_per_hundred <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ people_vaccinated_per_hundred <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ people_fully_vaccinated_per_hundred <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ new_vaccinations_smoothed_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ stringency_index <dbl> 8.33, 8.33, 8.33, 8.33, 8.33,...
## $ population <dbl> 38928341, 38928341, 38928341,...
## $ population_density <dbl> 54.422, 54.422, 54.422, 54.42...
## $ median_age <dbl> 18.6, 18.6, 18.6, 18.6, 18.6,...
## $ aged_65_older <dbl> 2.581, 2.581, 2.581, 2.581, 2...
## $ aged_70_older <dbl> 1.337, 1.337, 1.337, 1.337, 1...
## $ gdp_per_capita <dbl> 1803.987, 1803.987, 1803.987,...
## $ extreme_poverty <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ cardiovasc_death_rate <dbl> 597.029, 597.029, 597.029, 59...
## $ diabetes_prevalence <dbl> 9.59, 9.59, 9.59, 9.59, 9.59,...
## $ female_smokers <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ male_smokers <dbl> NA, NA, NA, NA, NA, NA, NA, N...
## $ handwashing_facilities <dbl> 37.746, 37.746, 37.746, 37.74...
## $ hospital_beds_per_thousand <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,...
## $ life_expectancy <dbl> 64.83, 64.83, 64.83, 64.83, 6...
## $ human_development_index <dbl> 0.511, 0.511, 0.511, 0.511, 0...
## $ excess_mortality <dbl> NA, NA, NA, NA, NA, NA, NA, N...
# Keep only the columns used by the rest of the analysis.
owid_dataset <- owid_dataset[, c(
  "continent", "location", "date",
  "total_cases", "new_cases", "total_deaths", "new_deaths"
)]
# Drop the aggregate rows (world, continents, European Union) and blank
# locations so that only individual countries remain. "Australia" is a country
# (its aggregate is "Oceania"), so it is intentionally NOT excluded here;
# removing it would silently drop its data from every later total.
# NOTE: the row count in the recorded glimpse() output below predates this
# change and will grow by Australia's rows once the document is re-knit.
owid_dataset <- owid_dataset %>%
  filter(!location %in% c(
    "World", "Asia", "Europe", "Africa", "North America", "South America",
    "Oceania", "European Union", ""
  ))
glimpse(owid_dataset)
## Rows: 100,113
## Columns: 7
## $ continent <chr> "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia"...
## $ location <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanis...
## $ date <chr> "2020-02-24", "2020-02-25", "2020-02-26", "2020-02-27"...
## $ total_cases <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 4, 4, 4, 5, 7, 8, 11, 12...
## $ new_cases <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 2, 1, 3, 1, ...
## $ total_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ new_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# Interactive plotly line chart of the absolute new-case values against date.
# Rows with a missing value in any of the three selected columns are dropped
# first; abs() guards against negative values in new_cases.
confirmed_cases_by_date <- owid_dataset %>%
  select(date, new_cases, location) %>%
  na.omit() %>%
  mutate(new_cases = abs(new_cases)) %>%
  group_by(date) %>%
  plot_ly(
    x = ~date,
    y = ~new_cases,
    name = "cases",
    type = "scatter",
    mode = "lines",
    line = list(color = "rgb(100, 80, 90)")
  )
# Printing the object renders the widget.
confirmed_cases_by_date
# Interactive plotly line chart of the absolute new-death values against date.
# Rows with a missing value in any of the three selected columns are dropped
# first; abs() guards against negative values in new_deaths.
confirmed_deaths_by_date <- owid_dataset %>%
  select(date, new_deaths, location) %>%
  na.omit() %>%
  mutate(new_deaths = abs(new_deaths)) %>%
  group_by(date) %>%
  plot_ly(
    x = ~date,
    y = ~new_deaths,
    name = "deaths",
    type = "scatter",
    mode = "lines",
    line = list(color = "rgb(200, 150, 90)")
  )
# Printing the object renders the widget.
confirmed_deaths_by_date
# Replace every NA in the data frame with 0 so that later sums and maxima do
# not come out as NA, then parse the year-month-day date strings into Date
# values.
owid_dataset[is.na(owid_dataset)] <- 0
owid_dataset$date <- ymd(owid_dataset$date)
# Total the absolute new-case counts over all locations, giving one row per
# date with the columns `date` and `new_cases`.
new_cases_by_date <- owid_dataset %>%
  # Rows whose date failed to parse are left out of the totals.
  filter(!is.na(date)) %>%
  group_by(date) %>%
  summarise(new_cases = sum(abs(new_cases))) %>%
  # Keep a plain data.frame for any downstream code that expects one.
  as.data.frame()
# Line chart of the per-date new-case totals with a dashed linear trend line.
# The series is the daily sum of new_cases (not a running total), so the axis
# is labelled as daily new cases.
p1 <- new_cases_by_date %>%
  ggplot(aes(x = date, y = new_cases)) +
  geom_line(col = "#6633CC") +
  geom_smooth(method = "lm", se = FALSE, col = "black", linetype = "dashed") +
  ylab("Daily new confirmed cases") +
  # Show plain numbers instead of scientific notation on the y axis.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
# Convert the static ggplot into an interactive plotly widget.
ggplotly(p1)
## `geom_smooth()` using formula 'y ~ x'
From the above graph we can see that the number of confirmed cases per day increases roughly linearly until 2020-10-21. After that, there is a sudden change in the daily new-case count from October 2020 to January 2021, followed by a decrease in February and March 2021 and a sudden increase in new cases per day in April and May 2021.
# Total the absolute new-death counts over all locations, giving one row per
# date with the columns `date` and `new_deaths`.
new_deaths_by_date <- owid_dataset %>%
  # Rows whose date failed to parse are left out of the totals.
  filter(!is.na(date)) %>%
  group_by(date) %>%
  summarise(new_deaths = sum(abs(new_deaths))) %>%
  # Keep a plain data.frame for any downstream code that expects one.
  as.data.frame()
# Line chart of the per-date new-death totals with a dashed linear trend line.
# The series is the daily sum of new_deaths (not a running total), so the axis
# is labelled as daily new deaths.
p2 <- new_deaths_by_date %>%
  ggplot(aes(x = date, y = new_deaths)) +
  geom_line(col = "#006666") +
  geom_smooth(method = "lm", se = FALSE, col = "black", linetype = "dashed") +
  ylab("Daily new confirmed deaths")
# Convert the static ggplot into an interactive plotly widget.
ggplotly(p2)
## `geom_smooth()` using formula 'y ~ x'
Corresponding to the confirmed-cases graph, this graph is also similar, because if the count of new cases increases, the number of deaths will also increase.
# Five locations with the largest recorded total_cases value, highest first.
top_5_max_cases_filed <- owid_dataset %>%
  select(location, total_cases) %>%
  na.omit() %>%
  group_by(location) %>%
  # One row per location holding its largest total_cases value.
  summarise(cases = max(total_cases)) %>%
  # With no weighting column given, top_n() ranks by the last column (cases).
  top_n(5) %>%
  arrange(desc(cases))
## Selecting by cases
# Lollipop chart: a stem from 0 up to each location's case total, a large dot
# at the top, and the total printed over the dot.
ggplot(top_5_max_cases_filed, aes(x = location, y = cases)) +
  # x is inherited from the plot-level mapping; only the stem ends are set.
  geom_segment(aes(xend = location, y = 0, yend = cases)) +
  geom_point(colour = 2, size = 25) +
  geom_text(aes(label = cases)) +
  # Show plain numbers instead of scientific notation on the y axis.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
The top 5 countries by maximum recorded cases: United States, India, Brazil, France, Russia.
# Names of the five locations with the highest case totals.
top_5_max_cases_filed_location <- top_5_max_cases_filed$location
# Absolute daily new cases for those locations, one coloured line per
# location. No group_by() is needed: ggplot derives the line grouping from the
# colour aesthetic.
top_cases_graph <- owid_dataset %>%
  select(date, new_cases, location) %>%
  filter(location %in% top_5_max_cases_filed_location) %>%
  mutate(cases = abs(new_cases)) %>%
  ggplot(aes(x = date, y = cases, color = location)) +
  geom_line()
# Convert the static ggplot into an interactive plotly widget.
ggplotly(top_cases_graph)
# Five locations with the largest recorded total_deaths value, highest first.
top_5_max_deaths_filed <- owid_dataset %>%
  select(location, total_deaths) %>%
  na.omit() %>%
  group_by(location) %>%
  # One row per location holding its largest total_deaths value.
  summarise(deaths = max(total_deaths)) %>%
  # With no weighting column given, top_n() ranks by the last column (deaths).
  top_n(5) %>%
  arrange(desc(deaths))
## Selecting by deaths
# Lollipop chart: a stem from 0 up to each location's death total, a large dot
# at the top, and the total printed over the dot.
ggplot(top_5_max_deaths_filed, aes(x = location, y = deaths)) +
  # x is inherited from the plot-level mapping; only the stem ends are set.
  geom_segment(aes(xend = location, y = 0, yend = deaths)) +
  geom_point(colour = 7, size = 25) +
  geom_text(aes(label = deaths)) +
  # Show plain numbers instead of scientific notation on the y axis.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
The top 5 countries by maximum recorded deaths: United States, Brazil, India, Mexico, Peru.
# Names of the five locations with the highest death totals.
top_5_max_deaths_filed_location <- top_5_max_deaths_filed$location
# Absolute daily new deaths for those locations, one coloured line per
# location.
top_deaths_graph <- owid_dataset %>%
  filter(location %in% top_5_max_deaths_filed_location) %>%
  select(date, new_deaths, location) %>%
  mutate(deaths = abs(new_deaths)) %>%
  ggplot(aes(x = date, y = deaths, color = location)) +
  geom_line()
# Convert the static ggplot into an interactive plotly widget.
ggplotly(top_deaths_graph)