Covid_19

Introduction

The World Health Organization (WHO) declared the novel human coronavirus disease (COVID-19) outbreak, which began in Wuhan, China, on December 8, 2019, a Public Health Emergency of International Concern (PHEIC) on January 30, 2020 (WHO, 2020). The data is the product of dozens of journalists working across several time zones to monitor news conferences, analyze data releases and seek clarification from public officials on how they categorize cases. The dataset contains confirmed cases, deaths, date, state and FIPS columns.

Installing and loading of necessary R packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Loading the dataset

# Read the state-level COVID-19 file from the working directory into a tibble;
# readr guesses the column types and reports them in the message below.
states_data <- read_csv(file = "us-states.csv")
## Rows: 2553 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): state, fips
## dbl  (2): cases, deaths
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

An overview of the dataset

# Compact structure listing: dimensions, column types and leading values.
str(object = states_data)
## spc_tbl_ [2,553 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date  : Date[1:2553], format: "2020-01-21" "2020-01-22" ...
##  $ state : chr [1:2553] "Washington" "Washington" "Washington" "Illinois" ...
##  $ fips  : chr [1:2553] "53" "53" "53" "17" ...
##  $ cases : num [1:2553] 1 1 1 1 1 1 1 1 1 2 ...
##  $ deaths: num [1:2553] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date = col_date(format = ""),
##   ..   state = col_character(),
##   ..   fips = col_character(),
##   ..   cases = col_double(),
##   ..   deaths = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Show the first six rows (head()'s default, spelled out explicitly).
head(states_data, n = 6L)
## # A tibble: 6 × 5
##   date       state      fips  cases deaths
##   <date>     <chr>      <chr> <dbl>  <dbl>
## 1 2020-01-21 Washington 53        1      0
## 2 2020-01-22 Washington 53        1      0
## 3 2020-01-23 Washington 53        1      0
## 4 2020-01-24 Illinois   17        1      0
## 5 2020-01-24 Washington 53        1      0
## 6 2020-01-25 California 06        1      0

Adding extra columns for month and days of the week

# Derive two helper columns from the report date:
#   month           - two-digit month number stored as text ("%m")
#   day_of_the_week - full weekday name stored as text ("%A"); the spelling
#                     follows the locale of the R session
states_data[["month"]] <- format(
  as.Date(states_data[["date"]]),
  format = "%m"
)
states_data[["day_of_the_week"]] <- format(
  as.Date(states_data[["date"]]),
  format = "%A"
)

To confirm the new variables created

# List the column names to check that the two derived columns are present.
names(states_data)
## [1] "date"            "state"           "fips"            "cases"          
## [5] "deaths"          "month"           "day_of_the_week"

Keep only the 50 US states and drop the rest

# Keep only the rows whose state is one of the 50 US states. `state.name` is
# R's built-in vector of the 50 state names (datasets package, attached by
# default), so every non-state entry - District of Columbia, Guam, Puerto Rico
# and any other territory - is dropped without being listed by hand.
states_data <- states_data[states_data$state %in% state.name, ]

Removing stray newline characters from the deaths column

# Remove any newline characters from the deaths column, then convert the
# cleaned text to numeric. The clean-up must run on the raw values: once a
# value has been converted to a number there is no text left for gsub() to
# fix, and gsub() itself always returns character.
states_data$deaths <- as.numeric(
  gsub("\n", "", states_data$deaths, fixed = TRUE)
)

Ensuring the deaths column is a plain numeric vector

# Flatten the deaths column (a no-op when it is already an atomic vector) and
# coerce the result to a bare numeric vector in a single assignment.
states_data$deaths <- as.numeric(unlist(states_data$deaths))

Drop the variables (columns) that are not needed for the analysis

# Build the analysis table: every column of states_data except the FIPS code.
American_states <- select(states_data, -fips)

An overview of the data set

# List the remaining column names to confirm that `fips` has been removed.
names(American_states)
## [1] "date"            "state"           "cases"           "deaths"         
## [5] "month"           "day_of_the_week"
# Sanity check: the deaths column must be numeric before it is summed.
is.numeric(American_states[["deaths"]])
## [1] TRUE

Descriptive analysis on the dataset

The total number of cases per US state

# Per-state sum of the `cases` column, largest first; print the top 20 rows.
# NOTE(review): if `cases` is a cumulative running total per state (as in the
# NYT us-states.csv feed - TODO confirm), summing over dates overstates the
# true total and max(cases) would be the figure to report.
American_states %>%
  group_by(state) %>%
  summarise(num_of_cases = sum(cases)) %>%
  arrange(desc(num_of_cases)) %>%
  print(n = 20)
## # A tibble: 50 × 2
##    state          num_of_cases
##    <chr>                 <dbl>
##  1 New York            3163712
##  2 New Jersey           958684
##  3 California           393001
##  4 Michigan             389710
##  5 Massachusetts        371399
##  6 Pennsylvania         329230
##  7 Florida              317387
##  8 Illinois             316853
##  9 Louisiana            316753
## 10 Texas                207441
## 11 Washington           201808
## 12 Georgia              199359
## 13 Connecticut          180006
## 14 Colorado             124181
## 15 Indiana              117919
## 16 Maryland             116436
## 17 Ohio                 106847
## 18 Tennessee             88685
## 19 Virginia              78811
## 20 North Carolina        71266
## # … with 30 more rows

The total number of deaths per US state

# Per-state sum of the `deaths` column, largest first; print all 50 rows.
# NOTE(review): if `deaths` is a cumulative running total per state (as in the
# NYT us-states.csv feed - TODO confirm), summing over dates overstates the
# true total and max(deaths) would be the figure to report.
American_states %>%
  group_by(state) %>%
  summarise(num_of_death = sum(deaths)) %>%
  arrange(desc(num_of_death)) %>%
  print(n = 50)
## # A tibble: 50 × 2
##    state          num_of_death
##    <chr>                 <dbl>
##  1 New York             128801
##  2 New Jersey            32137
##  3 Michigan              20720
##  4 Louisiana             13196
##  5 California            10715
##  6 Massachusetts         10407
##  7 Illinois              10067
##  8 Washington             9864
##  9 Connecticut            7808
## 10 Pennsylvania           7404
## 11 Georgia                7208
## 12 Florida                6950
## 13 Indiana                4638
## 14 Texas                  4298
## 15 Colorado               4197
## 16 Ohio                   3889
## 17 Maryland               3121
## 18 Wisconsin              2117
## 19 Virginia               2030
## 20 Missouri               1721
## 21 Arizona                1700
## 22 Nevada                 1592
## 23 Tennessee              1548
## 24 Oklahoma               1442
## 25 Kentucky               1428
## 26 Alabama                1385
## 27 Mississippi            1367
## 28 South Carolina         1279
## 29 North Carolina         1255
## 30 Minnesota               942
## 31 Rhode Island            861
## 32 Oregon                  839
## 33 Kansas                  816
## 34 Iowa                    566
## 35 Delaware                512
## 36 Vermont                 510
## 37 Idaho                   429
## 38 Arkansas                419
## 39 New Mexico              383
## 40 New Hampshire           336
## 41 Maine                   282
## 42 Utah                    263
## 43 Nebraska                261
## 44 Montana                 117
## 45 West Virginia           110
## 46 North Dakota            107
## 47 South Dakota            106
## 48 Hawaii                  105
## 49 Alaska                   89
## 50 Wyoming                   8

Total number of cases by the days of the week in United States

# Sum of the `cases` column for each weekday name, largest first; ties are
# broken alphabetically by weekday.
# NOTE(review): if `cases` is cumulative (TODO confirm), these sums mostly
# reflect how often each weekday occurs late in the series, not new cases.
American_states %>%
  group_by(day_of_the_week) %>%
  summarise(num_of_cases = sum(cases)) %>%
  arrange(desc(num_of_cases), day_of_the_week)
## # A tibble: 7 × 2
##   day_of_the_week num_of_cases
##   <chr>                  <dbl>
## 1 Friday               1587355
## 2 Thursday             1467887
## 3 Wednesday            1351548
## 4 Tuesday              1247076
## 5 Monday               1153151
## 6 Sunday               1065562
## 7 Saturday              985928

Total number of deaths by day of the week

# Sum of the `deaths` column for each weekday name, largest first.
# NOTE(review): if `deaths` is cumulative (TODO confirm), these sums mostly
# reflect how often each weekday occurs late in the series, not new deaths.
American_states %>%
  group_by(day_of_the_week) %>%
  summarise(num_of_death = sum(deaths)) %>%
  arrange(desc(num_of_death))
## # A tibble: 7 × 2
##   day_of_the_week num_of_death
##   <chr>                  <dbl>
## 1 Friday                 60444
## 2 Thursday               54891
## 3 Wednesday              49200
## 4 Tuesday                43638
## 5 Monday                 38129
## 6 Sunday                 34563
## 7 Saturday               31480

Visualisation of analysis and recommendation

Visualise the total number of cases per US state

# Bar chart of the per-state sum of `cases`, bars ordered from largest to
# smallest. ggplot2 lays out a character x variable alphabetically and ignores
# row order, so the sorted state names are frozen into factor levels with
# fct_inorder(); without that step the arrange() has no effect on the plot.
# NOTE(review): if `cases` is cumulative (TODO confirm), sum() overstates the
# totals and max(cases) would be the value to plot.
American_states %>%
  group_by(state) %>%
  summarise(num_of_cases = sum(cases)) %>%
  arrange(desc(num_of_cases)) %>%
  mutate(state = fct_inorder(state)) %>%
  ggplot(aes(x = state, y = num_of_cases)) +
  geom_col() +
  labs(title = "Number of covid-19 cases by state") +
  # Print full numbers on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  # Rotate the state names so all 50 labels stay readable.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Visualise the total number of deaths per US state

# Bar chart of the per-state sum of `deaths`, bars ordered from largest to
# smallest. ggplot2 lays out a character x variable alphabetically and ignores
# row order, so the sorted state names are frozen into factor levels with
# fct_inorder(); without that step the arrange() has no effect on the plot.
# NOTE(review): if `deaths` is cumulative (TODO confirm), sum() overstates the
# totals and max(deaths) would be the value to plot.
American_states %>%
  group_by(state) %>%
  summarise(num_of_death = sum(deaths)) %>%
  arrange(desc(num_of_death)) %>%
  mutate(state = fct_inorder(state)) %>%
  ggplot(aes(x = state, y = num_of_death)) +
  geom_col() +
  labs(title = "Number of covid-19 deaths by state") +
  # Print full numbers on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  # Rotate the state names so all 50 labels stay readable.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Total number of cases by the days of the week in United States

# Bar chart of the sum of `cases` per weekday, bars ordered from the largest
# total to the smallest (ties broken alphabetically). ggplot2 lays out a
# character x variable alphabetically and ignores row order, so the sorted
# weekday names are frozen into factor levels with fct_inorder(); the same
# factor drives the fill legend, which therefore follows the bar order.
American_states %>%
  group_by(day_of_the_week) %>%
  summarise(num_of_cases = sum(cases)) %>%
  arrange(desc(num_of_cases), day_of_the_week) %>%
  mutate(day_of_the_week = fct_inorder(day_of_the_week)) %>%
  ggplot(aes(
    x = day_of_the_week,
    y = num_of_cases,
    fill = day_of_the_week
  )) +
  geom_col() +
  labs(title = "Number of covid-19 cases by week day") +
  # Print full numbers on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Total number of deaths by day of the week

# Bar chart of the sum of `deaths` per weekday, bars ordered from the largest
# total to the smallest. ggplot2 lays out a character x variable alphabetically
# and ignores row order, so the sorted weekday names are frozen into factor
# levels with fct_inorder(); the same factor drives the fill legend, which
# therefore follows the bar order.
American_states %>%
  group_by(day_of_the_week) %>%
  summarise(num_of_death = sum(deaths)) %>%
  arrange(desc(num_of_death)) %>%
  mutate(day_of_the_week = fct_inorder(day_of_the_week)) %>%
  ggplot(aes(
    x = day_of_the_week,
    y = num_of_death,
    fill = day_of_the_week
  )) +
  geom_col() +
  labs(title = "Number of covid-19 death by week day") +
  # Print full numbers on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))