Covid_19
The World Health Organization (WHO) declared the novel human coronavirus disease (COVID-19) outbreak, which began in Wuhan, China on December 8, 2019, a Public Health Emergency of International Concern (PHEIC) on January 30, 2020 (WHO, 2020). The data is the product of dozens of journalists working across several time zones to monitor news conferences, analyze data releases and seek clarification from public officials on how they categorize cases. The dataset contains confirmed cases, deaths, date, state and FIPS columns.
Confirmed Cases: Confirmed cases are patients who test positive for the coronavirus. We consider a case confirmed when it is reported by a federal, state, territorial or local government agency.
Dates: For each date, we show the cumulative number of confirmed cases and deaths as reported that day in that county or state. All cases and deaths are counted on the date they are first announced.
States: In some instances, we report data from multiple counties or other non-county geographies as a single county. For instance, we report a single value for New York City, comprising the cases for New York, Kings, Queens, Bronx and Richmond Counties. In these instances the FIPS code field will be empty. (We may assign FIPS codes to these geographies in the future.)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Loading the dataset
# Read the state-level CSV into a tibble; readr guesses the column types.
states_data <- readr::read_csv(file = "us-states.csv")
## Rows: 2553 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state, fips
## dbl (2): cases, deaths
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
An overview of the dataset
# Compact structure listing: one line per column with its type and first values.
utils::str(object = states_data)
## spc_tbl_ [2,553 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : Date[1:2553], format: "2020-01-21" "2020-01-22" ...
## $ state : chr [1:2553] "Washington" "Washington" "Washington" "Illinois" ...
## $ fips : chr [1:2553] "53" "53" "53" "17" ...
## $ cases : num [1:2553] 1 1 1 1 1 1 1 1 1 2 ...
## $ deaths: num [1:2553] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_date(format = ""),
## .. state = col_character(),
## .. fips = col_character(),
## .. cases = col_double(),
## .. deaths = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Preview the first six rows.
head(x = states_data, n = 6L)
## # A tibble: 6 × 5
## date state fips cases deaths
## <date> <chr> <chr> <dbl> <dbl>
## 1 2020-01-21 Washington 53 1 0
## 2 2020-01-22 Washington 53 1 0
## 3 2020-01-23 Washington 53 1 0
## 4 2020-01-24 Illinois 17 1 0
## 5 2020-01-24 Washington 53 1 0
## 6 2020-01-25 California 06 1 0
Adding extra columns for month and days of the week
# Derive two helper columns from the reporting date:
#   month           - two-digit month number as text ("%m")
#   day_of_the_week - full weekday name in the session locale ("%A")
states_data[["month"]] <- format(
  as.Date(states_data[["date"]]),
  format = "%m"
)
states_data[["day_of_the_week"]] <- format(
  as.Date(states_data[["date"]]),
  format = "%A"
)
Confirm that the new variables were created
# List the column names; the two new columns should appear at the end.
names(states_data)
## [1] "date" "state" "fips" "cases"
## [5] "deaths" "month" "day_of_the_week"
Keep only the 50 US states and drop the rest
# Keep only the 50 US states.
# `datasets::state.name` (shipped with base R) holds exactly the 50 state
# names, so the District of Columbia, the territories and any other non-state
# geography are dropped without maintaining a hand-written exclusion list.
states_data <- states_data[states_data$state %in% datasets::state.name, ]
Removing stray newline characters from the deaths column
# Strip newline characters from the raw values first, then convert to numeric.
# The order matters: gsub() always returns character, so coercing to numeric
# *before* gsub() leaves nothing to strip and turns the column into text.
states_data$deaths <- as.numeric(
  gsub("\n", "", states_data$deaths, fixed = TRUE)
)
Making sure the deaths column is a plain numeric vector
# Flatten the column to an atomic vector and coerce it to numeric (double).
states_data$deaths <- as.numeric(unlist(states_data$deaths))
Drop the variables (columns) that are not needed for the analysis
# Working copy for the analysis: every column except the FIPS code.
American_states <- select(states_data, -fips)
An overview of the data set
# Column names of the analysis table (fips should be gone).
names(American_states)
## [1] "date" "state" "cases" "deaths"
## [5] "month" "day_of_the_week"
# Sanity check: the deaths column must be numeric before it is summed.
is.numeric(American_states[["deaths"]])
## [1] TRUE
The total number of cases per US state
# Sum the `cases` column per state and show the 20 largest values.
# NOTE(review): the dataset notes describe `cases` as a cumulative count per
# date, so sum() adds up running totals rather than giving a final count;
# confirm whether max(cases) per state was intended.
American_states %>%
  group_by(state) %>%
  summarise(num_of_cases = sum(cases)) %>%
  arrange(desc(num_of_cases)) %>%
  print(n = 20)
## # A tibble: 50 × 2
## state num_of_cases
## <chr> <dbl>
## 1 New York 3163712
## 2 New Jersey 958684
## 3 California 393001
## 4 Michigan 389710
## 5 Massachusetts 371399
## 6 Pennsylvania 329230
## 7 Florida 317387
## 8 Illinois 316853
## 9 Louisiana 316753
## 10 Texas 207441
## 11 Washington 201808
## 12 Georgia 199359
## 13 Connecticut 180006
## 14 Colorado 124181
## 15 Indiana 117919
## 16 Maryland 116436
## 17 Ohio 106847
## 18 Tennessee 88685
## 19 Virginia 78811
## 20 North Carolina 71266
## # … with 30 more rows
The total number of deaths per US state
# Sum the `deaths` column per state and print all 50 rows, largest first.
# NOTE(review): the dataset notes describe `deaths` as a cumulative count per
# date, so sum() adds up running totals rather than giving a final count;
# confirm whether max(deaths) per state was intended.
American_states %>%
  group_by(state) %>%
  summarise(num_of_death = sum(deaths)) %>%
  arrange(desc(num_of_death)) %>%
  print(n = 50)
## # A tibble: 50 × 2
## state num_of_death
## <chr> <dbl>
## 1 New York 128801
## 2 New Jersey 32137
## 3 Michigan 20720
## 4 Louisiana 13196
## 5 California 10715
## 6 Massachusetts 10407
## 7 Illinois 10067
## 8 Washington 9864
## 9 Connecticut 7808
## 10 Pennsylvania 7404
## 11 Georgia 7208
## 12 Florida 6950
## 13 Indiana 4638
## 14 Texas 4298
## 15 Colorado 4197
## 16 Ohio 3889
## 17 Maryland 3121
## 18 Wisconsin 2117
## 19 Virginia 2030
## 20 Missouri 1721
## 21 Arizona 1700
## 22 Nevada 1592
## 23 Tennessee 1548
## 24 Oklahoma 1442
## 25 Kentucky 1428
## 26 Alabama 1385
## 27 Mississippi 1367
## 28 South Carolina 1279
## 29 North Carolina 1255
## 30 Minnesota 942
## 31 Rhode Island 861
## 32 Oregon 839
## 33 Kansas 816
## 34 Iowa 566
## 35 Delaware 512
## 36 Vermont 510
## 37 Idaho 429
## 38 Arkansas 419
## 39 New Mexico 383
## 40 New Hampshire 336
## 41 Maine 282
## 42 Utah 263
## 43 Nebraska 261
## 44 Montana 117
## 45 West Virginia 110
## 46 North Dakota 107
## 47 South Dakota 106
## 48 Hawaii 105
## 49 Alaska 89
## 50 Wyoming 8
Total number of cases by day of the week in the United States
# Sum the `cases` column per weekday, largest first; ties are broken by
# weekday name.
# NOTE(review): the dataset notes describe `cases` as a cumulative count per
# date, so these sums add up running totals; confirm whether daily new cases
# were intended.
American_states %>%
  group_by(day_of_the_week) %>%
  summarise(num_of_cases = sum(cases)) %>%
  arrange(desc(num_of_cases), day_of_the_week)
## # A tibble: 7 × 2
## day_of_the_week num_of_cases
## <chr> <dbl>
## 1 Friday 1587355
## 2 Thursday 1467887
## 3 Wednesday 1351548
## 4 Tuesday 1247076
## 5 Monday 1153151
## 6 Sunday 1065562
## 7 Saturday 985928
Total number of deaths by day of the week in the United States
# Sum the `deaths` column per weekday, largest first.
# NOTE(review): the dataset notes describe `deaths` as a cumulative count per
# date, so these sums add up running totals; confirm whether daily new deaths
# were intended.
American_states %>%
  group_by(day_of_the_week) %>%
  summarise(num_of_death = sum(deaths)) %>%
  arrange(desc(num_of_death))
## # A tibble: 7 × 2
## day_of_the_week num_of_death
## <chr> <dbl>
## 1 Friday 60444
## 2 Thursday 54891
## 3 Wednesday 49200
## 4 Tuesday 43638
## 5 Monday 38129
## 6 Sunday 34563
## 7 Saturday 31480
Visualise the total number of cases per US state
# Bar chart of the summed `cases` column per state, bars ordered from the
# largest to the smallest value.
# arrange() cannot order the bars: ggplot2 draws a character x variable in
# alphabetical order. Converting `state` to a factor whose levels follow
# num_of_cases (descending) is what actually sets the bar order.
# NOTE(review): the dataset notes describe `cases` as a cumulative count per
# date, so sum() adds up running totals; confirm whether max(cases) per state
# was intended.
American_states %>%
  group_by(state) %>%
  summarise(num_of_cases = sum(cases)) %>%
  mutate(state = fct_reorder(state, num_of_cases, .desc = TRUE)) %>%
  ggplot(aes(x = state, y = num_of_cases)) +
  geom_col(position = "dodge") +
  labs(title = "Number of covid-19 cases by state") +
  # Print plain integers on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  # Rotate the state names so all 50 labels stay readable.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Visualise the total number of deaths per US state
# Bar chart of the summed `deaths` column per state, bars ordered from the
# largest to the smallest value.
# arrange() cannot order the bars: ggplot2 draws a character x variable in
# alphabetical order. Converting `state` to a factor whose levels follow
# num_of_death (descending) is what actually sets the bar order.
# NOTE(review): the dataset notes describe `deaths` as a cumulative count per
# date, so sum() adds up running totals; confirm whether max(deaths) per state
# was intended.
American_states %>%
  group_by(state) %>%
  summarise(num_of_death = sum(deaths)) %>%
  mutate(state = fct_reorder(state, num_of_death, .desc = TRUE)) %>%
  ggplot(aes(x = state, y = num_of_death)) +
  geom_col(position = "dodge") +
  labs(title = "Number of covid-19 deaths by state") +
  # Print plain integers on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  # Rotate the state names so all 50 labels stay readable.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Visualise the total number of cases by day of the week in the United States
# Bar chart of the summed `cases` column per weekday, bars ordered from the
# smallest to the largest value (the ascending order the original arrange()
# call asked for).
# arrange() cannot order the bars: ggplot2 draws a character x variable in
# alphabetical order. Converting `day_of_the_week` to a factor whose levels
# follow num_of_cases is what actually sets the bar (and legend) order.
# NOTE(review): the dataset notes describe `cases` as a cumulative count per
# date, so these sums add up running totals; confirm whether daily new cases
# were intended.
American_states %>%
  group_by(day_of_the_week) %>%
  summarise(num_of_cases = sum(cases)) %>%
  mutate(day_of_the_week = fct_reorder(day_of_the_week, num_of_cases)) %>%
  ggplot(
    aes(x = day_of_the_week, y = num_of_cases, fill = day_of_the_week)
  ) +
  geom_col(position = "dodge") +
  labs(title = "Number of covid-19 cases by week day") +
  # Print plain integers on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Visualise the total number of deaths by day of the week
# Bar chart of the summed `deaths` column per weekday, bars ordered from the
# largest to the smallest value.
# arrange() cannot order the bars: ggplot2 draws a character x variable in
# alphabetical order. Converting `day_of_the_week` to a factor whose levels
# follow num_of_death (descending) is what actually sets the bar (and legend)
# order.
# NOTE(review): the dataset notes describe `deaths` as a cumulative count per
# date, so these sums add up running totals; confirm whether daily new deaths
# were intended.
American_states %>%
  group_by(day_of_the_week) %>%
  summarise(num_of_death = sum(deaths)) %>%
  mutate(
    day_of_the_week = fct_reorder(day_of_the_week, num_of_death, .desc = TRUE)
  ) %>%
  ggplot(
    aes(x = day_of_the_week, y = num_of_death, fill = day_of_the_week)
  ) +
  geom_col(position = "dodge") +
  labs(title = "Number of covid-19 death by week day") +
  # Print plain integers on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))