I am using dataset from Moiya Josephs Moiya Josephs that has all the deaths and cases in the United States from COVID
Possible Analysis:
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
df<-read.csv("https://raw.githubusercontent.com/deepasharma06/Data-607/main/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv", stringsAsFactors = TRUE)
head(df)
## ï..submission_date state tot_cases conf_cases prob_cases new_case pnew_case
## 1 12/01/2021 ND 163,565 135,705 27,860 589 220
## 2 08/17/2020 MD 100,715 503 0
## 3 07/20/2021 MD 464,491 155 0
## 4 03/18/2020 ME 44 44 0 12 0
## 5 02/06/2020 NE 0 0
## 6 02/02/2021 IL 1,130,917 1,130,917 0 2,304 0
## tot_death conf_death prob_death new_death pnew_death created_at
## 1 1,907 9 0 12/02/2021 02:35:20 PM
## 2 3,765 3,616 149 3 0 08/19/2020 12:00:00 AM
## 3 9,822 9,604 218 3 1 07/22/2021 12:00:00 AM
## 4 0 0 0 0 0 03/20/2020 12:00:00 AM
## 5 0 0 03/26/2020 04:22:39 PM
## 6 21,336 19,306 2,030 63 16 02/03/2021 02:55:58 PM
## consent_cases consent_deaths
## 1 Agree Not agree
## 2 N/A Agree
## 3 N/A Agree
## 4 Agree Agree
## 5 Agree Agree
## 6 Agree Agree
library(tidyr)
df <- tidyr::separate(df, ï..submission_date, c('month', 'day', 'year'), sep = "/",remove = FALSE)
head(df)
## ï..submission_date month day year state tot_cases conf_cases prob_cases
## 1 12/01/2021 12 01 2021 ND 163,565 135,705 27,860
## 2 08/17/2020 08 17 2020 MD 100,715
## 3 07/20/2021 07 20 2021 MD 464,491
## 4 03/18/2020 03 18 2020 ME 44 44 0
## 5 02/06/2020 02 06 2020 NE 0
## 6 02/02/2021 02 02 2021 IL 1,130,917 1,130,917 0
## new_case pnew_case tot_death conf_death prob_death new_death pnew_death
## 1 589 220 1,907 9 0
## 2 503 0 3,765 3,616 149 3 0
## 3 155 0 9,822 9,604 218 3 1
## 4 12 0 0 0 0 0 0
## 5 0 0 0
## 6 2,304 0 21,336 19,306 2,030 63 16
## created_at consent_cases consent_deaths
## 1 12/02/2021 02:35:20 PM Agree Not agree
## 2 08/19/2020 12:00:00 AM N/A Agree
## 3 07/22/2021 12:00:00 AM N/A Agree
## 4 03/20/2020 12:00:00 AM Agree Agree
## 5 03/26/2020 04:22:39 PM Agree Agree
## 6 02/03/2021 02:55:58 PM Agree Agree
df1 <- df[, c("year", "state", "new_case", "new_death")]
head(df1)
## year state new_case new_death
## 1 2021 ND 589 9
## 2 2020 MD 503 3
## 3 2021 MD 155 3
## 4 2020 ME 12 0
## 5 2020 NE 0 0
## 6 2021 IL 2,304 63
df1$new_case <- as.numeric(df1$new_case)
df1$new_death <- as.numeric(df1$new_death)
sapply(df1, class)
## year state new_case new_death
## "character" "factor" "numeric" "numeric"
df2 <- df1 %>%
group_by(year, state) %>%
summarise_all(sum)
df2
## # A tibble: 180 x 4
## # Groups: year [3]
## year state new_case new_death
## <chr> <fct> <dbl> <dbl>
## 1 2020 AK 1159699 29616
## 2 2020 AL 955789 67412
## 3 2020 AR 1253350 61730
## 4 2020 AS 11187 14835
## 5 2020 AZ 1123729 72308
## 6 2020 CA 1170749 77734
## 7 2020 CO 1196825 68069
## 8 2020 CT 871765 66008
## 9 2020 DC 1244375 49760
## 10 2020 DE 1232362 45375
## # ... with 170 more rows
# what are the states in my dataframe?
dfstate <- unique(df2$state)
# What are the valid states in the US? I am using the state.abb data available in R
state.abb
## [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" "HI" "ID" "IL" "IN" "IA"
## [16] "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV" "NH" "NJ"
## [31] "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "UT" "VT"
## [46] "VA" "WA" "WV" "WI" "WY"
usstate <- state.abb
# What are the nonstates in my dataframe?
nonstates <- setdiff(dfstate, usstate)
nonstates
## [1] "AS" "DC" "FSM" "GU" "MP" "NYC" "PR" "PW" "RMI" "VI"
sapply(df2, class)
## year state new_case new_death
## "character" "factor" "numeric" "numeric"
df2$state <- as.character(df2$state)
sapply(df2, class)
## year state new_case new_death
## "character" "character" "numeric" "numeric"
df2$state[df2$state == 'AS'] = NA
df2$state[df2$state == 'DC'] = NA
df2$state[df2$state == 'FSM'] = NA
df2$state[df2$state == 'GU'] = NA
df2$state[df2$state == 'MP'] = NA
df2$state[df2$state == 'NYC'] = NA
df2$state[df2$state == 'PR'] = NA
df2$state[df2$state == 'PW'] = NA
df2$state[df2$state == 'RMI'] = NA
df2$state[df2$state == 'VI'] = NA
df3 <- na.omit(df2)
head(df3)
## # A tibble: 6 x 4
## # Groups: year [1]
## year state new_case new_death
## <chr> <chr> <dbl> <dbl>
## 1 2020 AK 1159699 29616
## 2 2020 AL 955789 67412
## 3 2020 AR 1253350 61730
## 4 2020 AZ 1123729 72308
## 5 2020 CA 1170749 77734
## 6 2020 CO 1196825 68069
Find the state with the highest number of cases in any particular year.
df3[which.max(df3$new_case), ]
## # A tibble: 1 x 4
## # Groups: year [1]
## year state new_case new_death
## <chr> <chr> <dbl> <dbl>
## 1 2021 HI 1581555 47425
Find the state with the highest deaths in each year.
From the code below, we see that Alabama has the highest death in 2021. I was surprised to see this and did some research and found that California had the highest COVID death in 2021. My inference is that the data provided in by this source must be wrong.
df3[which.max(df3$new_death), ]
## # A tibble: 1 x 4
## # Groups: year [1]
## year state new_case new_death
## <chr> <chr> <dbl> <dbl>
## 1 2021 AL 1357245 117945
From the above analysis, it is seen that untidy data can be cleaned up and analysed using R script. However, the limitation is that if the source data is incorrect, then the result will look skewed.