#getwd()
#install.packages("cdlTools") for FIPS numbers
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(cdlTools)
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
Data Used: Dynata, T. N. (2020, July). Estimates from The New York Times, based on roughly 250,000 interviews conducted by Dynata from July 2 to July 14. Retrieved from https://github.com: https://github.com/nytimes/covid-19-data/tree/master/mask-use;
Thomas Hale, S. W. (2020). https://github.com. Retrieved from Oxford COVID-19 Government Response Tracker. Blavatnik School of Government.: https://github.com/OxCGRT/covid-policy-
maskuse <- read_csv("C:/Users/libcl/OneDrive/Documents/DATA110/NYTimesMaskWearing.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## COUNTYFP = col_double(),
## NEVER = col_double(),
## RARELY = col_double(),
## SOMETIMES = col_double(),
## FREQUENTLY = col_double(),
## ALWAYS = col_double()
## )
census2019 <- read.csv("C:/Users/libcl/OneDrive/Documents/DATA110/Co_est_census_2019.csv")
covid <- read_csv("C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## CountryName = col_character(),
## CountryCode = col_character(),
## RegionName = col_character(),
## RegionCode = col_character(),
## Jurisdiction = col_character(),
## C1_Notes = col_character(),
## C2_Notes = col_character(),
## C3_Notes = col_character(),
## C4_Notes = col_character(),
## C5_Notes = col_character(),
## C6_Notes = col_character(),
## C7_Notes = col_character(),
## C8_Notes = col_character(),
## E1_Notes = col_character(),
## E2_Notes = col_character(),
## E3_Notes = col_character(),
## E4_Notes = col_logical(),
## H1_Notes = col_character(),
## H2_Notes = col_character(),
## H3_Notes = col_character()
## # ... with 6 more columns
## )
## i Use `spec()` for the full column specifications.
## Warning: 5345 parsing failures.
## row col expected actual file
## 1009 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE 0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## 1010 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE 0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## 1011 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE 0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## 1012 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE 0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## 1013 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE 0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## .... ......................... .................. ...... .......................................................
## See problems(...) for more details.
#clean maskuse and census datasets
#change column heading to lower case
names(maskuse) <- tolower(names(maskuse))
names(census2019) <- tolower(names(census2019))
#drop last 3 digits of countyfp to get state fip
x <- as.integer(maskuse$countyfp)
statefp <- floor(x/1000)
#convert state fips code to state name
maskuse <- maskuse %>%
mutate(regionname = fips(statefp, to="Name"))
#write.csv(maskuse, "C:/Users/libcl/OneDrive/Documents/MyNYTimesMaskUseSurvey.csv", row.names = FALSE)
#select population estimate for 2019 to be able to join by FIPS code
census2019 <- census2019 %>%
summarize(stname, countyfp = state*1000 + county, state, county, popestimate2019)
#join by FIPS code
masks <- maskuse %>% left_join(census2019, by = "countyfp")
#write.csv(census2019, "C:/Users/libcl/OneDrive/Documents/MyCensus2019CountyFIPS.csv", row.names = FALSE)
#clean covid dataset
#change column heading to lower case
names(covid) <- tolower(names(covid))
#replace . with _ in column headings
names(covid) <- gsub(".","_",names(covid), fixed = TRUE)
names(covid) <- gsub(" ","_",names(covid), fixed = TRUE)
#write.csv(covid, "C:/Users/libcl/OneDrive/Documents/MyOxfordUSCovid.csv", row.names = FALSE)
#sum masks column by state into new column, state_pop
#mutate weighted county average of compliance categories into st_* new names
masks <- masks %>%
group_by(regionname) %>%
mutate(state_pop = sum(popestimate2019))
masks <- masks %>%
group_by(regionname) %>%
mutate(st_never = never*popestimate2019/state_pop, st_rarely = rarely*popestimate2019/state_pop, st_sometimes = sometimes*popestimate2019/state_pop, st_frequently = frequently*popestimate2019/state_pop, st_always = always*popestimate2019/state_pop)
# get a county score to use instead of 5 categories; give it a 100 index score to average for each state in the next chunk. 0*rarely, 25*rarely, 50*sometimes, 70*frequently, 100*always. then add across to score each county
masks <- masks %>%
group_by(regionname) %>%
mutate(score = 0*never + 25*rarely + 50*sometimes + 75*frequently + 100*always)
summary(masks)
## countyfp never rarely sometimes
## Min. : 1001 Min. :0.00000 Min. :0.00000 Min. :0.0010
## 1st Qu.:18178 1st Qu.:0.03400 1st Qu.:0.04000 1st Qu.:0.0790
## Median :29176 Median :0.06800 Median :0.07300 Median :0.1150
## Mean :30384 Mean :0.07994 Mean :0.08292 Mean :0.1213
## 3rd Qu.:45081 3rd Qu.:0.11300 3rd Qu.:0.11500 3rd Qu.:0.1560
## Max. :56045 Max. :0.43200 Max. :0.38400 Max. :0.4220
## frequently always regionname stname
## Min. :0.0290 Min. :0.1150 Length:3142 Length:3142
## 1st Qu.:0.1640 1st Qu.:0.3932 Class :character Class :character
## Median :0.2040 Median :0.4970 Mode :character Mode :character
## Mean :0.2077 Mean :0.5081
## 3rd Qu.:0.2470 3rd Qu.:0.6138
## Max. :0.5490 Max. :0.8890
## state county popestimate2019 state_pop
## Min. : 1.00 Min. : 1.0 Min. : 86 Min. : 578759
## 1st Qu.:18.00 1st Qu.: 35.0 1st Qu.: 10902 1st Qu.: 3155070
## Median :29.00 Median : 79.0 Median : 25726 Median : 6045680
## Mean :30.28 Mean :103.6 Mean : 104468 Mean : 8932447
## 3rd Qu.:45.00 3rd Qu.:133.0 3rd Qu.: 68073 3rd Qu.:10617423
## Max. :56.00 Max. :840.0 Max. :10039107 Max. :39512223
## st_never st_rarely st_sometimes
## Min. :0.0000000 Min. :0.0000000 Min. :4.700e-07
## 1st Qu.:0.0001108 1st Qu.:0.0001217 1st Qu.:2.119e-04
## Median :0.0003244 Median :0.0003396 Median :5.397e-04
## Mean :0.0009059 Mean :0.0009753 Mean :1.522e-03
## 3rd Qu.:0.0008565 3rd Qu.:0.0008824 3rd Qu.:1.407e-03
## Max. :0.0317992 Max. :0.0408248 Max. :6.900e-02
## st_frequently st_always score
## Min. :1.290e-06 Min. :0.0000032 Min. :35.83
## 1st Qu.:3.823e-04 1st Qu.:0.0008993 1st Qu.:67.05
## Median :9.492e-04 Median :0.0021227 Median :74.79
## Mean :3.099e-03 Mean :0.0097284 Mean :74.53
## 3rd Qu.:2.518e-03 3rd Qu.:0.0059780 3rd Qu.:82.29
## Max. :1.640e-01 Max. :0.7430000 Max. :96.22
# get state score by summing county score then dividing by number of counties for each state
masks <- masks %>%
group_by(regionname) %>%
summarize(regionname, st_score = sum(score)/n())
## `summarise()` regrouping output by 'regionname' (override with `.groups` argument)
#In later analysis we should filter regionname into 2 groups:st_score >= 82.29 and st_score <= 67.05 for 1st quartile and 3rd quartile from summary above and boxplot below for clearer mask compliance demarcation between states
p1 <- boxplot(masks$st_score)
#state only population data
population <- read_csv("C:/Users/libcl/OneDrive/Documents/DATA110/populationUSstates.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## rank = col_double(),
## State = col_character(),
## Pop = col_double(),
## Growth = col_double(),
## Pop2018 = col_double(),
## Pop2010 = col_double(),
## growthSince2010 = col_double(),
## Percent = col_double(),
## density = col_double()
## )
#clean data
names(population) <- tolower(names(population))
names(population) <- gsub(".","_",names(population), fixed = TRUE)
names(population) <- gsub(" ","_",names(population), fixed = TRUE)
#write.csv(covid, "C:/Users/libcl/OneDrive/Documents/MyPopulation2018State.csv", row.names = FALSE)
head(covid)
covid1 <- covid %>%
select(countryname, countrycode, regionname, date, confirmedcases, confirmeddeaths, containmenthealthindexfordisplay, economicsupportindexfordisplay)
head(covid1)
covidpop <- right_join(covid1, population,
by = c("regionname" = "state"))
covidpop <- covidpop %>%
filter(!is.na(countrycode)) %>% #remove puerto rico and dc and data not reported
group_by(regionname) %>%
#add new variable lub_date of class date, and compute new cases and deaths.
mutate(lub_date = ymd(date), newcases = confirmedcases - lag(confirmedcases), newdeaths = confirmeddeaths - lag(confirmeddeaths))
#use pop2018 to get a rate of for new cases and new deaths per 100 people
covidpop <- covidpop %>% #first drop calculated NAs
drop_na(newcases)
covidpop <- covidpop %>%
drop_na(newdeaths)
covidpop <- covidpop %>%
filter(pop2018 != 0) %>%
group_by(regionname) %>%
summarize(regionname, lub_date, containmenthealthindexfordisplay, economicsupportindexfordisplay, confirmedcases, confirmeddeaths, newcases, newdeaths, pop2018, newcases_rate = newcases*100/pop2018, newdeaths_rate = newdeaths*100/pop2018)
## `summarise()` regrouping output by 'regionname' (override with `.groups` argument)
covidpop <- covidpop %>%
mutate(status_cases = if_else(confirmedcases > lag(confirmedcases), "failure", "success"), status_deaths = if_else(confirmeddeaths > lag(confirmeddeaths), "failure", "success"))
covidpop %>%
ggplot() +
geom_boxplot(aes(x = newcases_rate)) +
xlab("New Case Rate (per 100 people)") +
ggtitle("New Covid Cases") +
coord_flip()
covidpop %>%
ggplot() +
geom_boxplot(aes(x = newdeaths_rate)) +
xlab("Death Rate (per 100 people)") +
ggtitle("New Covid Deaths") +
coord_flip()
covidpop %>%
ggplot() +
geom_histogram(aes(x = newcases_rate)) +
xlab("New Case Rate (per 100 people)") +
ggtitle("New Covid Cases")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
covidpop %>%
ggplot() +
geom_histogram(aes(x = newdeaths_rate)) +
xlab("New Death Rate (per 100 people)") +
ggtitle("New Covid Deaths")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
covidpop %>%
ggplot() +
geom_point(aes(x = lub_date, y = newcases_rate, alpha = 0.05)) +
xlab("Date") +
ylab("New Cases Rate (per 100 people)") +
ggtitle("New Covid Cases")
covidpop %>%
ggplot() +
geom_point(aes(x = lub_date, y = newdeaths_rate, alpha = 0.05)) +
xlab("Date") +
ylab("New Death Rate (per 100 people)") +
ggtitle("New Covid Deaths")
fit_cases <- lm(newcases_rate~lub_date, data = covidpop)
fit_deaths <- lm(newdeaths_rate~lub_date, data = covidpop)
summary(fit_cases)
##
## Call:
## lm(formula = newcases_rate ~ lub_date, data = covidpop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.039228 -0.009511 -0.001381 0.004270 0.268024
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.579e+00 3.175e-02 -81.24 <2e-16 ***
## lub_date 1.406e-04 1.722e-06 81.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01946 on 15648 degrees of freedom
## Multiple R-squared: 0.299, Adjusted R-squared: 0.2989
## F-statistic: 6674 on 1 and 15648 DF, p-value: < 2.2e-16
summary(fit_deaths)
##
## Call:
## lm(formula = newdeaths_rate ~ lub_date, data = covidpop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0017083 -0.0001827 -0.0001142 0.0000159 0.0105700
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.662e-02 7.536e-04 -22.05 <2e-16 ***
## lub_date 9.141e-07 4.087e-08 22.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0004619 on 15648 degrees of freedom
## Multiple R-squared: 0.03099, Adjusted R-squared: 0.03092
## F-statistic: 500.4 on 1 and 15648 DF, p-value: < 2.2e-16
qqplot(covidpop$lub_date, covidpop$newcases_rate)
qqplot(covidpop$lub_date, covidpop$newdeaths_rate)
#join covid and mask datasets
covidmasks <- covidpop %>% right_join(masks, by = "regionname")
#look at data only from 6/1/2000 forward
covidmasks <- covidmasks %>%
group_by(regionname) %>%
filter(lub_date >= "2020-06-01")
#write.csv(covid, "C:/Users/libcl/OneDrive/Documents/MyJoinedUSCovidMaskUse.csv", row.names = FALSE)
covidmasks <- covidmasks %>%
mutate(comply_quartile = if_else(st_score <= 67.05, 0,
if_else(st_score >= 82.29, 1, 99)))
covidrep7 <- covidmasks %>%
filter((regionname == "New York" | regionname == "Maryland" | regionname == "Florida" | regionname == "California" | regionname == "North Dakota" | regionname == "Iowa" | regionname == "Arizona" & !is.na(confirmedcases)))
summary(covidrep7)
## regionname lub_date containmenthealthindexfordisplay
## Length:69174 Min. :2020-06-01 Min. :27.43
## Class :character 1st Qu.:2020-07-16 1st Qu.:37.15
## Mode :character Median :2020-08-31 Median :48.96
## Mean :2020-08-31 Mean :51.31
## 3rd Qu.:2020-10-16 3rd Qu.:65.97
## Max. :2020-11-30 Max. :80.21
## NA's :395
## economicsupportindexfordisplay confirmedcases confirmeddeaths
## Min. : 0.00 Min. : 2625 Min. : 61
## 1st Qu.: 50.00 1st Qu.: 56446 1st Qu.: 975
## Median : 75.00 Median : 178888 Median : 4009
## Mean : 65.38 Mean : 307327 Mean : 9912
## 3rd Qu.: 87.50 3rd Qu.: 496655 3rd Qu.:16210
## Max. :100.00 Max. :1230264 Max. :34605
## NA's :515
## newcases newdeaths pop2018 newcases_rate
## Min. : 0 Min. :-39.00 Min. : 760077 Min. :0.000000
## 1st Qu.: 540 1st Qu.: 5.00 1st Qu.: 3156140 1st Qu.:0.008177
## Median : 1114 Median : 15.00 Median : 7171650 Median :0.015043
## Mean : 2578 Mean : 37.07 Mean :14651634 Mean :0.026995
## 3rd Qu.: 3649 3rd Qu.: 50.00 3rd Qu.:21299300 3rd Qu.:0.030702
## Max. :17344 Max. :278.00 Max. :39557000 Max. :0.298654
##
## newdeaths_rate status_cases status_deaths st_score
## Min. :-2.535e-04 Length:69174 Length:69174 Min. :56.01
## 1st Qu.: 9.505e-05 Class :character Class :character 1st Qu.:67.03
## Median : 1.945e-04 Mode :character Mode :character Median :79.78
## Mean : 3.262e-04 Mean :76.70
## 3rd Qu.: 4.070e-04 3rd Qu.:87.31
## Max. : 4.868e-03 Max. :90.17
##
## comply_quartile
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 1.00
## Mean :17.97
## 3rd Qu.: 1.00
## Max. :99.00
##
#We now have 17052 rows, and 45 columns of data
view(covidrep7)