#getwd()
#install.packages("cdlTools") #provides fips() for FIPS code conversion
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(cdlTools)
library(BSDA)
## Loading required package: lattice
## 
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
## 
##     Orange

Question: Is mask use associated with the rate of new COVID-19 cases or the rate of deaths due to COVID-19?

H_null: The probability of new COVID-19 cases and deaths is not associated with mask usage, i.e. p_highcompliance = p_lowcompliance for both new cases and new deaths.

H_a: The probability of new COVID-19 cases and deaths is associated with mask usage, i.e. p_highcompliance is not equal to p_lowcompliance.

Alpha is set at 0.05.
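
For reference, a two-proportion test of these hypotheses could be run with prop.test(); the sketch below uses hypothetical placeholder counts, not values from the data:

# Sketch (hypothetical counts): two-sided test of p_highcompliance vs
# p_lowcompliance at alpha = 0.05; x = success counts, n = group sizes
prop.test(x = c(900, 700),    # placeholder successes per group
          n = c(1000, 1000),  # placeholder group sizes
          alternative = "two.sided", conf.level = 0.95)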

Data Used:

The New York Times / Dynata (2020, July). Mask-wearing survey: estimates from The New York Times, based on roughly 250,000 interviews conducted by Dynata from July 2 to July 14, 2020. Retrieved from https://github.com/nytimes/covid-19-data/tree/master/mask-use

Hale, Thomas, et al. (2020). Oxford COVID-19 Government Response Tracker. Blavatnik School of Government. Retrieved from https://github.com/OxCGRT/covid-policy-

maskuse <- read_csv("C:/Users/libcl/OneDrive/Documents/DATA110/NYTimesMaskWearing.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   COUNTYFP = col_double(),
##   NEVER = col_double(),
##   RARELY = col_double(),
##   SOMETIMES = col_double(),
##   FREQUENTLY = col_double(),
##   ALWAYS = col_double()
## )
census2019 <- read.csv("C:/Users/libcl/OneDrive/Documents/DATA110/Co_est_census_2019.csv")
covid <- read_csv("C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   CountryName = col_character(),
##   CountryCode = col_character(),
##   RegionName = col_character(),
##   RegionCode = col_character(),
##   Jurisdiction = col_character(),
##   C1_Notes = col_character(),
##   C2_Notes = col_character(),
##   C3_Notes = col_character(),
##   C4_Notes = col_character(),
##   C5_Notes = col_character(),
##   C6_Notes = col_character(),
##   C7_Notes = col_character(),
##   C8_Notes = col_character(),
##   E1_Notes = col_character(),
##   E2_Notes = col_character(),
##   E3_Notes = col_character(),
##   E4_Notes = col_logical(),
##   H1_Notes = col_character(),
##   H2_Notes = col_character(),
##   H3_Notes = col_character()
##   # ... with 6 more columns
## )
## i Use `spec()` for the full column specifications.
## Warning: 5345 parsing failures.
##  row                       col           expected actual                                                    file
## 1009 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE   0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## 1010 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE   0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## 1011 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE   0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## 1012 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE   0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## 1013 H5_Investment in vaccines 1/0/T/F/TRUE/FALSE   0.00 'C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv'
## .... ......................... .................. ...... .......................................................
## See problems(...) for more details.
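
The parsing failures arise because read_csv() guesses column types from an initial block of rows; the H5 column starts out empty, gets guessed as logical, and then numeric values appear further down. A sketch of one fix is to raise guess_max so readr scans more rows before guessing:

# Sketch: let read_csv() scan more rows before guessing column types,
# which avoids the logical-vs-numeric mismatch on the H5 column
covid <- read_csv("C:/Users/libcl/OneDrive/Documents/OxfordData11_30.csv",
                  guess_max = 10000)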

Maskuse dataset: clean the data and translate the county FIPS code into a state name variable. Then select popestimate2019 from the census dataset to create population totals for each state, so we can join with the maskuse dataset and take a population-weighted average of each state's mask data.

#clean maskuse and census datasets
#change column heading to lower case
names(maskuse) <- tolower(names(maskuse))
names(census2019) <- tolower(names(census2019))
#drop the last 3 digits of countyfp to get the state FIPS code
x <- as.integer(maskuse$countyfp)
statefp <- floor(x/1000) 
#convert state fips code to state name 
maskuse <- maskuse %>% 
  mutate(regionname = fips(statefp, to="Name"))
#write.csv(maskuse, "C:/Users/libcl/OneDrive/Documents/MyNYTimesMaskUseSurvey.csv", row.names = FALSE)


#select population estimate for 2019 to be able to join by FIPS code
census2019 <- census2019 %>% 
  summarize(stname, countyfp = state*1000 + county, state, county, popestimate2019)
#join by FIPS code
masks <- maskuse %>% left_join(census2019, by = "countyfp")
#write.csv(census2019, "C:/Users/libcl/OneDrive/Documents/MyCensus2019CountyFIPS.csv", row.names = FALSE)


#clean covid dataset
#change column heading to lower case
names(covid) <- tolower(names(covid))
#replace . with _ in column headings
names(covid) <- gsub(".","_",names(covid), fixed = TRUE)
names(covid) <- gsub(" ","_",names(covid), fixed = TRUE)

#write.csv(covid, "C:/Users/libcl/OneDrive/Documents/MyOxfordUSCovid.csv", row.names = FALSE)

Manipulate the masks dataset to aggregate by state rather than county, so it can be joined with the COVID-19 data later. Weight each county's compliance categories by its popestimate2019 population from the U.S. Census into the new variables st_never, …, st_always. Then combine the categories on a 0-100 scale to get a county "score".

#sum popestimate2019 by state into a new column, state_pop
#mutate population-weighted shares of the compliance categories into new st_* columns

masks <- masks %>% 
  group_by(regionname) %>% 
  mutate(state_pop = sum(popestimate2019))
  
masks <- masks %>% 
  group_by(regionname) %>% 
  mutate(st_never = never*popestimate2019/state_pop,
         st_rarely = rarely*popestimate2019/state_pop,
         st_sometimes = sometimes*popestimate2019/state_pop,
         st_frequently = frequently*popestimate2019/state_pop,
         st_always = always*popestimate2019/state_pop)

# get a county score to use instead of 5 categories, on a 0-100 index we can average for each state in the next chunk: 0*never + 25*rarely + 50*sometimes + 75*frequently + 100*always, summed across to score each county

masks <- masks %>% 
  group_by(regionname) %>% 
  mutate(score = 0*never + 25*rarely + 50*sometimes + 75*frequently + 100*always)
summary(masks)
##     countyfp         never             rarely          sometimes     
##  Min.   : 1001   Min.   :0.00000   Min.   :0.00000   Min.   :0.0010  
##  1st Qu.:18178   1st Qu.:0.03400   1st Qu.:0.04000   1st Qu.:0.0790  
##  Median :29176   Median :0.06800   Median :0.07300   Median :0.1150  
##  Mean   :30384   Mean   :0.07994   Mean   :0.08292   Mean   :0.1213  
##  3rd Qu.:45081   3rd Qu.:0.11300   3rd Qu.:0.11500   3rd Qu.:0.1560  
##  Max.   :56045   Max.   :0.43200   Max.   :0.38400   Max.   :0.4220  
##    frequently         always        regionname           stname         
##  Min.   :0.0290   Min.   :0.1150   Length:3142        Length:3142       
##  1st Qu.:0.1640   1st Qu.:0.3932   Class :character   Class :character  
##  Median :0.2040   Median :0.4970   Mode  :character   Mode  :character  
##  Mean   :0.2077   Mean   :0.5081                                        
##  3rd Qu.:0.2470   3rd Qu.:0.6138                                        
##  Max.   :0.5490   Max.   :0.8890                                        
##      state           county      popestimate2019      state_pop       
##  Min.   : 1.00   Min.   :  1.0   Min.   :      86   Min.   :  578759  
##  1st Qu.:18.00   1st Qu.: 35.0   1st Qu.:   10902   1st Qu.: 3155070  
##  Median :29.00   Median : 79.0   Median :   25726   Median : 6045680  
##  Mean   :30.28   Mean   :103.6   Mean   :  104468   Mean   : 8932447  
##  3rd Qu.:45.00   3rd Qu.:133.0   3rd Qu.:   68073   3rd Qu.:10617423  
##  Max.   :56.00   Max.   :840.0   Max.   :10039107   Max.   :39512223  
##     st_never           st_rarely          st_sometimes      
##  Min.   :0.0000000   Min.   :0.0000000   Min.   :4.700e-07  
##  1st Qu.:0.0001108   1st Qu.:0.0001217   1st Qu.:2.119e-04  
##  Median :0.0003244   Median :0.0003396   Median :5.397e-04  
##  Mean   :0.0009059   Mean   :0.0009753   Mean   :1.522e-03  
##  3rd Qu.:0.0008565   3rd Qu.:0.0008824   3rd Qu.:1.407e-03  
##  Max.   :0.0317992   Max.   :0.0408248   Max.   :6.900e-02  
##  st_frequently         st_always             score      
##  Min.   :1.290e-06   Min.   :0.0000032   Min.   :35.83  
##  1st Qu.:3.823e-04   1st Qu.:0.0008993   1st Qu.:67.05  
##  Median :9.492e-04   Median :0.0021227   Median :74.79  
##  Mean   :3.099e-03   Mean   :0.0097284   Mean   :74.53  
##  3rd Qu.:2.518e-03   3rd Qu.:0.0059780   3rd Qu.:82.29  
##  Max.   :1.640e-01   Max.   :0.7430000   Max.   :96.22

Finally, we can use the county scores to compute a state score, stored in the new variable st_score.

# get the state score as the unweighted mean of county scores: sum the scores, then divide by the number of counties in each state
masks <- masks %>% 
  group_by(regionname) %>% 
  summarize(regionname, st_score = sum(score)/n())
## `summarise()` regrouping output by 'regionname' (override with `.groups` argument)
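
Note that st_score is an unweighted mean of county scores, so small and large counties count equally within a state. A population-weighted alternative, as a sketch (masks_weighted is an illustrative name; this would run in place of the chunk above, while popestimate2019 is still a column of masks):

# Sketch: population-weighted state score, computed before
# popestimate2019 is dropped by the summarize() above
masks_weighted <- masks %>%
  group_by(regionname) %>%
  summarize(st_score = weighted.mean(score, popestimate2019), .groups = "drop")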

View the quartiles with a boxplot.

#In later analysis we should filter regionname into 2 groups, st_score >= 82.29 (3rd quartile) and st_score <= 67.05 (1st quartile), per the summary above and the boxplot below, for a clearer mask-compliance demarcation between states
p1 <- boxplot(masks$st_score)
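
The cutoffs above (67.05 and 82.29) are read off summary(); as a sketch, they could instead be computed directly so they always track the data (cutoffs is an illustrative name):

# Sketch: compute the 1st- and 3rd-quartile cutoffs of st_score directly
# rather than hard-coding values read from summary()
cutoffs <- quantile(masks$st_score, probs = c(0.25, 0.75), na.rm = TRUE)
cutoffs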

Prepare to join the covid dataset by first selecting the columns and creating the new variables we will need.

#state only population data
population <- read_csv("C:/Users/libcl/OneDrive/Documents/DATA110/populationUSstates.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   rank = col_double(),
##   State = col_character(),
##   Pop = col_double(),
##   Growth = col_double(),
##   Pop2018 = col_double(),
##   Pop2010 = col_double(),
##   growthSince2010 = col_double(),
##   Percent = col_double(),
##   density = col_double()
## )
#clean data
names(population) <- tolower(names(population))
names(population) <- gsub(".","_",names(population), fixed = TRUE)
names(population) <- gsub(" ","_",names(population), fixed = TRUE)

#write.csv(population, "C:/Users/libcl/OneDrive/Documents/MyPopulation2018State.csv", row.names = FALSE)

Join the covid and population datasets so we can compute newcases and newdeaths rates per 100 people.

head(covid)
covid1 <- covid %>%
  select(countryname, countrycode, regionname, date, confirmedcases, confirmeddeaths,  containmenthealthindexfordisplay,  economicsupportindexfordisplay)
head(covid1)
covidpop <- right_join(covid1, population, 
              by = c("regionname" = "state"))

Remove Puerto Rico and DC, as their data is not reported. Add a new variable lub_date using the lubridate package to convert the date variable to class "Date". Then use lag() on the existing cumulative variables to create newcases and newdeaths, the cases and deaths added each day.

covidpop <- covidpop %>% 
  filter(!is.na(countrycode)) %>%   #remove puerto rico and dc and data not reported
  group_by(regionname) %>% 
#add new variable lub_date of class date, and compute new cases and deaths.
    mutate(lub_date = ymd(date), newcases = confirmedcases - lag(confirmedcases), newdeaths = confirmeddeaths - lag(confirmeddeaths))

Get a rate per 100 people for newcases and newdeaths

#use pop2018 to get a rate for new cases and new deaths per 100 people
covidpop <- covidpop %>% #first drop calculated NAs
  drop_na(newcases)
covidpop <- covidpop %>% 
  drop_na(newdeaths)
  
covidpop <- covidpop %>% 
  filter(pop2018 != 0) %>% 
    group_by(regionname) %>%
  summarize(regionname, lub_date, containmenthealthindexfordisplay,
            economicsupportindexfordisplay, confirmedcases, confirmeddeaths,
            newcases, newdeaths, pop2018,
            newcases_rate = newcases*100/pop2018,
            newdeaths_rate = newdeaths*100/pop2018)
## `summarise()` regrouping output by 'regionname' (override with `.groups` argument)
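
As an aside, newer dplyr versions discourage this row-preserving summarize() pattern; a sketch of the same per-100 rates via mutate(), which keeps all existing columns:

# Sketch: the same rates computed with mutate(), avoiding the
# row-preserving summarize() pattern
covidpop <- covidpop %>%
  filter(pop2018 != 0) %>%
  mutate(newcases_rate  = newcases  * 100 / pop2018,
         newdeaths_rate = newdeaths * 100 / pop2018)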

Set up the data with binary success/failure variables for newcases and newdeaths: a day with no increase counts as a "success", a day with an increase as a "failure".

covidpop <- covidpop %>%
  mutate(status_cases = if_else(confirmedcases > lag(confirmedcases), "failure", "success"),
         status_deaths = if_else(confirmeddeaths > lag(confirmeddeaths), "failure", "success"))
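
Equivalently, since newcases and newdeaths already hold the day-over-day differences, the same flags can be computed without a second lag(); a sketch:

# Sketch: equivalent status flags from the difference columns,
# avoiding a second lag() over the cumulative counts
covidpop <- covidpop %>%
  mutate(status_cases  = if_else(newcases  > 0, "failure", "success"),
         status_deaths = if_else(newdeaths > 0, "failure", "success"))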

Initial look at the variables by date: there does not appear to be a linear relationship; R^2 indicates the date explains only about 30% of the variance in the new-case rate and 3% of the variance in the new-death rate. This is not surprising, as the virus's growth has been described as exponential. The histograms are unimodal but highly skewed to the right, so normality is doubtful.

 covidpop %>% 
  ggplot() +
  geom_boxplot(aes(x = newcases_rate)) +
  xlab("New Case Rate (per 100 people)") +
  ggtitle("New Covid Cases") +
  coord_flip()

covidpop %>%
  ggplot() +
  geom_boxplot(aes(x = newdeaths_rate)) +
  xlab("Death Rate (per 100 people)") +
  ggtitle("New Covid Deaths") +
  coord_flip()

covidpop %>%
  ggplot() +
  geom_histogram(aes(x = newcases_rate)) +
  xlab("New Case Rate (per 100 people)") +
  ggtitle("New Covid Cases")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

covidpop %>%
  ggplot() +
  geom_histogram(aes(x = newdeaths_rate)) +
  xlab("New Death Rate (per 100 people)") +
  ggtitle("New Covid Deaths")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

covidpop %>%
  ggplot() +
  geom_point(aes(x = lub_date, y = newcases_rate), alpha = 0.05) +
  xlab("Date") +
  ylab("New Cases Rate (per 100 people)") +
  ggtitle("New Covid Cases")

covidpop %>%
  ggplot() +
  geom_point(aes(x = lub_date, y = newdeaths_rate), alpha = 0.05) +
  xlab("Date") +
  ylab("New Death Rate (per 100 people)") +
  ggtitle("New Covid Deaths")

fit_cases <- lm(newcases_rate~lub_date, data = covidpop)
fit_deaths <- lm(newdeaths_rate~lub_date, data = covidpop)
summary(fit_cases)
## 
## Call:
## lm(formula = newcases_rate ~ lub_date, data = covidpop)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.039228 -0.009511 -0.001381  0.004270  0.268024 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.579e+00  3.175e-02  -81.24   <2e-16 ***
## lub_date     1.406e-04  1.722e-06   81.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01946 on 15648 degrees of freedom
## Multiple R-squared:  0.299,  Adjusted R-squared:  0.2989 
## F-statistic:  6674 on 1 and 15648 DF,  p-value: < 2.2e-16
summary(fit_deaths)
## 
## Call:
## lm(formula = newdeaths_rate ~ lub_date, data = covidpop)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0017083 -0.0001827 -0.0001142  0.0000159  0.0105700 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.662e-02  7.536e-04  -22.05   <2e-16 ***
## lub_date     9.141e-07  4.087e-08   22.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0004619 on 15648 degrees of freedom
## Multiple R-squared:  0.03099,    Adjusted R-squared:  0.03092 
## F-statistic: 500.4 on 1 and 15648 DF,  p-value: < 2.2e-16
qqplot(covidpop$lub_date, covidpop$newcases_rate)

qqplot(covidpop$lub_date, covidpop$newdeaths_rate)
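
As an aside, qqplot() here compares the rate distribution against the date distribution; a sketch of the more conventional normality check compares each rate to a theoretical normal with qqnorm() and qqline():

# Sketch: standard normal Q-Q check for a single variable
qqnorm(covidpop$newcases_rate)
qqline(covidpop$newcases_rate)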

Sub-conclusion: from the scatterplots, the new-case rate does not appear linear in date, while the death rate may be; but normality is questionable because of the highly skewed histograms and the shape of the qqplots. Logistic regression may be attempted after analysis of the mask survey data if desired.
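
If logistic regression is pursued later, a sketch of the setup on the daily success/failure outcome (logit_fit and success are illustrative names; the sketch recodes status_cases to a 0/1 indicator first):

# Sketch: logistic regression of the daily outcome on date;
# success = 1 means no increase in cases that day
logit_fit <- covidpop %>%
  mutate(success = as.integer(status_cases == "success")) %>%
  glm(success ~ lub_date, family = binomial, data = .)
summary(logit_fit)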

Join the datasets, and reduce to dates starting from 6/1/2020.

#join covid and mask datasets
covidmasks <- covidpop %>% right_join(masks, by = "regionname")

#look at data only from 6/1/2020 forward
covidmasks <- covidmasks %>% 
  group_by(regionname) %>% 
  filter(lub_date >= "2020-06-01")
#write.csv(covid, "C:/Users/libcl/OneDrive/Documents/MyJoinedUSCovidMaskUse.csv", row.names = FALSE)

Add a new variable comply_quartile, set to 0 if the state score falls in the bottom quartile (never) or 1 if it falls in the top quartile (always); states in between are flagged 99.

covidmasks <- covidmasks %>%
  mutate(comply_quartile = if_else(st_score <= 67.05, 0,
                                   if_else(st_score >= 82.29, 1, 99)))
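
An alternative sketch of the same recode uses case_when() with NA instead of a 99 sentinel; the later filters on comply_quartile != 99 would then become !is.na(comply_quartile):

# Sketch: same recode via case_when(), flagging middle states as NA
covidmasks <- covidmasks %>%
  mutate(comply_quartile = case_when(
    st_score <= 67.05 ~ 0,
    st_score >= 82.29 ~ 1,
    TRUE ~ NA_real_
  ))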

It might be interesting to look at the covid data for individual states to see what is happening. I will filter for 7 states representative of several regions of the U.S. to reduce the size of the dataset.

covidrep7 <- covidmasks %>% 
  filter(regionname %in% c("New York", "Maryland", "Florida", "California",
                           "North Dakota", "Iowa", "Arizona"),
         !is.na(confirmedcases))  # %in% fixes the chained |, which bound !is.na() to Arizona only
  
summary(covidrep7)
##   regionname           lub_date          containmenthealthindexfordisplay
##  Length:69174       Min.   :2020-06-01   Min.   :27.43                   
##  Class :character   1st Qu.:2020-07-16   1st Qu.:37.15                   
##  Mode  :character   Median :2020-08-31   Median :48.96                   
##                     Mean   :2020-08-31   Mean   :51.31                   
##                     3rd Qu.:2020-10-16   3rd Qu.:65.97                   
##                     Max.   :2020-11-30   Max.   :80.21                   
##                                          NA's   :395                     
##  economicsupportindexfordisplay confirmedcases    confirmeddeaths
##  Min.   :  0.00                 Min.   :   2625   Min.   :   61  
##  1st Qu.: 50.00                 1st Qu.:  56446   1st Qu.:  975  
##  Median : 75.00                 Median : 178888   Median : 4009  
##  Mean   : 65.38                 Mean   : 307327   Mean   : 9912  
##  3rd Qu.: 87.50                 3rd Qu.: 496655   3rd Qu.:16210  
##  Max.   :100.00                 Max.   :1230264   Max.   :34605  
##  NA's   :515                                                     
##     newcases       newdeaths         pop2018         newcases_rate     
##  Min.   :    0   Min.   :-39.00   Min.   :  760077   Min.   :0.000000  
##  1st Qu.:  540   1st Qu.:  5.00   1st Qu.: 3156140   1st Qu.:0.008177  
##  Median : 1114   Median : 15.00   Median : 7171650   Median :0.015043  
##  Mean   : 2578   Mean   : 37.07   Mean   :14651634   Mean   :0.026995  
##  3rd Qu.: 3649   3rd Qu.: 50.00   3rd Qu.:21299300   3rd Qu.:0.030702  
##  Max.   :17344   Max.   :278.00   Max.   :39557000   Max.   :0.298654  
##                                                                        
##  newdeaths_rate       status_cases       status_deaths         st_score    
##  Min.   :-2.535e-04   Length:69174       Length:69174       Min.   :56.01  
##  1st Qu.: 9.505e-05   Class :character   Class :character   1st Qu.:67.03  
##  Median : 1.945e-04   Mode  :character   Mode  :character   Median :79.78  
##  Mean   : 3.262e-04                                         Mean   :76.70  
##  3rd Qu.: 4.070e-04                                         3rd Qu.:87.31  
##  Max.   : 4.868e-03                                         Max.   :90.17  
##                                                                            
##  comply_quartile
##  Min.   : 0.00  
##  1st Qu.: 0.00  
##  Median : 1.00  
##  Mean   :17.97  
##  3rd Qu.: 1.00  
##  Max.   :99.00  
## 
#We now have 69174 rows and 15 columns of data
view(covidrep7)

Before digging into the mask part of the data, I look at general scatterplots of the new-case and new-death rates by regionname, using log2 of each rate, to see if I can get a linear relationship. A logarithmic linear model looks like it would work, but since this is not our primary question, I elected not to run coefficients and summaries for prediction in this study. There is valuable information in the slope of the lines for cases or deaths in individual states; there are some hints that political drama may be correlated with those slopes.

p10 <- covidrep7 %>%
  group_by(regionname) %>% 
  summarize(regionname, lub_date, newcases_ratelog = log2(newcases_rate)) %>% 
  ggplot() +
  geom_point(aes(x = lub_date, y = newcases_ratelog)) +
  geom_smooth(aes(x = lub_date, y = newcases_ratelog), method = lm, se = FALSE, color = "red") +
  ggtitle("New Cases Rate by Individual State") +
  xlab("Date") +
  ylab("New Cases Rate(Log)") +
  facet_wrap(regionname~.)
## `summarise()` regrouping output by 'regionname' (override with `.groups` argument)
p10
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).

p11 <- covidrep7 %>%
  group_by(regionname) %>% 
  summarize(regionname, lub_date, newdeaths_ratelog = log2(newdeaths_rate)) %>% 
  ggplot() +
  geom_point(aes(x = lub_date, y = newdeaths_ratelog)) +
  geom_smooth(aes(x = lub_date, y = newdeaths_ratelog), method = lm, se = FALSE, color = "red") +
  ggtitle("New Deaths Rate by Individual State") +
  xlab("Date") +
  ylab("New Deaths Rate(Log)") +
  facet_wrap(regionname~.)
## Warning in mask$eval_all_summarise(quo): NaNs produced

## Warning in mask$eval_all_summarise(quo): NaNs produced

## Warning in mask$eval_all_summarise(quo): NaNs produced

## Warning in mask$eval_all_summarise(quo): NaNs produced
## `summarise()` regrouping output by 'regionname' (override with `.groups` argument)
p11
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4345 rows containing non-finite values (stat_smooth).
## Warning: Removed 450 rows containing missing values (geom_point).

Look at the linear regression model again with this final form of the dataset, now flagged for the two treatment groups.

#fit1 has about 25% R-squared on the dataset reduced by date; this suggests there may be a difference between low and high mask compliance; we may look at multiple regression using lub_date, the *_rate outcome, and a categorical low/high-compliance term if we decide to build a model predicting the rate from mask usage of never or always.

fit1 <- lm(newcases_rate ~ lub_date, data = covidmasks)
summary(fit1)
## 
## Call:
## lm(formula = newcases_rate ~ lub_date, data = covidmasks)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.047242 -0.013407 -0.002713  0.006094  0.259984 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.805e+00  1.090e-02  -440.6   <2e-16 ***
## lub_date     2.609e-04  5.893e-07   442.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0236 on 574801 degrees of freedom
## Multiple R-squared:  0.2543, Adjusted R-squared:  0.2543 
## F-statistic: 1.961e+05 on 1 and 574801 DF,  p-value: < 2.2e-16
qqplot(covidmasks$lub_date, covidmasks$newcases_rate)
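
The multiple regression mentioned above might look like this sketch, restricted to the two treatment groups (fit2 is an illustrative name):

# Sketch: add the compliance group as a categorical term alongside date,
# keeping only the two treatment groups
fit2 <- lm(newcases_rate ~ lub_date + factor(comply_quartile),
           data = filter(covidmasks, comply_quartile != 99))
summary(fit2)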

The scatterplots for the two groups follow, with color: light blue for compliance = always, dark blue for non-compliance = never. This also hints that something may be happening with the level of mask wearing specifically.

p1 <- covidmasks %>% 
  filter(comply_quartile != 99) %>%
  ggplot() +
  geom_point(aes(x = lub_date, y = newcases_rate, color = comply_quartile)) +
  xlab("Date") +
  ylab("New Case Rate (per 100 people)") +
  ggtitle("New Cases for Never vs. Always Masks")

p2 <- covidmasks %>% 
  filter(comply_quartile != 99) %>% 
  ggplot() +
  geom_point(aes(x = lub_date, y = newdeaths_rate, color = comply_quartile)) + 
  xlab("Date") +
  ylab("New Death Rate(per 100 people)") +
  ggtitle("New Deaths for Never vs. Always Masks")


p1

p2

##### Compare newcases_rate and newdeaths_rate by state, with the date lagged 28 days

p3 <- covidrep7 %>% 
  group_by(regionname) %>% 
  summarize(regionname, lub_date, date_lag = lag(lub_date, 28), newcases_rate, st_score) %>% 
  ggplot() +
  geom_point(aes(x = date_lag, y = newcases_rate, color = st_score)) + scale_color_viridis_c(name = "State Score") +
  facet_wrap(regionname~.) +
  ggtitle("State New Cases")
## `summarise()` regrouping output by 'regionname' (override with `.groups` argument)
p3
## Warning: Removed 196 rows containing missing values (geom_point).

p4 <- covidrep7 %>% 
  group_by(regionname) %>% 
  summarize(regionname, lub_date, date_lag = lag(lub_date, 28), newdeaths_rate, st_score) %>% 
  ggplot() +
  geom_point(aes(x = date_lag, y = newdeaths_rate, color = st_score)) + scale_color_viridis_c(name = "State Score") +
  facet_wrap(regionname~.) +
  ggtitle("State New Deaths")
## `summarise()` regrouping output by 'regionname' (override with `.groups` argument)
p4
## Warning: Removed 196 rows containing missing values (geom_point).

Let's look at the 4 histograms for the 2 compliance categories, covering both cases and deaths.

covidmasks %>% 
  filter(comply_quartile == 1) %>% 
  ggplot() +
  geom_histogram(aes(x = newcases_rate), binwidth = .01) +
  ggtitle("New Cases - Always Wears Mask") +
  xlab("Rate per 100 people")

covidmasks %>% 
  filter(comply_quartile == 0) %>% 
  ggplot() +
  geom_histogram(aes(x = newcases_rate), binwidth = .01) +
  ggtitle("New Cases - Never Wears Mask") +
  xlab("Rate per 100 people")

covidmasks %>% 
  filter(comply_quartile == 1) %>% 
  ggplot() +
  geom_histogram(aes(x = newdeaths_rate), binwidth = .0001) +
  ggtitle("New Deaths - Always Wears Mask") +
  xlab("Rate per 100 people")

covidmasks %>% 
  filter(comply_quartile == 0) %>% 
  ggplot() +
  geom_histogram(aes(x = newdeaths_rate), binwidth = .0001) +
  ggtitle("New Deaths - Never Wears Mask") +
  xlab("Rate per 100 people")

The t-test does not seem appropriate because of the large skew; instead we can use a chi-square test for association between treatment type (low vs. high mask-wearing compliance) and outcome, with success defined as no daily increase in cases or deaths, in a 2-way table:

                Always Masks    Never Masks
    Success     count success   count success
    Failure     count failure   count failure

Compute the counts for the chi-square test for independence in R to enter into the 2-way tables.

#count Success and Failure for the high-compliance (always) group, new cases
covidmasks %>% 
  filter(comply_quartile == 1) %>%
  group_by(status_cases) %>% 
  summarize(n())
## `summarise()` ungrouping output (override with `.groups` argument)
#count Success and Failure for the low-compliance (never) group, new cases
covidmasks %>% 
  filter(comply_quartile == 0) %>%
  group_by(status_cases) %>% 
  summarize(n())
## `summarise()` ungrouping output (override with `.groups` argument)
#count Success and Failure for the high-compliance (always) group, new deaths
covidmasks %>% 
  filter(comply_quartile == 1) %>%
  group_by(status_deaths) %>% 
  summarize(n())
## `summarise()` ungrouping output (override with `.groups` argument)
#count Success and Failure for the low-compliance (never) group, new deaths
covidmasks %>% 
  filter(comply_quartile == 0) %>%
  group_by(status_deaths) %>% 
  summarize(n())
## `summarise()` ungrouping output (override with `.groups` argument)
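
As a sketch, the same 2x2 tables can be built directly with table() instead of transcribing the counts by hand (tab_cases and tab_deaths are illustrative names):

# Sketch: build the 2x2 contingency tables straight from the data
tab_cases <- with(filter(covidmasks, comply_quartile != 99),
                  table(comply_quartile, status_cases))
tab_deaths <- with(filter(covidmasks, comply_quartile != 99),
                   table(comply_quartile, status_deaths))
chisq.test(tab_cases)
chisq.test(tab_deaths)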

Perform the chi-square test for independence on cases, using the counts from the chunks above.

success <- c(1345, 1522)
failure <- c(97475, 148172)
compliance <- data.frame(success, failure)
chisq.test(compliance)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  compliance
## X-squared = 61.583, df = 1, p-value = 4.244e-15
# returns p = 4.244e-15 < 0.05; very strong evidence against independence, i.e. of an association

Perform the chi-square test for independence on deaths.

success <- c(6835, 30411)
failure <- c(91985, 119283)
compliance <- data.frame(success, failure)
chisq.test(compliance)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  compliance
## X-squared = 8386.2, df = 1, p-value < 2.2e-16
# returns p < 2.2e-16 < 0.05; very strong evidence against independence, i.e. of an association

The two groups do not appear normally distributed from the histograms, but they are independent samples, so the non-parametric wilcox.test in R will be used for hypothesis testing. First break the data into two datasets, 'always' and 'never', to perform the Wilcoxon-Mann-Whitney test of whether the distributions coincide or one is shifted significantly away from the other.

covidalways <- covidmasks %>% 
  filter(comply_quartile == 1)

covidnever <- covidmasks %>% 
  filter(comply_quartile == 0)


# The non-parametric Wilcoxon test shows we reject the null
wilcox.test(covidnever$newcases_rate, covidalways$newcases_rate)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  covidnever$newcases_rate and covidalways$newcases_rate
## W = 1.0349e+10, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(covidnever$newdeaths_rate, covidalways$newdeaths_rate)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  covidnever$newdeaths_rate and covidalways$newdeaths_rate
## W = 7.675e+09, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
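
A p-value alone does not show the direction of the shift; a quick sketch comparing the group medians makes it concrete:

# Sketch: group medians as a simple check on the direction of the shift
median(covidnever$newcases_rate, na.rm = TRUE)
median(covidalways$newcases_rate, na.rm = TRUE)
median(covidnever$newdeaths_rate, na.rm = TRUE)
median(covidalways$newdeaths_rate, na.rm = TRUE)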

CONCLUSION:

Both the Wilcoxon tests and the chi-square tests for independence reject the null hypothesis, so there is strong evidence that the distributions of new-case and new-death rates for the high mask-wearing and low mask-wearing populations are significantly shifted from each other. Together with the colored scatterplots for each group, this strongly suggests that mask wearing may lower the rate of covid cases and deaths.