# install trackdown
if(!require("trackdown")) {
  install.packages("trackdown")
  stopifnot(require("trackdown"))
}
## Loading required package: trackdown
## Warning: package 'trackdown' was built under R version 4.1.1
# download from google drive
trackdown::download_file("601proj2.Rmd", shared_drive = "NYP Data Science Team 6")
# update the google docs with your local version
trackdown::update_file("601proj2.Rmd", shared_drive = "NYP Data Science Team 6")

Read data Contruct cases

ds.nyt = read.csv("us-counties.csv")
ds.nyt$date = as.Date(ds.nyt$date, "%Y-%m-%d")
ds.nyt$county = tolower(ds.nyt$county)

ds.city = read.csv("NYP_City_List.csv")
ds.city$County = tolower(ds.city$County)
ds.city$Mask.Mandate = factor(ds.city$Mask.Mandate, c("No", "Yes"))
county_list = gsub("(.*)\\scounty","\\1", ds.city$County)

getY = function(start_time = "2021-08-20", end_time = "2021-10-20") {
  start.date = as.Date(start_time, "%Y-%m-%d")
  end.date = as.Date(end_time, "%Y-%m-%d")
  y = c()
  for (i in 1:length(county_list)) {
    tmp = ds.nyt[ ds.nyt$county == county_list[i] & 
                    ds.nyt$state == ds.city$State[i], ]
    y[i] = (tmp[tmp$date == end.date, "cases"]
            - tmp[tmp$date == start.date, "cases"]) / ds.city$population[i] * 100000
  }
  y
}
ds.city$y = getY()

get inital model

o = lm(y~Mask.Mandate, ds.city, ds.city$County!="duval county")
summary(o)
## 
## Call:
## lm(formula = y ~ Mask.Mandate, data = ds.city, subset = ds.city$County != 
##     "duval county")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -892.1 -488.5 -165.7  642.6 1109.4 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2353.3      252.7   9.312 7.34e-08 ***
## Mask.MandateYes   -817.0      323.3  -2.527   0.0224 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 668.6 on 16 degrees of freedom
## Multiple R-squared:  0.2853, Adjusted R-squared:  0.2406 
## F-statistic: 6.387 on 1 and 16 DF,  p-value: 0.02241
g = glm(y~Mask.Mandate, data = ds.city, family = Gamma(),
            subset = ds.city$County!="duval county")
summary(g)
## 
## Call:
## glm(formula = y ~ Mask.Mandate, family = Gamma(), data = ds.city, 
##     subset = ds.city$County != "duval county")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4415  -0.3500  -0.1123   0.3048   0.4777  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.249e-04  5.723e-05   7.424 1.44e-06 ***
## Mask.MandateYes 2.260e-04  9.037e-05   2.500   0.0237 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.1269934)
## 
##     Null deviance: 2.7772  on 17  degrees of freedom
## Residual deviance: 1.9801  on 16  degrees of freedom
## AIC: 285.81
## 
## Number of Fisher Scoring iterations: 5

deal with previous cases

getYtilde = function(start2="2021-8-7", end2="2021-8-20") {
  start2 = as.Date(start2, "%Y-%m-%d")
  end2 = as.Date(end2, "%Y-%m-%d")
  Ytilde = c()
  for (i in 1:length(county_list)) {
    tmp = ds.nyt[
      ds.nyt$county == county_list[i] &
        ds.nyt$state == ds.city$State[i], ]
    Ytilde[i] = (tmp[tmp$date == end2, "cases"]
                 - tmp[tmp$date == start2, "cases"]) / ds.city$population[i] * 100000
  }
  Ytilde
}
ds.city$Y7 = getYtilde("2021-8-13")
ds.city$Y14 = getYtilde("2021-8-6")
ds.city$Y30 = getYtilde("2021-7-21")
ds.city$Y60 = getYtilde("2021-6-21")
ds.city$Yall = getYtilde("2021-01-21")

layout(matrix(1:5,1,5,T))
par(cex.axis=1.2, cex.main=1.5)
boxplot(Y7~Mask.Mandate, ds.city, main="-7d", ylab = "Cumulative Case", xlab = NA)
boxplot(Y14~Mask.Mandate, ds.city, main="-14d", ylab = NA, xlab = NA)
boxplot(Y30~Mask.Mandate, ds.city, main="-30d", ylab = NA, xlab = NA)
boxplot(Y60~Mask.Mandate, ds.city, main="-60d", ylab = NA, xlab = NA)
boxplot(Yall~Mask.Mandate, ds.city, main="All", ylab = NA, xlab = NA)

# t.test(Y1~Mask.Mandate, ds.city)

adjusted anaylsis

o2 = lm(y~Y14+Mask.Mandate, ds.city, subset = ds.city$County!="duval county")
summary(o2)
## 
## Call:
## lm(formula = y ~ Y14 + Mask.Mandate, data = ds.city, subset = ds.city$County != 
##     "duval county")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -989.10  -34.44   18.09  187.12  681.10 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      904.4376   311.9109   2.900   0.0110 *  
## Y14                2.8670     0.5374   5.334 8.34e-05 ***
## Mask.MandateYes -521.6632   203.8266  -2.559   0.0218 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 405.7 on 15 degrees of freedom
## Multiple R-squared:  0.7533, Adjusted R-squared:  0.7204 
## F-statistic:  22.9 on 2 and 15 DF,  p-value: 2.762e-05
g2 = glm(y~Y14+Mask.Mandate, ds.city, family = Gamma(),
         subset = ds.city$County!="duval county")
summary(g2)
## 
## Call:
## glm(formula = y ~ Y14 + Mask.Mandate, family = Gamma(), data = ds.city, 
##     subset = ds.city$County != "duval county")
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.52217  -0.12153  -0.00162   0.12968   0.25432  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.992e-04  9.281e-05   9.689 7.56e-08 ***
## Y14             -8.448e-07  1.439e-07  -5.872 3.07e-05 ***
## Mask.MandateYes  1.271e-04  5.051e-05   2.516   0.0238 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.03711809)
## 
##     Null deviance: 2.77722  on 17  degrees of freedom
## Residual deviance: 0.62792  on 15  degrees of freedom
## AIC: 266.91
## 
## Number of Fisher Scoring iterations: 4
plot(y~Y14, ds.city, 
     col=(ds.city$Mask.Mandate=="Yes")+1, 
     pch=(ds.city$Mask.Mandate=="Yes")*15+1,
     xlab="Prior 14 days cumulative cases", ylab="outcome(60 days cumulative cases)")
abline(lm(y~Y14, ds.city, subset = ds.city$Mask.Mandate=="Yes"), col=2)
abline(lm(y~Y14, ds.city, subset = ds.city$Mask.Mandate=="No"), col=1, lty=2)
abline(lm(y~Y14, ds.city, subset = ds.city$Mask.Mandate=="No" & ds.city$County!="duval county"))
legend("bottomright", 
       legend = c("Mask required", "Not required", "remove outlier"),
       col=c(2,1,1), lty=c(1,2,1), pch=c(16,1,NA))
text(ds.city$Y14[5]-80, ds.city$y[5]+100, labels = "Jacksonville")

Change start date

outlier is removed

N = 31
Sys.setenv(LANG = "en")
sen = data.frame(
  xlab=-10:20,
  coef=rep(0,N), p=rep(0,N))
sen$date = sen$xlab + as.Date("2021-08-20", "%Y-%m-%d")

for (i in 1:N) {
  ds.city$y1 = getY(
    start_time = sen$date[i], end_time = sen$date[i]+60
  )
  #oo=lm(y1~Mask.Mandate, ds.city, subset = ds.city$County!="duval county")
  oo=glm(y1~Mask.Mandate, ds.city, subset = ds.city$County!="duval county", family = Gamma())
  sen$coef[i]=coef(oo)["Mask.MandateYes"]
  #sen$p[i]=coef(summary(oo)[4])["Mask.MandateYes",4]
  sen$p[i]=coef(summary(oo)[12])["Mask.MandateYes",4]
}

plot

layout(matrix(1:2,1,2,T))
par(cex.axis=1.2)
plot(sen$xlab, sen$coef, ylab=NA, 
     main="Coef[\"Mask\"]", xlab = NA, type = "b")
plot(sen$xlab, sen$p, ylab=NA, 
     main="P-value", xlab = NA, type = "b")

# mtext("Shifting the time window", side = 1, line = -2,  outer = TRUE, cex=1.5)

Add vaccine

ds.vac = read.csv("casevac.csv")
ds.vac$date = as.Date(ds.vac$date)
ds.city$v = ds.vac$Series_Complete_Pop_Pct[ds.vac$date == "2021-08-21"]
summary(lm(y~v+Mask.Mandate, ds.city, subset = ds.city$County!="duval county"))
## 
## Call:
## lm(formula = y ~ v + Mask.Mandate, data = ds.city, subset = ds.city$County != 
##     "duval county")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1027.4  -441.7  -145.9   564.7  1148.3 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2588.981    492.243   5.260 9.62e-05 ***
## v                 -5.066      9.009  -0.562   0.5822    
## Mask.MandateYes -827.214    330.920  -2.500   0.0245 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 683.4 on 15 degrees of freedom
## Multiple R-squared:    0.3,  Adjusted R-squared:  0.2067 
## F-statistic: 3.215 on 2 and 15 DF,  p-value: 0.06887

Add previous cases as virate

N = 31
Sys.setenv(LANG = "en")
sen = data.frame(
  xlab=-10:20,
  coef=rep(0,N), p=rep(0,N))
sen$date = sen$xlab + as.Date("2021-08-20", "%Y-%m-%d")

for (i in 1:N) {
  ds.city$y1 = getY(
    start_time = sen$date[i], end_time = sen$date[i]+60
  )
  ds.city$y2 = getYtilde(start2 = sen$date[i] - 14, end2 = sen$date[i] - 1)
  #oo=lm(y1~y2+Mask.Mandate, ds.city, subset = ds.city$County!="duval county")
  oo=glm(y1~y2+Mask.Mandate, ds.city, subset = ds.city$County!="duval county", family = Gamma())
  sen$coef[i]=coef(oo)["Mask.MandateYes"]
  #sen$p[i]=coef(summary(oo)[4])["Mask.MandateYes",4]
  sen$p[i]=coef(summary(oo)[12])["Mask.MandateYes",4]
}
layout(matrix(1:2,1,2,T))
par(cex.axis=1.2)
plot(sen$xlab, sen$coef, ylab=NA, 
     main="Coef[\"Mask\"]", xlab = NA, type = "b")
plot(sen$xlab, sen$p, ylab=NA, 
     main="P-value", xlab = NA, type = "b")

# mtext("Shifting the time window", side = 1, line = -2,  outer = TRUE, cex=1.5)

Diagnostic plot for GLM

layout(matrix(1:4, 2, 2, T))
plot(g2)
mtext("With prior case", side=3, line=-1, outer = T)