This was last published on: April 20, 2021

METHODS

In this study I will use the refitted LASSO (relaxo) because my goal is inferential: to obtain an active set that helps explain why certain counties were impacted as they were.
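
To make the two-step idea concrete, below is a minimal sketch with simulated data (not the project data), assuming only glmnet: the LASSO is used solely to choose an active set, and an ordinary least squares model is then refit on those columns alone; the same workflow is followed in the sections below.

# minimal sketch of the refitted (relaxed) LASSO: select with the LASSO, refit with OLS
set.seed(1)
x <- matrix(rnorm(100 * 10), nrow = 100)
y <- x[, 1] - 2 * x[, 2] + rnorm(100)

# step 1: cross-validated LASSO, keep the coefficients that are non-zero at lambda.1se
cvFit <- glmnet::cv.glmnet(x = x, y = y, alpha = 1)
activeSet <- which(abs(coef(cvFit, s = "lambda.1se")[-1]) > 0)

# step 2: refit ordinary least squares on the selected columns only
refit <- lm(y ~ ., data = as.data.frame(x[, activeSet, drop = FALSE]))
summary(refit)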

IMPORT DATA

COVID-19 Data

Steps:

  1. Import a subset of the COVID-19 data
  2. Since the COVID-19 data is temporal, I need to remove the time element
    • I’ll do this by selecting each county’s 56th day after its first confirmed case
  3. Remove unnecessary fields
  4. Set a key for faster merges downstream
  5. Finally, house cleaning - remove datasets no longer needed

Notes:

  • The supervisor is the cumulative total number of confirmed cases in each county
  • Only counties that had reached the 56th day after their first confirmed case will be considered
    • This is because my goal is inferential: I hope to better understand why the counties were impacted as they were, given the design matrix
  • In addition to the COVID-19 data, the import also pulls in Google mobility data (also a time series)
# import covid
covid <-
  data.table::fread(
    file = "C:\\Users\\fsusa\\Documents\\git\\tamu_stat656_project\\stat656_covid.csv",
    nThread = ncores,
    header = TRUE,
    check.names=TRUE,
    select = c("County_FIPS","Confirmed_Cases",
               "retail_and_recreation_percent_change_from_baseline",
               "grocery_and_pharmacy_percent_change_from_baseline",
               "parks_percent_change_from_baseline",
               "transit_stations_percent_change_from_baseline",
               "workplaces_percent_change_from_baseline",
               "residential_percent_change_from_baseline",
               "has_confirmed_cases_flag",
               "Days_since_first_incidenct_Confirmed_Cases"))

# filter covid data
## filter for counties with cases
## filter for 56th day
covid2 = covid[has_confirmed_cases_flag==1][Days_since_first_incidenct_Confirmed_Cases==56]

# filter covid again for certain fields
covid3 = covid2[,!"has_confirmed_cases_flag"][,!"Days_since_first_incidenct_Confirmed_Cases"]

# data.table efficiency
data.table::setkey(covid3,County_FIPS)

# HOUSE CLEANING
rm(covid, covid2)
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2258918 120.7    4090310 218.5  4090310 218.5
## Vcells 5269373  40.3   21396431 163.3 16669140 127.2

Before moving on to the next step, let’s take a quick glance at the data

str(covid3)
## Classes 'data.table' and 'data.frame':   3139 obs. of  8 variables:
##  $ County_FIPS                                       : int  1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
##  $ Confirmed_Cases                                   : int  127 216 147 62 47 71 312 127 326 30 ...
##  $ retail_and_recreation_percent_change_from_baseline: int  3 -13 5 NA 7 NA NA 3 14 NA ...
##  $ grocery_and_pharmacy_percent_change_from_baseline : int  8 26 -7 NA 14 NA 4 3 13 15 ...
##  $ parks_percent_change_from_baseline                : int  NA 37 NA NA NA NA NA NA NA NA ...
##  $ transit_stations_percent_change_from_baseline     : int  NA -5 NA NA NA NA NA NA NA NA ...
##  $ workplaces_percent_change_from_baseline           : int  -27 -16 -17 -68 -28 -24 -25 -28 -32 -19 ...
##  $ residential_percent_change_from_baseline          : int  10 5 NA NA 7 NA NA 8 9 NA ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "County_FIPS"

Feature matrix

  • Next up, import the feature matrix
#import feature matrix
features <- data.table::fread(
  file = "C:\\Users\\fsusa\\Documents\\git\\tamu_stat656_project\\stat656_data.csv",
  header = TRUE,
  stringsAsFactors = TRUE,
  nThread = ncores)

# data.table efficiency
data.table::setkey(features,FIPS_Code)

Once again, let’s take a peek at the data

  • Since there are quite a few variables we’ll only look at the dimensions of the matrix
dim(features)
## [1] 3143   34

MERGE DATA

Steps:

  1. Merge the COVID-19 data and the feature matrix
  2. Remove unnecessary fields
  3. Set the County FIPS codes as row names
  4. Remove the County FIPS field
  5. House cleaning
#merge covid and features
dataAll <- merge(covid3, features, by.x = "County_FIPS", by.y = "FIPS_Code")

#remove unneeded features
dataAll2 <- dataAll[,!"State.x"][,!"Area_name"]

# set row names
row.names(dataAll2) <- dataAll2$County_FIPS

#remove county fips codes as feature
dataAll2 <- dataAll2[,!"County_FIPS"]

# coerce all columns to numeric
dataAll2[,names(dataAll2):=lapply(.SD, as.numeric)]

# HOUSE CLEANING
rm(covid3, features, dataAll)
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2267689 121.2    4090557 218.5  4090557 218.5
## Vcells 3962851  30.3    8388608  64.0  6646160  50.8

DATA PARTITIONS

# create data partition
## attempt 1
#trainIndex <- createDataPartition(dataAll2$Confirmed_Cases, p = 0.75, list = FALSE) %>% as.vector(.)
## attempt 2: 50% train, then split the remaining rows 50/50 into test and validation
trainIndex <- createDataPartition(dataAll2$Confirmed_Cases, p = 0.5, list = FALSE) %>% as.vector(.)
validSplit <- createDataPartition(dataAll2[-trainIndex, Confirmed_Cases], p = 0.5, list = FALSE) %>% as.vector(.)

n <- nrow(dataAll2)

## splits
testIndex <- (1:n)[-trainIndex]                # every row not in the training set
validIndex <- (1:n)[-trainIndex][-validSplit]  # non-training rows not picked by validSplit

## assign roles: every non-training row starts as "test",
## then the validation subset overwrites its share
role <- rep("train", n)
role[testIndex] <- "test"
role[validIndex] <- "validation"

Before moving forward, let’s check that the ratio of records to features (n/p) in each split is at least 10.

table(role)/(ncol(dataAll2)-1)
## role
##       test      train validation 
##   21.18919   42.37838   21.13514

DATA PREP

Missingness

  • From the earlier look at the structure of the data we saw that there were missing values
  • Let’s see how many and to what extent
threshMissingness <- 0.33

trainMissing <-
sapply(dataAll2[role=="train"],function(x){sum(is.na(x))}) %>%
  as_tibble(.) %>%
  mutate(ID=names(dataAll2),
         missing = value,
         n = nrow(dataAll2[role=="train"])) %>%
  mutate(pctMissing = round(missing/n,4)) %>%
  select(-1) %>%
  filter(ID != "Confirmed_Cases") %>%
  arrange(.,desc(pctMissing)) %>%
  as.data.frame(.)

trainMissing %>%
  filter(pctMissing >= threshMissingness) %>%
  kable(.) %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)
ID                                                   missing     n  pctMissing
GDP_All_industries_2019                                 1531  1568      0.9764
parks_percent_change_from_baseline                      1269  1568      0.8093
transit_stations_percent_change_from_baseline           1084  1568      0.6913
residential_percent_change_from_baseline                 996  1568      0.6352
grocery_and_pharmacy_percent_change_from_baseline        861  1568      0.5491
retail_and_recreation_percent_change_from_baseline       746  1568      0.4758
  • There are 6 features with missingness greater than the threshold of 0.33
  • After removing the above features we’re left with 31 features
  • The majority of the features above are derived from the Google mobility data
    • Those can be identified by the _percent_change_from_baseline naming convention
  • Since the above features have too much missingness let’s remove them
  • Then, we’ll center, scale, and impute the missing values using the caret library
# keep features with missingness less than the threshold
varsFiltered <-
  trainMissing %>%
  filter(pctMissing < threshMissingness) %>%
  select(ID)

#####################################################

xTrainPreProc <-
  dataAll2 %>%
  as_tibble(.) %>%
  filter(role == "train") %>%
  select(varsFiltered$ID) %>%
  preProcess(., method = c("center","scale","bagImpute"))

#####################################################

xTrain <-
  dataAll2 %>%
  as_tibble(.) %>%
  filter(role == "train") %>%
  select(varsFiltered$ID) %>%
  predict(xTrainPreProc,.)

xValidation <-
  dataAll2 %>%
  as_tibble(.) %>%
  filter(role == "validation") %>%
  select(varsFiltered$ID) %>%
  predict(xTrainPreProc,.)

xTest <-
  dataAll2 %>%
  as_tibble(.) %>%
  filter(role == "test") %>%
  select(varsFiltered$ID) %>%
  predict(xTrainPreProc,.)

#####################################################

# HOUSE CLEANING
rm(trainMissing, threshMissingness)

Split the supervisor into train, validation, test

# breakout data sets
yTrain <-
  dataAll2 %>%
  as_tibble(.) %>%
  filter(role == "train") %>%
  select(Confirmed_Cases) %>%
  unlist()

yValidation <-
  dataAll2 %>%
  as_tibble(.) %>%
  filter(role == "validation") %>%
  select(Confirmed_Cases) %>%
  unlist()

yTest <-
  dataAll2 %>%
  as_tibble(.) %>%
  filter(role == "test") %>%
  select(Confirmed_Cases) %>%
  unlist()

CORRELATION

corrplot::corrplot(cor(xTrain),
                   type="lower",
                   order="hclust",
                   hclust.method="average")

corrplot::corrplot(cor(xTrain, method="spearman"),
                   type="lower",
                   order="hclust",
                   hclust.method="average")

linearCombos = caret::findLinearCombos(xTrain)
names(xTrain)[linearCombos$remove]
## [1] "Unemployed_2019"
corrRemove = caret::findCorrelation(cor(xTrain))
names(xTrain)[corrRemove]
## [1] "Civilian_labor_force_2019" "Employed_2019"            
## [3] "Unemployed_2019"           "POVALL_2019"              
## [5] "democrat"
# HOUSE CLEANING
rm(linearCombos, corrRemove)

THE MODEL

Y transformations

  • I log transformed the supervisor because the value is the total number of cases on the 56th day after the first confirmed case
    • The first phase of virus spread is exponential, so the log transformation is a suitable one: exponential growth is roughly linear on the log scale
    • I also estimated a Box-Cox transformation, which resulted in lambda = 0 (i.e., the log)
caret::BoxCoxTrans(yTrain)
## Box-Cox Transformation
## 
## 1568 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    11.0    36.0   376.7   134.0 35854.0 
## 
## Largest/Smallest: 35900 
## Sample Skewness: 12.5 
## 
## Estimated Lambda: -0.1 
## With fudge factor, Lambda = 0 will be used for transformations
car::powerTransform(lm(yTrain~.,data=xTrain))
## Estimated transformation parameter 
##           Y1 
## -0.007527509

Fit the LASSO

  • From the plot below it seems lambda.min isn’t on the boundary of the lambda grid
set.seed(1)
refittedLasso = glmnet::cv.glmnet(x=as.matrix(xTrain),
                        alpha=1,
                        y=log(yTrain),
                        lambda=seq(1e-7,1,0.00001),
                        parallel = TRUE)

# print lambda estimates
data.frame(lambdaMin = refittedLasso$lambda.min,
           lambda1SE = refittedLasso$lambda.1se) %>%
  kable(.) %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)
 lambdaMin  lambda1SE
 0.0137701  0.1379701
# plot
plot(refittedLasso)

Refitted LASSO

  • First, obtain the active set selected via the LASSO
  • Then, explore the underlying structure of the data as we did before
    • But on the validation set
  • Finally, refit the model on the test set and check assumptions
# coef from glmnet
lasso_lambda1SE_coef = coef(refittedLasso, s="lambda.1se")[-1]
# active set
lasso_lambda1SE_activeSet = which(abs(lasso_lambda1SE_coef) > machine_precision_zero)
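
For reference, the names in the active set can be printed directly (a quick check; the column order of xTrain matches the coefficient order because xTrain was the design matrix passed to glmnet):

# quick check: which features the LASSO kept at lambda.1se
names(xTrain)[lasso_lambda1SE_activeSet]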
  • After exploring the active set selected via the LASSO it seems that the chosen features aren’t highly correlated with each other and don’t have linear dependencies
    • There are a couple of close calls, such as the features republican and beds
      • Republican = the number of republican voters in a county in the 2016 presidential election
      • Beds = the number of hospital beds in a county
corrplot::corrplot(cor(xValidation[lasso_lambda1SE_activeSet]),
                   type="lower",
                   order="hclust",
                   hclust.method="average")

corrplot::corrplot(cor(xValidation[lasso_lambda1SE_activeSet], method="spearman"),
                   type="lower",
                   order="hclust",
                   hclust.method="average")

linearCombos = caret::findLinearCombos(xValidation[lasso_lambda1SE_activeSet])
names(xValidation[lasso_lambda1SE_activeSet])[linearCombos$remove]
## character(0)
corrRemove = caret::findCorrelation(cor(xValidation[lasso_lambda1SE_activeSet]))
names(xValidation[lasso_lambda1SE_activeSet])[corrRemove]
## character(0)
caret::featurePlot(xValidation[,lasso_lambda1SE_activeSet], log(yValidation))

# HOUSE CLEANING
rm(linearCombos, corrRemove)
  • Refit the LASSO (the refitted LASSO, or relaxo) on the test set
    • Refit on the test set now since the previous step explored the active set using the validation set
    • We’ll explore the output at the end
set.seed(1)
refittedLasso_LM = lm(log(yTest)~.,data=xTest[lasso_lambda1SE_activeSet])
#linear_regression_diagnostic_tools(lm=refittedLasso_LM,type="MLR",verbose=TRUE)

Model assessment

  • For the most part it looks like the model fits the data fairly well because the residuals mostly have constant variance
    • There do, however, appear to be some leverage points (see the quick Cook’s distance check after this list)
  • Plots galore below, you’ve been warned…
    • From the plots we see that there do seem to be some influential counties
    • The residuals deviate from normality at the tails
    • There’s some curvature in the inverse response plot, which should be a straight, 45-degree line
    • Three features have a strange standardized residual plot:
      • republican, beds, and RESIDUAL_2019
    • Lastly, according to the marginal model plots the model mostly fits the data well
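
As a quick numeric companion to the plots, a common rule of thumb (an assumption on my part, not something established above) is to flag observations whose Cook’s distance exceeds 4/n:

# flag potentially influential counties via the 4/n Cook's distance rule of thumb
cooksD <- cooks.distance(refittedLasso_LM)
which(cooksD > 4 / length(cooksD))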

Default plots

par(mfrow=c(2,2))
plot(refittedLasso_LM)

par(mfrow=c(1,1))

Residual plot and inverse response plot

par(mfrow=c(1,2))

# fitted values, standardized residuals, and the (log) response from the refit
yhat <- fitted(refittedLasso_LM)
ri   <- rstandard(refittedLasso_LM)
y    <- log(yTest)

plot(x=yhat,
     y=ri,
     type="p",
     main="Standardized Residual Plot",
     xlab="Yhat",
     ylab="Standardized Residuals",
     ylim=c(-max(abs(ri)),max(abs(ri))),
     xlim=c(0,max(yhat)))
abline(h=0,lty=2,col="darkorange")
plot(x=y,
     y=yhat,
     type="p",
     xlim=c(0,15),
     ylim=c(0,15),
     main="Inverse Response Plot",
     ylab="Yhat",
     xlab="Y")
abline(a=0,b=1,lty=2,col="darkorange")

par(mfrow=c(1,1))

Marginal Model Plots

#, fig.height=20, fig.width=15
car::marginalModelPlots(refittedLasso_LM,main="Marginal Model Plot", ylab="Y",ask=FALSE)#,layout=c(4,3))

Model tests

  • The gvlma test assesses whether the multiple linear regression assumptions have been met
    • It appears that the model meets the constant variance assumption
  • The model doesn’t seem to suffer from multicollinearity too badly
    • According to STAT604, VIF values below 5 are acceptable
    • However, other literature (R in Action) treats VIF values greater than 2 as a sign of multicollinearity
#to test model assumptions
gvlma::gvlma(refittedLasso_LM)
## 
## Call:
## lm(formula = log(yTest) ~ ., data = xTest[lasso_lambda1SE_activeSet])
## 
## Coefficients:
##                             (Intercept)  
##                                 3.77019  
##                                    beds  
##                                 0.02278  
## workplaces_percent_change_from_baseline  
##                                -0.08676  
##                                  Asthma  
##                                 0.26422  
##                                    COPD  
##                                -0.29898  
##                              republican  
##                                 0.85546  
##                  Chronic.Kidney.Disease  
##                                 0.13107  
##                            Hypertension  
##                                 0.41765  
##                  INTERNATIONAL_MIG_2019  
##                                 0.12508  
##                           RESIDUAL_2019  
##                                 0.13822  
##            Median_Household_Income_2019  
##                                 0.24037  
##                         Median_Age_2018  
##                                -0.36621  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma::gvlma(x = refittedLasso_LM) 
## 
##                      Value   p-value                   Decision
## Global Stat        82.0825 1.110e-16 Assumptions NOT satisfied!
## Skewness            2.4403 1.182e-01    Assumptions acceptable.
## Kurtosis           10.3471 1.297e-03 Assumptions NOT satisfied!
## Link Function      68.8918 1.110e-16 Assumptions NOT satisfied!
## Heteroscedasticity  0.4033 5.254e-01    Assumptions acceptable.
#variance inflation factors: VIF
(VIF = car::vif(refittedLasso_LM))
##                                    beds workplaces_percent_change_from_baseline 
##                                3.955747                                1.282504 
##                                  Asthma                                    COPD 
##                                1.454265                                1.741227 
##                              republican                  Chronic.Kidney.Disease 
##                                4.370459                                2.584567 
##                            Hypertension                  INTERNATIONAL_MIG_2019 
##                                2.636421                                1.141851 
##                           RESIDUAL_2019            Median_Household_Income_2019 
##                                1.038997                                1.757807 
##                         Median_Age_2018 
##                                1.194864
VIF_filtered =
  VIF %>%
  data.frame(VIF=.) %>%
  filter(.,VIF>2)

VIF_filtered %>%
  mutate_all(.,round,3) %>%
  kable(.) %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)
                          VIF
beds                    3.956
republican              4.370
Chronic.Kidney.Disease  2.585
Hypertension            2.636

CONCLUSION: INFERENCES

out = broom::tidy(refittedLasso_LM)[-1,] %>%
  select(c(1:2)) %>%
  mutate_if(.,is.numeric,round,4) %>%
  arrange(.,desc(abs(estimate)))

# print
kable(out) %>%
kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)
term                                      estimate
republican                                  0.8555
Hypertension                                0.4176
Median_Age_2018                            -0.3662
COPD                                       -0.2990
Asthma                                      0.2642
Median_Household_Income_2019                0.2404
RESIDUAL_2019                               0.1382
Chronic.Kidney.Disease                      0.1311
INTERNATIONAL_MIG_2019                      0.1251
workplaces_percent_change_from_baseline    -0.0868
beds                                        0.0228
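
Because the supervisor was modeled on the log scale and the features were centered and scaled, each estimate can be read as a multiplicative effect: a one standard deviation increase in a feature multiplies the expected number of confirmed cases by exp(estimate). One optional way to view that, using the fitted model object above:

# multiplicative view of the estimates: exp(coefficient) per one-SD increase in a feature
round(exp(coef(refittedLasso_LM)[-1]), 3)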