Last published: April 20, 2021
In this study I leverage the refitted LASSO (also called the relaxed LASSO) because my goal is to obtain an active set that helps explain why certain counties were impacted as they were.
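The idea, in brief: the LASSO does the variable selection, and ordinary least squares is then refit on the selected variables only, removing the shrinkage bias from the surviving coefficients. A minimal sketch of the two stages (X and y are illustrative placeholders, not objects from this post):
# sketch of the two-stage refitted LASSO; X (numeric matrix) and y are placeholders
cvFit  <- glmnet::cv.glmnet(x = X, y = y, alpha = 1)           # stage 1: LASSO with CV
active <- which(abs(coef(cvFit, s = "lambda.1se")[-1]) > 0)    # selected columns
refit  <- lm(y ~ ., data = as.data.frame(X[, active, drop = FALSE]))  # stage 2: OLS refit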
Steps: import the COVID data, keep only counties with confirmed cases at day 56 since first incident, drop the filter columns, and key the table by County_FIPS.
Notes: the file paths below are local to my machine; adjust them to your environment.
# import covid
covid <-
  data.table::fread(
    file = "C:\\Users\\fsusa\\Documents\\git\\tamu_stat656_project\\stat656_covid.csv",
    nThread = ncores,
    header = TRUE,
    check.names = TRUE,
    select = c("County_FIPS", "Confirmed_Cases",
               "retail_and_recreation_percent_change_from_baseline",
               "grocery_and_pharmacy_percent_change_from_baseline",
               "parks_percent_change_from_baseline",
               "transit_stations_percent_change_from_baseline",
               "workplaces_percent_change_from_baseline",
               "residential_percent_change_from_baseline",
               "has_confirmed_cases_flag",
               "Days_since_first_incidenct_Confirmed_Cases"))
# filter covid data
## filter for counties with cases
## filter for 56th day
covid2 = covid[has_confirmed_cases_flag==1][Days_since_first_incidenct_Confirmed_Cases==56]
# filter covid again for certain fields
covid3 = covid2[,!"has_confirmed_cases_flag"][,!"Days_since_first_incidenct_Confirmed_Cases"]
# data.table efficiency
data.table::setkey(covid3,County_FIPS)
# HOUSE CLEANING
rm(covid, covid2)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2258918 120.7 4090310 218.5 4090310 218.5
## Vcells 5269373 40.3 21396431 163.3 16669140 127.2
Before moving on to the next step, let’s take a quick glance at the data:
str(covid3)
## Classes 'data.table' and 'data.frame': 3139 obs. of 8 variables:
## $ County_FIPS : int 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
## $ Confirmed_Cases : int 127 216 147 62 47 71 312 127 326 30 ...
## $ retail_and_recreation_percent_change_from_baseline: int 3 -13 5 NA 7 NA NA 3 14 NA ...
## $ grocery_and_pharmacy_percent_change_from_baseline : int 8 26 -7 NA 14 NA 4 3 13 15 ...
## $ parks_percent_change_from_baseline : int NA 37 NA NA NA NA NA NA NA NA ...
## $ transit_stations_percent_change_from_baseline : int NA -5 NA NA NA NA NA NA NA NA ...
## $ workplaces_percent_change_from_baseline : int -27 -16 -17 -68 -28 -24 -25 -28 -32 -19 ...
## $ residential_percent_change_from_baseline : int 10 5 NA NA 7 NA NA 8 9 NA ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "County_FIPS"
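Several of the mobility columns are visibly full of NAs; a quick per-column count (a minimal sketch, not part of the original pipeline) makes the pattern explicit:
# count missing values per column (illustrative)
colSums(is.na(covid3))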
# import feature matrix
features <- data.table::fread(
  file = "C:\\Users\\fsusa\\Documents\\git\\tamu_stat656_project\\stat656_data.csv",
  header = TRUE,
  stringsAsFactors = TRUE,
  nThread = ncores)
# data.table efficiency
data.table::setkey(features, FIPS_Code)
Once again, let’s take a peek at the data:
dim(features)
## [1] 3143 34
Steps: merge the COVID table with the feature matrix on county FIPS, drop unneeded columns, and coerce every field to numeric.
# merge covid and features
dataAll <- merge(covid3, features, by.x = "County_FIPS", by.y = "FIPS_Code")
# remove unneeded features
dataAll2 <- dataAll[,!"State.x"][,!"Area_name"]
# set row names
row.names(dataAll2) <- dataAll2$County_FIPS
# remove county fips codes as feature
dataAll2 <- dataAll2[,!"County_FIPS"]
# coerce every column to numeric
dataAll2[, names(dataAll2) := lapply(.SD, as.numeric)]
# HOUSE CLEANING
rm(covid3, features, dataAll)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2267689 121.2 4090557 218.5 4090557 218.5
## Vcells 3962851 30.3 8388608 64.0 6646160 50.8
# create data partition
## attempt 1
#trainIndex <- createDataPartition(dataAll2$Confirmed_Cases, p = 0.75, list = FALSE) %>% as.vector(.)
## attempt 2
trainIndex <- createDataPartition(dataAll2$Confirmed_Cases, p = 0.5, list = FALSE) %>% as.vector(.)
validSplit <- createDataPartition(dataAll2[-trainIndex, Confirmed_Cases], p = 0.5, list = FALSE) %>% as.vector(.)
n <- nrow(dataAll2)
## splits: everything outside train starts as "test"; the half not picked
## by validSplit is then reassigned to "validation" below
testIndex <- (1:n)[-trainIndex]
validIndex <- (1:n)[-trainIndex][-validSplit]
role <- rep("train", n)
role[testIndex] <- "test"
role[validIndex] <- "validation"
Before moving forward, let’s check that each split has at least 10 records per predictor:
table(role)/(ncol(dataAll2)-1)
## role
## test train validation
## 21.18919 42.37838 21.13514
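All three ratios clear the 10-records-per-predictor rule of thumb. A programmatic version of the same check (a minimal sketch, not part of the original pipeline) might be:
# fail loudly if any split drops below 10 records per predictor (illustrative)
stopifnot(all(table(role) / (ncol(dataAll2) - 1) > 10))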
Next, drop any feature that is missing too often in the training split; I use a third of the records as the cutoff.
threshMissingness <- 0.33
trainMissing <-
sapply(dataAll2[role=="train"],function(x){sum(is.na(x))}) %>%
as_tibble(.) %>%
mutate(ID=names(dataAll2),
missing = value,
n = nrow(dataAll2[role=="train"])) %>%
mutate(pctMissing = round(missing/n,4)) %>%
select(-1) %>%
filter(ID != "Confirmed_Cases") %>%
arrange(.,desc(pctMissing)) %>%
as.data.frame(.)
trainMissing %>%
filter(pctMissing >= threshMissingness) %>%
kable(.) %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)

| ID | missing | n | pctMissing |
|---|---|---|---|
| GDP_All_industries_2019 | 1531 | 1568 | 0.9764 |
| parks_percent_change_from_baseline | 1269 | 1568 | 0.8093 |
| transit_stations_percent_change_from_baseline | 1084 | 1568 | 0.6913 |
| residential_percent_change_from_baseline | 996 | 1568 | 0.6352 |
| grocery_and_pharmacy_percent_change_from_baseline | 861 | 1568 | 0.5491 |
| retail_and_recreation_percent_change_from_baseline | 746 | 1568 | 0.4758 |
# keep features whose missingness is less than the threshold
varsFiltered <-
trainMissing %>%
filter(pctMissing < threshMissingness) %>%
select(ID)
#####################################################
xTrainPreProc <-
dataAll2 %>%
as_tibble(.) %>%
filter(role == "train") %>%
select(varsFiltered$ID) %>%
preProcess(., method = c("center","scale","bagImpute"))
#####################################################
xTrain <-
dataAll2 %>%
as_tibble(.) %>%
filter(role == "train") %>%
select(varsFiltered$ID) %>%
predict(xTrainPreProc,.)
xValidation <-
dataAll2 %>%
as_tibble(.) %>%
filter(role == "validation") %>%
select(varsFiltered$ID) %>%
predict(xTrainPreProc,.)
xTest <-
dataAll2 %>%
as_tibble(.) %>%
filter(role == "test") %>%
select(varsFiltered$ID) %>%
predict(xTrainPreProc,.)
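As a sanity check (a minimal sketch, not part of the original pipeline), bagged-tree imputation should leave no NA in the retained columns of any split:
# expect FALSE for all three splits after bagImpute (illustrative check)
c(anyNA(xTrain), anyNA(xValidation), anyNA(xTest))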
#####################################################
# HOUSE CLEANING
rm(trainMissing, threshMissingness)
Split the supervisor into train, validation, and test:
# breakout data sets
yTrain <-
dataAll2 %>%
as_tibble(.) %>%
filter(role == "train") %>%
select(Confirmed_Cases) %>%
unlist()
yValidation <-
dataAll2 %>%
as_tibble(.) %>%
filter(role == "validation") %>%
select(Confirmed_Cases) %>%
unlist()
yTest <-
dataAll2 %>%
as_tibble(.) %>%
filter(role == "test") %>%
select(Confirmed_Cases) %>%
  unlist()
Next, inspect the correlation structure of the training features:
corrplot::corrplot(cor(xTrain),
                   type="lower",
                   order="hclust",
                   hclust.method="average")
corrplot::corrplot(cor(xTrain, method="spearman"),
                   type="lower",
                   order="hclust",
                   hclust.method="average")
linearCombos = caret::findLinearCombos(xTrain)
names(xTrain)[linearCombos$remove]
## [1] "Unemployed_2019"
corrRemove = caret::findCorrelation(cor(xTrain))
names(xTrain)[corrRemove]
## [1] "Civilian_labor_force_2019" "Employed_2019"
## [3] "Unemployed_2019"           "POVALL_2019"
## [5] "democrat"
# HOUSE CLEANING
rm(linearCombos, corrRemove)
Now estimate a power transformation for the supervisor:
caret::BoxCoxTrans(yTrain)
## Box-Cox Transformation
##
## 1568 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 11.0 36.0 376.7 134.0 35854.0
##
## Largest/Smallest: 35900
## Sample Skewness: 12.5
##
## Estimated Lambda: -0.1
## With fudge factor, Lambda = 0 will be used for transformations
car::powerTransform(lm(yTrain~., data=xTrain))
## Estimated transformation parameter
## Y1
## -0.007527509
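Both estimates sit essentially at zero, so a log transform of the supervisor is appropriate; this is why the LASSO below is fit to log(yTrain). A quick equivalence check (a minimal sketch, not part of the original pipeline): caret’s Box-Cox transform, with lambda fudged to 0, reduces to a plain log.
# BoxCoxTrans at lambda = 0 should reproduce log() exactly (illustrative)
bc <- caret::BoxCoxTrans(yTrain)
all.equal(unname(predict(bc, yTrain)), unname(log(yTrain)))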
set.seed(1)
refittedLasso = glmnet::cv.glmnet(x=as.matrix(xTrain),
alpha=1,
y=log(yTrain),
lambda=seq(1e-7,1,0.00001),
parallel = TRUE)
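Note that parallel = TRUE only helps if a parallel backend was registered beforehand, which I assume happened earlier in the post; a typical doParallel setup (a minimal sketch) looks like:
# hypothetical backend registration, presumably done earlier with ncores
library(doParallel)
cl <- parallel::makeCluster(ncores)
registerDoParallel(cl)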
# print lambda estimates
data.frame(lambdaMin = refittedLasso$lambda.min,
lambda1SE = refittedLasso$lambda.1se) %>%
kable(.) %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)

| lambdaMin | lambda1SE |
|---|---|
| 0.0137701 | 0.1379701 |
I use the 1-SE lambda, which gives the sparsest model whose cross-validated error is within one standard error of the minimum.
# plot the cross-validation curve
plot(refittedLasso)
# coef from glmnet at lambda.1se
lasso_lambda1SE_coef = coef(refittedLasso, s="lambda.1se")[-1]
# active set
lasso_lambda1SE_activeSet = which(abs(lasso_lambda1SE_coef) > machine_precision_zero)
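To see which variables survived, the active-set indices can be mapped back to column names (a minimal sketch, assuming the index order matches the columns of xTrain):
# names of the variables selected at lambda.1se (illustrative)
colnames(xTrain)[lasso_lambda1SE_activeSet]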
Next, examine the active set on the validation split:
corrplot::corrplot(cor(xValidation[lasso_lambda1SE_activeSet]),
                   type="lower",
                   order="hclust",
                   hclust.method="average")
corrplot::corrplot(cor(xValidation[lasso_lambda1SE_activeSet], method="spearman"),
                   type="lower",
                   order="hclust",
                   hclust.method="average")
linearCombos = caret::findLinearCombos(xValidation[lasso_lambda1SE_activeSet])
names(xValidation[lasso_lambda1SE_activeSet])[linearCombos$remove]
## character(0)
corrRemove = caret::findCorrelation(cor(xValidation[lasso_lambda1SE_activeSet]))
names(xValidation[lasso_lambda1SE_activeSet])[corrRemove]
## character(0)
caret::featurePlot(xValidation[,lasso_lambda1SE_activeSet], log(yValidation))
# HOUSE CLEANING
rm(linearCombos, corrRemove)
set.seed(1)
refittedLasso_LM = lm(log(yTest)~.,data=xTest[lasso_lambda1SE_activeSet])
#linear_regression_diagnostic_tools(lm=refittedLasso_LM, type="MLR", verbose=TRUE)
par(mfrow=c(2,2))
plot(refittedLasso_LM)
par(mfrow=c(1,1))
# yhat, ri, and y are not defined in this section; standard definitions assumed:
yhat <- fitted(refittedLasso_LM)    # fitted values on the log scale
ri   <- rstandard(refittedLasso_LM) # standardized residuals
y    <- log(yTest)                  # observed supervisor on the log scale
par(mfrow=c(1,2))
plot(x=yhat,
     y=ri,
     type="p",
     main="Standardized Residual Plot",
     xlab="Yhat",
     ylab="Standardized Residuals",
     ylim=c(-max(abs(ri)), max(abs(ri))),
     xlim=c(0, max(yhat)))
abline(h=0,lty=2,col="darkorange")
plot(x=y,
y=yhat,
type="p",
xlim=c(0,15),
ylim=c(0,15),
main="Inverse Response Plot",
ylab="Yhat",
xlab="Y")
abline(a=0, b=1, lty=2, col="darkorange")
par(mfrow=c(1,1))
car::marginalModelPlots(refittedLasso_LM, main="Marginal Model Plot", ylab="Y", ask=FALSE)
# test the linear model assumptions
gvlma::gvlma(refittedLasso_LM)
##
## Call:
## lm(formula = log(yTest) ~ ., data = xTest[lasso_lambda1SE_activeSet])
##
## Coefficients:
## (Intercept)
## 3.77019
## beds
## 0.02278
## workplaces_percent_change_from_baseline
## -0.08676
## Asthma
## 0.26422
## COPD
## -0.29898
## republican
## 0.85546
## Chronic.Kidney.Disease
## 0.13107
## Hypertension
## 0.41765
## INTERNATIONAL_MIG_2019
## 0.12508
## RESIDUAL_2019
## 0.13822
## Median_Household_Income_2019
## 0.24037
## Median_Age_2018
## -0.36621
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma::gvlma(x = refittedLasso_LM)
##
## Value p-value Decision
## Global Stat 82.0825 1.110e-16 Assumptions NOT satisfied!
## Skewness 2.4403 1.182e-01 Assumptions acceptable.
## Kurtosis 10.3471 1.297e-03 Assumptions NOT satisfied!
## Link Function 68.8918 1.110e-16 Assumptions NOT satisfied!
## Heteroscedasticity 0.4033 5.254e-01 Assumptions acceptable.
# variance inflation factors (VIF)
(VIF = car::vif(refittedLasso_LM))
## beds workplaces_percent_change_from_baseline
## 3.955747 1.282504
## Asthma COPD
## 1.454265 1.741227
## republican Chronic.Kidney.Disease
## 4.370459 2.584567
## Hypertension INTERNATIONAL_MIG_2019
## 2.636421 1.141851
## RESIDUAL_2019 Median_Household_Income_2019
## 1.038997 1.757807
## Median_Age_2018
## 1.194864
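Recall that the VIF for predictor j is 1/(1 - R²_j), where R²_j comes from regressing that predictor on the remaining predictors; the supervisor is not involved. A minimal sketch reproducing the value for beds by hand (illustrative, assuming the same design columns as above):
# manual VIF for 'beds' from the active-set design (illustrative)
xActive <- xTest[lasso_lambda1SE_activeSet]
r2_beds <- summary(lm(beds ~ ., data = xActive))$r.squared
1 / (1 - r2_beds)   # should match car::vif's value for beds (~3.96)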
VIF_filtered =
VIF %>%
data.frame(VIF=.) %>%
filter(.,VIF>2)
VIF_filtered %>%
mutate_all(.,round,3) %>%
kable(.) %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)

|  | VIF |
|---|---|
| beds | 3.956 |
| republican | 4.370 |
| Chronic.Kidney.Disease | 2.585 |
| Hypertension | 2.636 |
out = broom::tidy(refittedLasso_LM)[-1,] %>%
select(c(1:2)) %>%
mutate_if(.,is.numeric,round,4) %>%
arrange(.,desc(abs(estimate)))
# print
kable(out) %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)

| term | estimate |
|---|---|
| republican | 0.8555 |
| Hypertension | 0.4176 |
| Median_Age_2018 | -0.3662 |
| COPD | -0.2990 |
| Asthma | 0.2642 |
| Median_Household_Income_2019 | 0.2404 |
| RESIDUAL_2019 | 0.1382 |
| Chronic.Kidney.Disease | 0.1311 |
| INTERNATIONAL_MIG_2019 | 0.1251 |
| workplaces_percent_change_from_baseline | -0.0868 |
| beds | 0.0228 |