For Parts 2 and 3, use the glmnet function in R.
setwd("C:/Users/tarra/Documents/Intro to Analytics Modeling/Fall2020hw8/data 11.1")
uscrime_data <- read.table("uscrime.txt", header = TRUE)
# Let's start by making just a flat out model without preforming anything.
uscrime_model <- lm(Crime~., data = uscrime_data)
summary(uscrime_model)
##
## Call:
## lm(formula = Crime ~ ., data = uscrime_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -395.74 -98.09 -6.69 112.99 512.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.984e+03 1.628e+03 -3.675 0.000893 ***
## M 8.783e+01 4.171e+01 2.106 0.043443 *
## So -3.803e+00 1.488e+02 -0.026 0.979765
## Ed 1.883e+02 6.209e+01 3.033 0.004861 **
## Po1 1.928e+02 1.061e+02 1.817 0.078892 .
## Po2 -1.094e+02 1.175e+02 -0.931 0.358830
## LF -6.638e+02 1.470e+03 -0.452 0.654654
## M.F 1.741e+01 2.035e+01 0.855 0.398995
## Pop -7.330e-01 1.290e+00 -0.568 0.573845
## NW 4.204e+00 6.481e+00 0.649 0.521279
## U1 -5.827e+03 4.210e+03 -1.384 0.176238
## U2 1.678e+02 8.234e+01 2.038 0.050161 .
## Wealth 9.617e-02 1.037e-01 0.928 0.360754
## Ineq 7.067e+01 2.272e+01 3.111 0.003983 **
## Prob -4.855e+03 2.272e+03 -2.137 0.040627 *
## Time -3.479e+00 7.165e+00 -0.486 0.630708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 209.1 on 31 degrees of freedom
## Multiple R-squared: 0.8031, Adjusted R-squared: 0.7078
## F-statistic: 8.429 on 15 and 31 DF, p-value: 3.539e-07
# On its own the r squares and adjusted r squared is 0.8031 and 0.7078 respectively.
# Let's see if we can make this better using stepwise regression.
stepwise_model <- train(Crime ~., data = uscrime_data,
method = "lmStepAIC",
trControl = trainControl(),
trace = FALSE)
# Model accuracy
stepwise_model$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 331.8706 0.4574266 256.7255 68.56171 0.1911635 57.23648
# Final model coefficients
stepwise_model$finalModel
##
## Call:
## lm(formula = .outcome ~ M + Ed + Po1 + M.F + U1 + U2 + Ineq +
## Prob, data = dat)
##
## Coefficients:
## (Intercept) M Ed Po1 M.F U1
## -6426.10 93.32 180.12 102.65 22.34 -6086.63
## U2 Ineq Prob
## 187.35 61.33 -3796.03
# Summary of the model
summary(stepwise_model$finalModel)
##
## Call:
## lm(formula = .outcome ~ M + Ed + Po1 + M.F + U1 + U2 + Ineq +
## Prob, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -444.70 -111.07 3.03 122.15 483.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6426.10 1194.61 -5.379 4.04e-06 ***
## M 93.32 33.50 2.786 0.00828 **
## Ed 180.12 52.75 3.414 0.00153 **
## Po1 102.65 15.52 6.613 8.26e-08 ***
## M.F 22.34 13.60 1.642 0.10874
## U1 -6086.63 3339.27 -1.823 0.07622 .
## U2 187.35 72.48 2.585 0.01371 *
## Ineq 61.33 13.96 4.394 8.63e-05 ***
## Prob -3796.03 1490.65 -2.547 0.01505 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 195.5 on 38 degrees of freedom
## Multiple R-squared: 0.7888, Adjusted R-squared: 0.7444
## F-statistic: 17.74 on 8 and 38 DF, p-value: 1.159e-10
# With our new model after performing stepwise regression we get a slightly lower r squared value of 0.7888, but a slightly higher adjusted r squared value of 0.7444.
# Let's see what we can do with Lasso now.
# Scale the data; does not scaled col #2 as it is binary and col #16 as it is
uscrime_scale <- cbind(as.data.frame(scale(uscrime_data[,1])),
as.data.frame(uscrime_data[,2]),
as.data.frame(scale(uscrime_data[,c(3,4,5,6,7,8,9,10,11,12,13,14,15)])),
as.data.frame(uscrime_data[,16]))
# Fix the column names
colnames(uscrime_scale) <- colnames(uscrime_data)
# Display scaled data
head(uscrime_scale)
## M So Ed Po1 Po2 LF M.F
## 1 0.9886930 1 -1.3085099 -0.9085105 -0.8666988 -1.2667456 -1.12060499
## 2 0.3521372 0 0.6580587 0.6056737 0.5280852 0.5396568 0.98341752
## 3 0.2725678 1 -1.4872888 -1.3459415 -1.2958632 -0.6976051 -0.47582390
## 4 -0.2048491 0 1.3731746 2.1535064 2.1732150 0.3911854 0.37257228
## 5 0.1929983 0 1.3731746 0.8075649 0.7426673 0.7376187 0.06714965
## 6 -1.3983912 0 0.3898903 1.1104017 1.2433590 -0.3511718 -0.64550313
## Pop NW U1 U2 Wealth Ineq
## 1 -0.09500679 1.943738564 0.69510600 0.8313680 -1.3616094 1.6793638
## 2 -0.62033844 0.008483424 0.02950365 0.2393332 0.3276683 0.0000000
## 3 -0.48900552 1.146296747 -0.08143007 -0.1158877 -2.1492481 1.4036474
## 4 3.16204944 -0.205464381 0.36230482 0.5945541 1.5298536 -0.6767585
## 5 -0.48900552 -0.691709391 -0.24783066 -1.6551781 0.5453053 -0.5013026
## 6 -0.30513945 -0.555560788 -0.63609870 -0.5895155 1.6956723 -1.7044289
## Prob Time Crime
## 1 1.6497631 -0.05599367 791
## 2 -0.7693365 -0.18315796 1635
## 3 1.5969416 -0.32416470 578
## 4 -1.3761895 0.46611085 1969
## 5 -0.2503580 -0.74759413 1234
## 6 -0.5669349 -0.78996812 682
# Run lasso
uscrime_lasso <- cv.glmnet(x=as.matrix(uscrime_scale[,-16]), y=as.matrix(uscrime_scale$Crime),
alpha=1, nfolds = 5, type.measure="mse", family="gaussian")
# Find the lamba with the smallest cvm
x <- uscrime_lasso$cvm
which(x == min(x))
## [1] 35
# The lamba for the smallest cvm
uscrime_lasso$lambda[which(x == min(x))]
## [1] 11.12694
# or
uscrime_lasso$lambda.min
## [1] 11.12694
coefficients(uscrime_lasso, s=uscrime_lasso$lambda.min)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 889.057299
## M 85.003217
## So 47.081683
## Ed 124.290064
## Po1 308.889820
## Po2 .
## LF 1.116734
## M.F 52.037497
## Pop .
## NW 4.443519
## U1 -22.257315
## U2 55.981384
## Wealth .
## Ineq 180.588656
## Prob -81.812835
## Time .
# Now lets make a model with these coefficients
uscrime_lasso_lm <- lm(Crime ~M+So+Ed+Po1+M.F+Pop+NW+U1+U2+Wealth+Ineq+Prob,
data = uscrime_scale)
summary(uscrime_lasso_lm)
##
## Call:
## lm(formula = Crime ~ M + So + Ed + Po1 + M.F + Pop + NW + U1 +
## U2 + Wealth + Ineq + Prob, data = uscrime_scale)
##
## Residuals:
## Min 1Q Median 3Q Max
## -434.18 -107.01 18.55 115.88 470.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 897.29 51.91 17.286 < 2e-16 ***
## M 112.71 49.35 2.284 0.02876 *
## So 22.89 125.35 0.183 0.85621
## Ed 195.70 62.94 3.109 0.00378 **
## Po1 293.18 64.99 4.511 7.32e-05 ***
## M.F 48.92 48.12 1.017 0.31656
## Pop -33.25 45.63 -0.729 0.47113
## NW 19.16 57.71 0.332 0.74195
## U1 -89.76 65.68 -1.367 0.18069
## U2 140.78 66.77 2.108 0.04245 *
## Wealth 83.30 95.53 0.872 0.38932
## Ineq 285.77 85.19 3.355 0.00196 **
## Prob -92.75 41.12 -2.255 0.03065 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 202.6 on 34 degrees of freedom
## Multiple R-squared: 0.7971, Adjusted R-squared: 0.7255
## F-statistic: 11.13 on 12 and 34 DF, p-value: 1.52e-08
# Let's remove factor which p>0.05 and show the new model.
uscrime_lasso_lm2 <- lm(Crime ~M+ Ed+ Po1+ U2+ Ineq+Prob, data = uscrime_scale)
summary(uscrime_lasso_lm2)
##
## Call:
## lm(formula = Crime ~ M + Ed + Po1 + U2 + Ineq + Prob, data = uscrime_scale)
##
## Residuals:
## Min 1Q Median 3Q Max
## -470.68 -78.41 -19.68 133.12 556.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 905.09 29.27 30.918 < 2e-16 ***
## M 131.98 41.85 3.154 0.00305 **
## Ed 219.79 50.07 4.390 8.07e-05 ***
## Po1 341.84 40.87 8.363 2.56e-10 ***
## U2 75.47 34.55 2.185 0.03483 *
## Ineq 269.91 55.60 4.855 1.88e-05 ***
## Prob -86.44 34.74 -2.488 0.01711 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 200.7 on 40 degrees of freedom
## Multiple R-squared: 0.7659, Adjusted R-squared: 0.7307
## F-statistic: 21.81 on 6 and 40 DF, p-value: 3.418e-11
# Let's move on to the final model, elastic net.
r2=c()
for (i in 0:10) {
mod_uscrime_elastic <- cv.glmnet(x=as.matrix(uscrime_scale[,-16]),y=as.matrix(uscrime_scale$Crime),
alpha=i/10, nfolds = 5, type.measure="mse",
family="gaussian")
#dev.ratio is the percentage of deviance explained
#min index for the dev.ratio of the model
r2 = cbind(r2, mod_uscrime_elastic$glmnet.fit$dev.ratio[which(mod_uscrime_elastic$glmnet.fit$lambda == mod_uscrime_elastic$lambda.min)])
}
#get the best alpha
alpha <- (which.max(r2)-1)/10
#get model with alphas = 0.05
uscrime_elastic <- cv.glmnet(x=as.matrix(uscrime_scale[,-16]),
y=as.matrix(uscrime_scale$Crime), alpha=0.05,
nfolds = 5, type.measure="mse", family="gaussian")
uscrime_elastic_lm = lm(Crime ~M+So+Ed+Po1+Po2+LF+M.F+NW+U1+U2+Wealth+Ineq+Prob
+Time, data = uscrime_scale)
summary(uscrime_elastic_lm)
##
## Call:
## lm(formula = Crime ~ M + So + Ed + Po1 + Po2 + LF + M.F + NW +
## U1 + U2 + Wealth + Ineq + Prob + Time, data = uscrime_scale)
##
## Residuals:
## Min 1Q Median 3Q Max
## -380.91 -101.89 -14.77 110.87 505.40
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 906.483 58.484 15.500 < 2e-16 ***
## M 112.837 51.691 2.183 0.03649 *
## So -4.105 147.172 -0.028 0.97792
## Ed 211.246 68.713 3.074 0.00429 **
## Po1 563.337 311.541 1.808 0.07998 .
## Po2 -313.824 324.701 -0.966 0.34104
## LF -31.702 58.147 -0.545 0.58939
## M.F 64.479 54.722 1.178 0.24737
## NW 44.572 65.892 0.676 0.50362
## U1 -112.728 73.902 -1.525 0.13699
## U2 143.186 68.749 2.083 0.04535 *
## Wealth 87.836 98.588 0.891 0.37961
## Ineq 269.086 86.824 3.099 0.00403 **
## Prob -110.457 51.117 -2.161 0.03830 *
## Time -31.582 48.772 -0.648 0.52189
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 206.8 on 32 degrees of freedom
## Multiple R-squared: 0.801, Adjusted R-squared: 0.714
## F-statistic: 9.202 on 14 and 32 DF, p-value: 1.301e-07
uscrime_elastic_lm2 = lm(Crime ~M+Ed+U2+Ineq+Prob, data = uscrime_scale)
summary(uscrime_elastic_lm2)
##
## Call:
## lm(formula = Crime ~ M + Ed + U2 + Ineq + Prob, data = uscrime_scale)
##
## Residuals:
## Min 1Q Median 3Q Max
## -478.8 -233.6 -46.5 143.2 797.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 905.09 47.94 18.881 < 2e-16 ***
## M 107.25 68.36 1.569 0.12437
## Ed 240.18 81.89 2.933 0.00547 **
## U2 135.14 55.35 2.441 0.01903 *
## Ineq 117.68 86.03 1.368 0.17880
## Prob -156.82 55.20 -2.841 0.00697 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 328.6 on 41 degrees of freedom
## Multiple R-squared: 0.3565, Adjusted R-squared: 0.278
## F-statistic: 4.542 on 5 and 41 DF, p-value: 0.002186