library(mlbench)
data(BostonHousing)
dim(BostonHousing)
## [1] 506 14
head(BostonHousing)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio      b
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
str(BostonHousing)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(BostonHousing)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 0:471
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1: 35
## Median : 0.25651 Median : 0.00 Median : 9.69
## Mean : 3.61352 Mean : 11.36 Mean :11.14
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :88.97620 Max. :100.00 Max. :27.74
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio b
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
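#quick completeness check (a sketch): mlbench's BostonHousing ships with no
#missing values, so no imputation is needed before modeling
sum(is.na(BostonHousing))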
library(car) #package to calculate Variance Inflation Factor
library(corrplot) #correlation plots
library(leaps) #best subsets regression
library(glmnet) #allows ridge regression, LASSO and elastic net
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
library(caret) #parameter tuning
## Loading required package: lattice
## Loading required package: ggplot2
data("BostonHousing")
plot(BostonHousing[,-4])

plot(BostonHousing$indus)

table(BostonHousing$chas)
##
## 0 1
## 471 35
boxplot(medv~chas, data=BostonHousing, xlab="Charles River", ylab="median value ($1000s)")

Boston <- BostonHousing[,-4] #drop the chas factor; cor() needs numeric columns
p.cor = cor(Boston)
corrplot.mixed(p.cor)
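#side note (a sketch): car was loaded above for Variance Inflation Factors but
#never called; a VIF check on a full OLS fit quantifies the collinearity the
#correlation plot suggests (values above roughly 5-10 are usually flagged)
vif(lm(medv~., data=Boston))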

set.seed(123) #fix the random number generator for reproducibility
ind = sample(2, nrow(Boston), replace=TRUE, prob=c(0.7, 0.3)) #~70/30 split
train = Boston[ind==1,] #the training data set
test = Boston[ind==2,] #the test data set
str(test) #confirm it worked
## 'data.frame': 143 obs. of 13 variables:
## $ crim : num 0.0273 0.0324 0.069 0.1446 0.2249 ...
## $ zn : num 0 0 0 12.5 12.5 0 0 0 0 0 ...
## $ indus : num 7.07 2.18 2.18 7.87 7.87 8.14 8.14 8.14 8.14 8.14 ...
## $ nox : num 0.469 0.458 0.458 0.524 0.524 0.538 0.538 0.538 0.538 0.538 ...
## $ rm : num 6.42 7 7.15 6.17 6.38 ...
## $ age : num 78.9 45.8 54.2 96.1 94.3 56.5 69.5 98.1 100 85.7 ...
## $ dis : num 4.97 6.06 6.06 5.95 6.35 ...
## $ rad : num 2 3 3 5 5 4 4 4 4 4 ...
## $ tax : num 242 222 222 311 311 307 307 307 307 307 ...
## $ ptratio: num 17.8 18.7 18.7 15.2 15.2 21 21 21 21 21 ...
## $ b : num 397 395 397 397 393 ...
## $ lstat : num 9.14 2.94 5.33 19.15 20.45 ...
## $ medv : num 21.6 33.4 36.2 27.1 15 19.9 18.2 13.6 14.5 13.9 ...
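#optional (a sketch, not used below): with caret already loaded,
#createDataPartition() draws a split stratified on the outcome instead
idx = createDataPartition(Boston$medv, p=0.7, list=FALSE)
train2 = Boston[idx, ]
test2 = Boston[-idx, ]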
subfit = regsubsets(medv~., data=train)
b.sum = summary(subfit)
which.min(b.sum$bic)
## [1] 7
plot(b.sum$bic, type="l", xlab="# of Features", ylab="BIC", main="BIC score by Feature Inclusion")

plot(subfit, scale="bic", main="Best Subset Features")

ols = lm(medv~zn+nox+rm+dis+ptratio+b+lstat, data=train) #refit the best subset with lm()
plot(ols$fitted.values, train$medv, xlab="Predicted", ylab="Actual", main="Predicted vs Actual")

pred.subfit = predict(ols, newdata=test)
plot(pred.subfit, test$medv , xlab="Predicted", ylab="Actual", main="Predicted vs Actual")

resid.subfit = test$medv - pred.subfit
mean(resid.subfit^2)
## [1] 21.87516
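#regsubsets objects have no predict() method, which is why the chosen model is
#refit with lm() above; this generic helper (a sketch of the usual idiom) can
#score any subset size straight from the regsubsets fit
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]]) #formula passed to regsubsets()
  mat = model.matrix(form, newdata)   #design matrix for the new data
  coefi = coef(object, id=id)         #coefficients of the id-variable model
  mat[, names(coefi)] %*% coefi       #linear predictor
}
head(predict(subfit, newdata=test, id=7)) #should reproduce pred.subfit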
#ridge regression
x = as.matrix(train[,1:12]) #glmnet requires a numeric matrix of predictors
y = train[ ,13] #response vector (medv)
ridge = glmnet(x, y, family="gaussian", alpha=0)
print(ridge)
##
## Call: glmnet(x = x, y = y, family = "gaussian", alpha = 0)
##
## Df %Dev Lambda
## [1,] 12 5.343e-36 6854.0000
## [2,] 12 7.948e-03 6245.0000
## [3,] 12 8.712e-03 5690.0000
## [4,] 12 9.549e-03 5185.0000
## [5,] 12 1.047e-02 4724.0000
## [6,] 12 1.147e-02 4304.0000
## [7,] 12 1.256e-02 3922.0000
## [8,] 12 1.376e-02 3574.0000
## [9,] 12 1.507e-02 3256.0000
## [10,] 12 1.651e-02 2967.0000
## [11,] 12 1.807e-02 2703.0000
## [12,] 12 1.978e-02 2463.0000
## [13,] 12 2.164e-02 2244.0000
## [14,] 12 2.368e-02 2045.0000
## [15,] 12 2.590e-02 1863.0000
## [16,] 12 2.831e-02 1698.0000
## [17,] 12 3.094e-02 1547.0000
## [18,] 12 3.380e-02 1409.0000
## [19,] 12 3.691e-02 1284.0000
## [20,] 12 4.028e-02 1170.0000
## [21,] 12 4.395e-02 1066.0000
## [22,] 12 4.791e-02 971.5000
## [23,] 12 5.221e-02 885.2000
## [24,] 12 5.686e-02 806.6000
## [25,] 12 6.187e-02 734.9000
## [26,] 12 6.728e-02 669.6000
## [27,] 12 7.310e-02 610.1000
## [28,] 12 7.936e-02 555.9000
## [29,] 12 8.607e-02 506.5000
## [30,] 12 9.326e-02 461.5000
## [31,] 12 1.009e-01 420.5000
## [32,] 12 1.091e-01 383.2000
## [33,] 12 1.178e-01 349.1000
## [34,] 12 1.271e-01 318.1000
## [35,] 12 1.369e-01 289.9000
## [36,] 12 1.472e-01 264.1000
## [37,] 12 1.581e-01 240.6000
## [38,] 12 1.695e-01 219.3000
## [39,] 12 1.814e-01 199.8000
## [40,] 12 1.938e-01 182.0000
## [41,] 12 2.067e-01 165.9000
## [42,] 12 2.201e-01 151.1000
## [43,] 12 2.338e-01 137.7000
## [44,] 12 2.480e-01 125.5000
## [45,] 12 2.625e-01 114.3000
## [46,] 12 2.773e-01 104.2000
## [47,] 12 2.924e-01 94.9200
## [48,] 12 3.077e-01 86.4800
## [49,] 12 3.231e-01 78.8000
## [50,] 12 3.386e-01 71.8000
## [51,] 12 3.541e-01 65.4200
## [52,] 12 3.697e-01 59.6100
## [53,] 12 3.852e-01 54.3100
## [54,] 12 4.006e-01 49.4900
## [55,] 12 4.159e-01 45.0900
## [56,] 12 4.311e-01 41.0900
## [57,] 12 4.460e-01 37.4400
## [58,] 12 4.607e-01 34.1100
## [59,] 12 4.752e-01 31.0800
## [60,] 12 4.894e-01 28.3200
## [61,] 12 5.033e-01 25.8000
## [62,] 12 5.168e-01 23.5100
## [63,] 12 5.301e-01 21.4200
## [64,] 12 5.429e-01 19.5200
## [65,] 12 5.553e-01 17.7900
## [66,] 12 5.674e-01 16.2100
## [67,] 12 5.789e-01 14.7700
## [68,] 12 5.901e-01 13.4500
## [69,] 12 6.007e-01 12.2600
## [70,] 12 6.109e-01 11.1700
## [71,] 12 6.206e-01 10.1800
## [72,] 12 6.297e-01 9.2730
## [73,] 12 6.384e-01 8.4500
## [74,] 12 6.465e-01 7.6990
## [75,] 12 6.542e-01 7.0150
## [76,] 12 6.613e-01 6.3920
## [77,] 12 6.680e-01 5.8240
## [78,] 12 6.742e-01 5.3070
## [79,] 12 6.799e-01 4.8350
## [80,] 12 6.853e-01 4.4060
## [81,] 12 6.902e-01 4.0140
## [82,] 12 6.947e-01 3.6580
## [83,] 12 6.989e-01 3.3330
## [84,] 12 7.028e-01 3.0370
## [85,] 12 7.063e-01 2.7670
## [86,] 12 7.096e-01 2.5210
## [87,] 12 7.126e-01 2.2970
## [88,] 12 7.154e-01 2.0930
## [89,] 12 7.179e-01 1.9070
## [90,] 12 7.202e-01 1.7380
## [91,] 12 7.223e-01 1.5830
## [92,] 12 7.243e-01 1.4430
## [93,] 12 7.260e-01 1.3140
## [94,] 12 7.277e-01 1.1980
## [95,] 12 7.291e-01 1.0910
## [96,] 12 7.305e-01 0.9944
## [97,] 12 7.317e-01 0.9060
## [98,] 12 7.328e-01 0.8255
## [99,] 12 7.338e-01 0.7522
## [100,] 12 7.347e-01 0.6854
plot(ridge, label=TRUE)

plot(ridge, xvar="lambda", label=TRUE)

ridge.coef = coef(ridge, s=0.1, exact=TRUE, x=x, y=y) #newer glmnet releases require the original x and y when exact=TRUE
ridge.coef
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 36.076511743
## crim -0.111102539
## zn 0.050887090
## indus 0.005784647
## nox -20.528464797
## rm 4.400431797
## age -0.004088521
## dis -1.700728401
## rad 0.292258832
## tax -0.010788618
## ptratio -1.055776373
## b 0.009837657
## lstat -0.436608612
plot(ridge, xvar="dev", label=TRUE)

newx = as.matrix(test[,1:12]) #test-set predictors as a matrix for predict()
ridge.y = predict(ridge, newx=newx, type="response", s=0.1)
plot(ridge.y, test$medv, xlab="Predicted", ylab="Actual",main="Ridge Regression")

ridge.resid = ridge.y - test$medv
mean(ridge.resid^2)
## [1] 20.49134
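#the s=0.1 above was picked by eye from the coefficient paths; as a sketch,
#cross-validation (mirroring the cv.glmnet section at the end) can pick it
set.seed(123)
ridge.cv = cv.glmnet(x, y, alpha=0) #alpha=0 keeps it ridge
ridge.cv$lambda.min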
#Lasso
lasso = glmnet(x, y, family="gaussian", alpha=1)
print(lasso)
##
## Call: glmnet(x = x, y = y, family = "gaussian", alpha = 1)
##
## Df %Dev Lambda
## [1,] 0 0.0000 6.854000
## [2,] 2 0.1051 6.245000
## [3,] 2 0.1943 5.690000
## [4,] 2 0.2684 5.185000
## [5,] 2 0.3299 4.724000
## [6,] 2 0.3809 4.304000
## [7,] 2 0.4233 3.922000
## [8,] 2 0.4585 3.574000
## [9,] 2 0.4877 3.256000
## [10,] 3 0.5148 2.967000
## [11,] 3 0.5415 2.703000
## [12,] 3 0.5637 2.463000
## [13,] 3 0.5821 2.244000
## [14,] 3 0.5974 2.045000
## [15,] 3 0.6101 1.863000
## [16,] 3 0.6206 1.698000
## [17,] 3 0.6294 1.547000
## [18,] 3 0.6366 1.409000
## [19,] 4 0.6435 1.284000
## [20,] 4 0.6501 1.170000
## [21,] 4 0.6555 1.066000
## [22,] 5 0.6602 0.971500
## [23,] 5 0.6642 0.885200
## [24,] 5 0.6675 0.806600
## [25,] 6 0.6704 0.734900
## [26,] 6 0.6729 0.669600
## [27,] 7 0.6751 0.610100
## [28,] 7 0.6827 0.555900
## [29,] 7 0.6890 0.506500
## [30,] 7 0.6942 0.461500
## [31,] 7 0.6985 0.420500
## [32,] 8 0.7033 0.383200
## [33,] 8 0.7075 0.349100
## [34,] 8 0.7109 0.318100
## [35,] 8 0.7138 0.289900
## [36,] 8 0.7162 0.264100
## [37,] 9 0.7182 0.240600
## [38,] 9 0.7211 0.219300
## [39,] 9 0.7234 0.199800
## [40,] 10 0.7254 0.182000
## [41,] 11 0.7281 0.165900
## [42,] 11 0.7304 0.151100
## [43,] 10 0.7323 0.137700
## [44,] 10 0.7339 0.125500
## [45,] 10 0.7351 0.114300
## [46,] 10 0.7362 0.104200
## [47,] 10 0.7371 0.094920
## [48,] 10 0.7379 0.086480
## [49,] 10 0.7385 0.078800
## [50,] 10 0.7390 0.071800
## [51,] 10 0.7394 0.065420
## [52,] 10 0.7398 0.059610
## [53,] 10 0.7401 0.054310
## [54,] 11 0.7403 0.049490
## [55,] 11 0.7405 0.045090
## [56,] 11 0.7407 0.041090
## [57,] 11 0.7408 0.037440
## [58,] 11 0.7410 0.034110
## [59,] 11 0.7411 0.031080
## [60,] 11 0.7411 0.028320
## [61,] 11 0.7412 0.025800
## [62,] 11 0.7413 0.023510
## [63,] 11 0.7413 0.021420
## [64,] 11 0.7414 0.019520
## [65,] 11 0.7414 0.017790
## [66,] 11 0.7414 0.016210
## [67,] 12 0.7414 0.014770
## [68,] 12 0.7415 0.013450
## [69,] 12 0.7415 0.012260
## [70,] 12 0.7415 0.011170
## [71,] 12 0.7415 0.010180
## [72,] 12 0.7415 0.009273
## [73,] 12 0.7416 0.008450
## [74,] 12 0.7416 0.007699
## [75,] 12 0.7416 0.007015
## [76,] 12 0.7416 0.006392
plot(lasso, xvar="lambda", label=TRUE)

lasso.coef = coef(lasso, s=0.015, exact=TRUE, x=x, y=y) #x and y again required by newer glmnet when exact=TRUE
lasso.coef
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 3.697616e+01
## crim -1.119973e-01
## zn 5.187351e-02
## indus 3.701671e-04
## nox -2.105488e+01
## rm 4.365174e+00
## age -2.256860e-03
## dis -1.739044e+00
## rad 3.037438e-01
## tax -1.117913e-02
## ptratio -1.063355e+00
## b 9.706682e-03
## lstat -4.427820e-01
lasso.y = predict(lasso, newx=newx, type="response", s=0.015)
plot(lasso.y, test$medv, xlab="Predicted", ylab="Actual", main="LASSO")

lasso.resid = lasso.y - test$medv
mean(lasso.resid^2)
## [1] 21.0103
#Elastic net
grid = expand.grid(.alpha=seq(0,1, by=.2), .lambda=seq(0.00,0.2, by=0.02))
table(grid)
## .lambda
## .alpha 0 0.02 0.04 0.06 0.08 0.1 0.12 0.14 0.16 0.18 0.2
## 0 1 1 1 1 1 1 1 1 1 1 1
## 0.2 1 1 1 1 1 1 1 1 1 1 1
## 0.4 1 1 1 1 1 1 1 1 1 1 1
## 0.6 1 1 1 1 1 1 1 1 1 1 1
## 0.8 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
control = trainControl(method="LOOCV")
enet.train = train(medv~., data=train, method="glmnet", trControl=control, tuneGrid=grid)
enet.train
## glmnet
##
## 363 samples
## 12 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 362, 362, 362, 362, 362, 362, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared
## 0.0 0.00 5.101644 0.7122665
## 0.0 0.02 5.101644 0.7122665
## 0.0 0.04 5.101644 0.7122665
## 0.0 0.06 5.101644 0.7122665
## 0.0 0.08 5.101644 0.7122665
## 0.0 0.10 5.101644 0.7122665
## 0.0 0.12 5.101644 0.7122665
## 0.0 0.14 5.101644 0.7122665
## 0.0 0.16 5.101644 0.7122665
## 0.0 0.18 5.101644 0.7122665
## 0.0 0.20 5.101644 0.7122665
## 0.2 0.00 5.071536 0.7152357
## 0.2 0.02 5.071317 0.7152482
## 0.2 0.04 5.069278 0.7154080
## 0.2 0.06 5.065218 0.7158055
## 0.2 0.08 5.064562 0.7158396
## 0.2 0.10 5.066455 0.7156021
## 0.2 0.12 5.068943 0.7153087
## 0.2 0.14 5.072067 0.7149528
## 0.2 0.16 5.075874 0.7145280
## 0.2 0.18 5.079915 0.7140835
## 0.2 0.20 5.083943 0.7136468
## 0.4 0.00 5.071136 0.7152805
## 0.4 0.02 5.070351 0.7153326
## 0.4 0.04 5.064600 0.7158881
## 0.4 0.06 5.065015 0.7157872
## 0.4 0.08 5.067938 0.7154283
## 0.4 0.10 5.072258 0.7149314
## 0.4 0.12 5.077440 0.7143532
## 0.4 0.14 5.082877 0.7137600
## 0.4 0.16 5.088727 0.7131325
## 0.4 0.18 5.095753 0.7123809
## 0.4 0.20 5.103554 0.7115487
## 0.6 0.00 5.071549 0.7152408
## 0.6 0.02 5.068851 0.7154756
## 0.6 0.04 5.064322 0.7158877
## 0.6 0.06 5.067782 0.7154509
## 0.6 0.08 5.073102 0.7148363
## 0.6 0.10 5.078605 0.7142275
## 0.6 0.12 5.086528 0.7133668
## 0.6 0.14 5.096769 0.7122587
## 0.6 0.16 5.108985 0.7109362
## 0.6 0.18 5.122668 0.7094528
## 0.6 0.20 5.137326 0.7078617
## 0.8 0.00 5.072097 0.7151838
## 0.8 0.02 5.065667 0.7158068
## 0.8 0.04 5.065458 0.7157364
## 0.8 0.06 5.070640 0.7151159
## 0.8 0.08 5.077388 0.7143610
## 0.8 0.10 5.088438 0.7131518
## 0.8 0.12 5.102777 0.7115903
## 0.8 0.14 5.120076 0.7097034
## 0.8 0.16 5.140142 0.7075030
## 0.8 0.18 5.162723 0.7050088
## 0.8 0.20 5.186842 0.7023278
## 1.0 0.00 5.072064 0.7151867
## 1.0 0.02 5.063951 0.7159768
## 1.0 0.04 5.067284 0.7155129
## 1.0 0.06 5.073814 0.7147564
## 1.0 0.08 5.086140 0.7133961
## 1.0 0.10 5.103660 0.7114788
## 1.0 0.12 5.125536 0.7090808
## 1.0 0.14 5.152399 0.7061095
## 1.0 0.16 5.182900 0.7027032
## 1.0 0.18 5.209522 0.6997390
## 1.0 0.20 5.224552 0.6981484
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.02.
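#the winning row is also stored on the train object, so the tuned values can
#be pulled out programmatically rather than read off the table (a sketch)
enet.train$bestTune #alpha = 1, lambda = 0.02 per the LOOCV results above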
enet = glmnet(x, y, family="gaussian", alpha=1, lambda=.02) #refit at the tuned values
enet.coef = coef(enet, s=.02, exact=TRUE, x=x, y=y)
enet.coef
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 36.665957641
## crim -0.110659882
## zn 0.051262134
## indus .
## nox -20.919387998
## rm 4.372997843
## age -0.001889596
## dis -1.721916544
## rad 0.296280440
## tax -0.010859856
## ptratio -1.060014694
## b 0.009657763
## lstat -0.442844577
enet.y = predict(enet, newx=newx, type="response", s=.02)
plot(enet.y, test$medv, xlab="Predicted", ylab="Actual", main="Elastic Net")

enet.resid = enet.y-test$medv
mean(enet.resid^2)
## [1] 20.97042
#cross validation with glmnet
set.seed(123)
lasso.cv = cv.glmnet(x, y) #defaults: alpha=1 (LASSO) and nfolds=10
plot(lasso.cv)

lasso.cv$lambda.min #lambda with the minimum CV error
## [1] 0.02351164
lasso.cv$lambda.1se #largest lambda within one standard error of the minimum
## [1] 0.2898625
coef(lasso.cv, s ="lambda.1se")
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 20.387146446
## crim -0.042684357
## zn 0.015833165
## indus .
## nox -11.079099176
## rm 4.858171117
## age .
## dis -0.826788579
## rad .
## tax .
## ptratio -0.872498509
## b 0.007227703
## lstat -0.425463297
lasso.y.cv = predict(lasso.cv, newx=newx, type="response", s="lambda.1se")
lasso.cv.resid = lasso.y.cv - test$medv
mean(lasso.cv.resid^2)
## [1] 20.66634
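#recap (a sketch): gather the test-set MSEs computed above for a side-by-side
#comparison; ridge edges out the others on this split
data.frame(model = c("best subsets OLS", "ridge", "LASSO",
                     "elastic net", "LASSO (lambda.1se)"),
           test.MSE = c(mean(resid.subfit^2), mean(ridge.resid^2),
                        mean(lasso.resid^2), mean(enet.resid^2),
                        mean(lasso.cv.resid^2)))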