Assignment 5: Subset Selection, Shrinkage Methods, and Dimension Reduction
library(leaps)
library(boot)
library(MASS)
library(class)
library(ROCR)
library(e1071)
library(glmnet)
library(pls)
1: Reading the Data
First we will read in the King County House Sales data set and prepare it for modeling.
House=read.csv("kc house sales.csv")
House$date=as.Date(House$date,format="%Y%m%d")
House$view=as.factor(House$view)
House$waterfront=as.factor(House$waterfront)
House$zipcode=as.factor(House$zipcode)
#Manually correct a suspected data-entry error in row 15871
House[15871,3]=3
attach(House)
2: Forward Stepwise Selection
Now we attempt to find the best subset of a multiple regression model using the forward selection heuristic. Values for the lowest Cp and BIC (and the number of predictors in those models) are provided.
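For reference, both criteria estimate test error by penalizing the training RSS for model size. A sketch of the standard forms (with d predictors and $\hat{\sigma}^2$ an estimate of the error variance; the exact constants used by the leaps package differ slightly, but the criteria penalize model size in the same way):
$$C_p = \frac{1}{n}\left(\mathrm{RSS} + 2d\hat{\sigma}^2\right), \qquad \mathrm{BIC} = \frac{1}{n}\left(\mathrm{RSS} + \log(n)\,d\hat{\sigma}^2\right)$$
Lower values of either criterion indicate a better trade-off between fit and model complexity.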
fwd.fit=regsubsets(log(price)~(.-sqft_basement-zipcode)^2+zipcode, data=House,nvmax=500, method="forward")
fwd.sum=summary(fwd.fit)
#Cp
mincp=which.min(fwd.sum$cp)
plot(fwd.sum$cp,type="b")
points(mincp,fwd.sum$cp[mincp],pch=4,cex=2,col=2)
#BIC
plot(fwd.sum$bic,type="b")
minbic=which.min(fwd.sum$bic)
points(minbic,fwd.sum$bic[minbic],pch=4,cex=2,col=2)
#Prepare Results
fres = data.frame(c(min(fwd.sum$cp),which.min(fwd.sum$cp)),c(min(fwd.sum$bic),which.min(fwd.sum$bic)))
colnames(fres) = c("Cp", "BIC")
rownames(fres) = c("Min Value", "#of Predictors")
#Display Results
knitr::kable(fres, align = 'l', format='simple', caption="Forwards step-wise selection results")

|                | Cp       | BIC       |
|----------------|----------|-----------|
| Min Value      | 174.4023 | -46771.02 |
| #of Predictors | 197.0000 | 131.00    |
3: Backwards Stepwise Selection
Now we attempt to find the best subset of a multiple regression model using the backward selection heuristic. Values for the lowest Cp and BIC (and the number of predictors in those models) are provided.
bck.fit=regsubsets(log(price)~(.-sqft_basement-zipcode)^2+zipcode, data=House,nvmax=500, method="backward")
bck.sum=summary(bck.fit)
#Cp
mincp=which.min(bck.sum$cp)
plot(bck.sum$cp,type="b")
points(mincp,bck.sum$cp[mincp],pch=4,cex=2,col=2)
#BIC
plot(bck.sum$bic,type="b")
minbic=which.min(bck.sum$bic)
points(minbic,bck.sum$bic[minbic],pch=4,cex=2,col=2)
#Prepare Results
bres = data.frame(c(min(bck.sum$cp),which.min(bck.sum$cp)),c(min(bck.sum$bic),which.min(bck.sum$bic)))
colnames(bres) = c("Cp", "BIC")
rownames(bres) = c("Best Value", "#of Predictors")
#Display Results
knitr::kable(bres, align = 'l', format='simple', caption='Backwards step-wise selection results')

|                | Cp       | BIC      |
|----------------|----------|----------|
| Best Value     | 120.2565 | -46976.7 |
| #of Predictors | 167.0000 | 120.0    |
4: Comparing Forward/Backwards Stepwise
This step compares the performance metrics of the forward and backward step-wise selection subsets.
#Prepare Results
options(scipen=999)
bckresults = c(min(bck.sum$cp),min(bck.sum$bic),max(bck.sum$adjr2))
fwdresults = c(min(fwd.sum$cp), min(fwd.sum$bic), max(fwd.sum$adjr2))
fbckresults = data.frame(fwdresults, bckresults)
rownames(fbckresults) = c("Cp","BIC","AdjR2")
colnames(fbckresults) = c("Forward Stepwise", "Backward Stepwise")
#Display Results
knitr::kable(fbckresults, align = 'l', format='simple', caption = 'Forward & Backward step-wise selection results')

|       | Forward Stepwise | Backward Stepwise |
|-------|------------------|-------------------|
| Cp    | 174.4023407      | 120.2564845       |
| BIC   | -46771.0211901   | -46976.6952475    |
| AdjR2 | 0.8925686        | 0.8926839         |
The model chosen by the backward selection heuristic achieved a substantially lower Cp and a lower BIC, along with a slightly higher adjusted R squared. The backwards step-wise selection model is the better choice.
5: Backwards Selection Model
Below is the model obtained from the backward stepwise selection according to BIC. The AIC of the model is provided in the output.
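The interaction terms in the glm() call below are the ones retained at the BIC minimum; they can also be listed directly from the regsubsets fit. A minimal sketch using the bck.fit and bck.sum objects from question 3:
#Extract the coefficients of the BIC-optimal backward-selection model
bck.best=coef(bck.fit, id=which.min(bck.sum$bic))
names(bck.best)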
bwd.glm=glm(log(price)~.-sqft_basement+bedrooms:sqft_living+bedrooms:sqft_lot+bedrooms:floors+bedrooms:grade+bedrooms:sqft_above+bedrooms:long+bedrooms:sqft_lot15+bathrooms:sqft_lot+bathrooms:floors+bathrooms:sqft_above+bathrooms:yr_built+bathrooms:long+bathrooms:sqft_lot15+sqft_living:floors+sqft_living:sqft_above+sqft_living:yr_renovated+sqft_living:lat+sqft_living:long+sqft_living:sqft_living15+sqft_living:view+sqft_lot:yr_built+sqft_lot:sqft_lot15+floors:yr_built+floors:lat+condition:grade+condition:yr_built+condition:yr_renovated+condition:sqft_living+grade:long+grade:waterfront+sqft_above:yr_built+sqft_above:lat+yr_built:long+yr_built:sqft_living15+yr_renovated:lat+yr_renovated:view+lat:sqft_lot15+lat:waterfront+long:date+long:view+long:waterfront+view:waterfront, House, family=gaussian)
bwd.glm$aic
## [1] -14301.49
6: Backwards Stepwise Cross-Validation
In this step we perform 10-fold cross-validation on the model from the previous question in order to measure its performance. The test MSE is stored in a variable for later comparison.
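The reported value is essentially the usual k-fold estimate, the average of the mean squared prediction errors over the held-out folds:
$$\mathrm{CV}_{(10)} = \frac{1}{10}\sum_{k=1}^{10}\mathrm{MSE}_k$$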
set.seed(100)
bck.cv <- cv.glm(House,bwd.glm,K=10)
#Store MSE (delta[2] is cv.glm's bias-adjusted estimate of the prediction error)
bck.cv.mse = bck.cv$delta[2]
7: Ridge Regression
Now we will consider the same model from question 2, this time using ridge regression to shrink the coefficients and reduce variance. The best lambda found by cross-validation is provided in the output. The test MSE is stored and will be compared later.
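Concretely, for each value of $\lambda$ in the grid below, ridge regression chooses coefficients to minimize the penalized least-squares criterion
$$\sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{p}\beta_j x_{ij}\Big)^2 + \lambda\sum_{j=1}^{p}\beta_j^2$$
so larger values of $\lambda$ shrink the coefficients more strongly toward zero.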
#Prepare model matrix and grid of lambda values
x=model.matrix(log(price)~(.-sqft_basement-zipcode)^2+zipcode, House)[,-1]
y=log(price)
grid=exp(1)^seq(10,-5,length=100)
cv.out=cv.glmnet(x,y,lambda=grid,alpha=0)
plot(cv.out)
bestlam.r=cv.out$lambda.min
bestlam.r
## [1] 0.006737947
min.r=which.min(cv.out$cvm)
#Store MSE
ridge.mse <- cv.out$cvm[min.r]
8: The LASSO
Now we will use the LASSO shrinkage method on the model from question 2. The best lambda found by cross-validation is included in the output.
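The LASSO replaces the ridge penalty with an $\ell_1$ penalty,
$$\sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{p}\beta_j x_{ij}\Big)^2 + \lambda\sum_{j=1}^{p}\lvert\beta_j\rvert$$
which can set some coefficients exactly to zero, so it performs variable selection as well as shrinkage.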
cv.out2=cv.glmnet(x,y,lambda=grid,alpha=1)
plot(cv.out2)
bestlam.l=cv.out2$lambda.min
bestlam.l
## [1] 0.006737947
min.l=which.min(cv.out2$cvm)
#Store MSE
lasso.mse <- cv.out2$cvm[min.l]
9: Model Comparison
Here is a comparison of the performance of the Backwards selection, ridge regression, and LASSO models.
#Prepare Results
options(digits = 5)
bcklr.res = data.frame("Backwards Selection"=bck.cv.mse, "Ridge Regression"=ridge.mse, "LASSO"=lasso.mse)
rownames(bcklr.res) = c("Test MSE")
#Display Results
knitr::kable(bcklr.res, align = 'l', format = 'simple')

|          | Backwards.Selection | Ridge.Regression | LASSO   |
|----------|---------------------|------------------|---------|
| Test MSE | 0.03039             | 0.03202          | 0.03716 |
Of these models, Backwards selection provided the smallest test MSE. I would not recommend using ridge regression or the LASSO here, because both achieve a higher test MSE than the unpenalized backward-selection model. This is probably because the number of predictors is small relative to the number of observations (over 21,000 houses), so the least-squares fit already has low variance and the extra bias introduced by shrinkage is not worthwhile.
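One way to probe this explanation would be to check how many coefficients the LASSO actually sets to zero at the chosen lambda. A minimal sketch using the cv.out2 fit from question 8 (not part of the original analysis):
lasso.coef=predict(cv.out2, type="coefficients", s=bestlam.l)
sum(lasso.coef != 0) #number of terms kept (including the intercept)
sum(lasso.coef == 0) #number of terms shrunk exactly to zero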
10: Principal Component Regression
In this step we will perform principal component regression to predict log(price) using all other variables in the data.
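As a reminder of what PCR fits: the predictors are first replaced by a small number M of principal components, and the response is then regressed on those components (standard formulation, not specific to this data set):
$$Z_m = \sum_{j=1}^{p}\phi_{jm}X_j, \qquad \log(\text{price}) \approx \theta_0 + \sum_{m=1}^{M}\theta_m Z_m$$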
options(digits = 8)
pcr.fit=pcr(log(price)~., data=House, scale=T, validation="CV")
summary(pcr.fit)
## Data: X dimension: 21613 90
## Y dimension: 21613 1
## Fit method: svdpc
## Number of components considered: 90
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.5268 0.3982 0.2924 0.292 0.2655 0.2649 0.264
## adjCV 0.5268 0.3982 0.2924 0.292 0.2655 0.2649 0.264
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 0.2498 0.2469 0.2465 0.2397 0.2384 0.2384 0.2381
## adjCV 0.2496 0.2468 0.2464 0.2385 0.2382 0.2382 0.2381
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 0.2348 0.2346 0.2340 0.2336 0.2335 0.2333 0.2325
## adjCV 0.2349 0.2348 0.2344 0.2340 0.2338 0.2337 0.2327
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps 27 comps
## CV 0.2320 0.2319 0.2318 0.2309 0.2309 0.2305 0.2303
## adjCV 0.2323 0.2322 0.2322 0.2312 0.2312 0.2308 0.2305
## 28 comps 29 comps 30 comps 31 comps 32 comps 33 comps 34 comps
## CV 0.2302 0.2301 0.2300 0.2295 0.2281 0.2278 0.2270
## adjCV 0.2305 0.2305 0.2305 0.2300 0.2286 0.2283 0.2276
## 35 comps 36 comps 37 comps 38 comps 39 comps 40 comps 41 comps
## CV 0.2261 0.2260 0.2259 0.2256 0.2253 0.2248 0.2244
## adjCV 0.2262 0.2261 0.2260 0.2257 0.2250 0.2248 0.2243
## 42 comps 43 comps 44 comps 45 comps 46 comps 47 comps 48 comps
## CV 0.2240 0.2236 0.2236 0.2235 0.2228 0.2226 0.2225
## adjCV 0.2239 0.2238 0.2239 0.2240 0.2228 0.2227 0.2228
## 49 comps 50 comps 51 comps 52 comps 53 comps 54 comps 55 comps
## CV 0.2225 0.2223 0.2216 0.2209 0.2205 0.2203 0.2202
## adjCV 0.2227 0.2226 0.2218 0.2206 0.2204 0.2203 0.2204
## 56 comps 57 comps 58 comps 59 comps 60 comps 61 comps 62 comps
## CV 0.2200 0.2199 0.2194 0.2195 0.2194 0.2193 0.2194
## adjCV 0.2201 0.2201 0.2195 0.2196 0.2195 0.2195 0.2195
## 63 comps 64 comps 65 comps 66 comps 67 comps 68 comps 69 comps
## CV 0.2192 0.2192 0.2192 0.2191 0.2190 0.2188 0.2185
## adjCV 0.2193 0.2193 0.2193 0.2193 0.2192 0.2190 0.2186
## 70 comps 71 comps 72 comps 73 comps 74 comps 75 comps 76 comps
## CV 0.2175 0.2156 0.2029 0.1975 0.196 0.1960 0.1942
## adjCV 0.2176 0.2160 0.2028 0.1963 0.196 0.1959 0.1942
## 77 comps 78 comps 79 comps 80 comps 81 comps 82 comps 83 comps
## CV 0.1924 0.192 0.1918 0.1916 0.1911 0.1903 0.1880
## adjCV 0.1923 0.192 0.1917 0.1915 0.1910 0.1903 0.1879
## 84 comps 85 comps 86 comps 87 comps 88 comps 89 comps 90 comps
## CV 0.1879 0.1878 0.1871 0.1844 0.1842 0.1835 0.1835
## adjCV 0.1879 0.1877 0.1871 0.1843 0.1842 0.1835 0.1834
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 6.078 8.947 11.29 13.36 15.14 16.75 18.11
## log(price) 42.886 69.201 69.27 74.61 74.72 74.91 77.53
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 19.41 20.67 21.89 23.12 24.31 25.5 26.66
## log(price) 78.07 78.14 79.49 79.56 79.56 79.6 80.19
## 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## X 27.81 28.96 30.10 31.24 32.38 33.52
## log(price) 80.19 80.27 80.32 80.36 80.40 80.58
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps
## X 34.65 35.79 36.93 38.06 39.20 40.33
## log(price) 80.66 80.68 80.68 80.83 80.84 80.91
## 27 comps 28 comps 29 comps 30 comps 31 comps 32 comps
## X 41.47 42.60 43.73 44.86 46.00 47.13
## log(price) 80.97 80.97 80.99 81.00 81.08 81.31
## 33 comps 34 comps 35 comps 36 comps 37 comps 38 comps
## X 48.26 49.39 50.52 51.64 52.77 53.90
## log(price) 81.36 81.48 81.70 81.71 81.73 81.79
## 39 comps 40 comps 41 comps 42 comps 43 comps 44 comps
## X 55.03 56.15 57.28 58.41 59.53 60.66
## log(price) 81.91 81.93 82.03 82.09 82.09 82.09
## 45 comps 46 comps 47 comps 48 comps 49 comps 50 comps
## X 61.79 62.91 64.04 65.16 66.29 67.41
## log(price) 82.10 82.28 82.30 82.30 82.31 82.33
## 51 comps 52 comps 53 comps 54 comps 55 comps 56 comps
## X 68.54 69.66 70.78 71.91 73.03 74.15
## log(price) 82.44 82.62 82.66 82.67 82.67 82.71
## 57 comps 58 comps 59 comps 60 comps 61 comps 62 comps
## X 75.27 76.39 77.51 78.63 79.75 80.87
## log(price) 82.71 82.79 82.79 82.81 82.81 82.81
## 63 comps 64 comps 65 comps 66 comps 67 comps 68 comps
## X 81.99 83.11 84.22 85.34 86.46 87.57
## log(price) 82.86 82.86 82.86 82.87 82.89 82.92
## 69 comps 70 comps 71 comps 72 comps 73 comps 74 comps
## X 88.66 89.72 90.77 91.81 92.81 93.79
## log(price) 83.00 83.13 83.37 85.31 86.25 86.27
## 75 comps 76 comps 77 comps 78 comps 79 comps 80 comps
## X 94.74 95.56 96.36 97.12 97.71 98.24
## log(price) 86.27 86.53 86.78 86.83 86.86 86.89
## 81 comps 82 comps 83 comps 84 comps 85 comps 86 comps
## X 98.65 98.96 99.25 99.51 99.75 99.92
## log(price) 86.97 87.07 87.39 87.40 87.42 87.50
## 87 comps 88 comps 89 comps 90 comps
## X 99.98 100.0 100.00 100.00
## log(price) 87.88 87.9 87.99 87.99
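Rather than reading the best value off the printed summary, the CV-optimal number of components could also be extracted programmatically. A minimal sketch, assuming the cross-validated RMSEP array returned by the pls package (its first entry is the intercept-only model, so one is subtracted to get the component count):
#Programmatic alternative to reading the summary by eye
pcr.rmsep=RMSEP(pcr.fit, estimate="CV")$val
which.min(pcr.rmsep)-1 #number of components with the lowest CV RMSEP
min(pcr.rmsep)^2 #corresponding CV estimate of the test MSE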
validationplot(pcr.fit, val.type="MSEP", legendpos="topright")
#Results visually obtained from pcr summary
pcr.mse = 0.1835^2
pcr.min = 89
11: Partial Least Squares Regression
Now we will repeat question 10 for partial least squares regression.
pls.fit=plsr(log(price)~., data=House, scale=T, validation="CV")
summary(pls.fit)
## Data: X dimension: 21613 90
## Y dimension: 21613 1
## Fit method: kernelpls
## Number of components considered: 90
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.5268 0.2832 0.2071 0.1898 0.1878 0.1863 0.1856
## adjCV 0.5268 0.2832 0.2071 0.1897 0.1878 0.1863 0.1855
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 0.1852 0.1848 0.1844 0.1843 0.1842 0.1842 0.1841
## adjCV 0.1852 0.1847 0.1844 0.1842 0.1842 0.1841 0.1841
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 0.1841 0.1841 0.1840 0.1838 0.1838 0.1837 0.1837
## adjCV 0.1841 0.1840 0.1839 0.1838 0.1837 0.1837 0.1837
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps 27 comps
## CV 0.1837 0.1836 0.1835 0.1835 0.1835 0.1835 0.1835
## adjCV 0.1836 0.1836 0.1835 0.1835 0.1835 0.1835 0.1835
## 28 comps 29 comps 30 comps 31 comps 32 comps 33 comps 34 comps
## CV 0.1835 0.1835 0.1835 0.1835 0.1835 0.1835 0.1835
## adjCV 0.1835 0.1835 0.1835 0.1835 0.1835 0.1835 0.1835
## 35 comps 36 comps 37 comps 38 comps 39 comps 40 comps 41 comps
## CV 0.1835 0.1835 0.1835 0.1835 0.1835 0.1835 0.1835
## adjCV 0.1835 0.1835 0.1835 0.1835 0.1835 0.1835 0.1835
## 42 comps 43 comps 44 comps 45 comps 46 comps 47 comps 48 comps
## CV 0.1835 0.1835 0.1843 0.1843 0.1843 0.1843 0.1843
## adjCV 0.1835 0.1834 0.1838 0.1839 0.1838 0.1838 0.1838
## 49 comps 50 comps 51 comps 52 comps 53 comps 54 comps 55 comps
## CV 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843
## adjCV 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838
## 56 comps 57 comps 58 comps 59 comps 60 comps 61 comps 62 comps
## CV 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843
## adjCV 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838
## 63 comps 64 comps 65 comps 66 comps 67 comps 68 comps 69 comps
## CV 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843
## adjCV 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838
## 70 comps 71 comps 72 comps 73 comps 74 comps 75 comps 76 comps
## CV 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843
## adjCV 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838
## 77 comps 78 comps 79 comps 80 comps 81 comps 82 comps 83 comps
## CV 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843
## adjCV 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838
## 84 comps 85 comps 86 comps 87 comps 88 comps 89 comps 90 comps
## CV 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843 0.1843
## adjCV 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838 0.1838
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 5.605 8.61 10.23 11.94 12.79 13.74 15.24
## log(price) 71.129 84.61 87.14 87.41 87.62 87.72 87.77
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 16.47 17.43 18.56 19.46 20.44 21.42 22.17
## log(price) 87.82 87.87 87.89 87.90 87.90 87.91 87.91
## 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## X 22.95 23.62 24.02 24.90 25.62 26.32
## log(price) 87.92 87.93 87.95 87.96 87.96 87.96
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps
## X 26.99 27.49 28.02 28.86 29.39 30.33
## log(price) 87.97 87.98 87.99 87.99 87.99 87.99
## 27 comps 28 comps 29 comps 30 comps 31 comps 32 comps
## X 31.24 31.94 32.56 33.58 34.69 35.82
## log(price) 87.99 87.99 87.99 87.99 87.99 87.99
## 33 comps 34 comps 35 comps 36 comps 37 comps 38 comps
## X 36.93 38.01 39.09 40.20 41.34 42.48
## log(price) 87.99 87.99 87.99 87.99 87.99 87.99
## 39 comps 40 comps 41 comps 42 comps 43 comps 44 comps
## X 43.61 44.75 45.88 47.01 48.14 48.22
## log(price) 87.99 87.99 87.99 87.99 87.99 87.95
## 45 comps 46 comps 47 comps 48 comps 49 comps 50 comps
## X 49.12 49.28 49.45 49.62 49.79 49.96
## log(price) 87.94 87.94 87.94 87.94 87.94 87.94
## 51 comps 52 comps 53 comps 54 comps 55 comps 56 comps
## X 50.13 50.30 50.48 50.65 50.82 50.99
## log(price) 87.94 87.94 87.94 87.94 87.94 87.94
## 57 comps 58 comps 59 comps 60 comps 61 comps 62 comps
## X 51.16 51.33 51.51 51.68 51.85 52.02
## log(price) 87.94 87.94 87.94 87.94 87.94 87.94
## 63 comps 64 comps 65 comps 66 comps 67 comps 68 comps
## X 52.19 52.36 52.54 52.71 52.88 53.05
## log(price) 87.94 87.94 87.94 87.94 87.94 87.94
## 69 comps 70 comps 71 comps 72 comps 73 comps 74 comps
## X 53.22 53.40 53.57 53.74 53.91 54.08
## log(price) 87.94 87.94 87.94 87.94 87.94 87.94
## 75 comps 76 comps 77 comps 78 comps 79 comps 80 comps
## X 54.25 54.43 54.60 54.77 54.94 55.11
## log(price) 87.94 87.94 87.94 87.94 87.94 87.94
## 81 comps 82 comps 83 comps 84 comps 85 comps 86 comps
## X 55.29 55.46 55.63 55.80 55.97 56.14
## log(price) 87.94 87.94 87.94 87.94 87.94 87.94
## 87 comps 88 comps 89 comps 90 comps
## X 56.31 56.49 56.66 56.83
## log(price) 87.94 87.94 87.94 87.94
validationplot(pls.fit, val.type="MSEP", legendpos="topright")
#Results visually obtained from the plsr summary
plsr.mse = 0.1835^2
plsr.min = 23
12: Comparison of all the models
Here is a comparison of all five models: backward selection, ridge regression, LASSO, principal component regression, and partial least squares.
#Prepare Results
bcklr.res["Principle Component Regression"] = pcr.mse
bcklr.res["Partial Least Squares Regression"] = plsr.mse
#Display Results
knitr::kable(bcklr.res, align = 'l', format='simple')

|          | Backwards.Selection | Ridge.Regression | LASSO      | Principal Component Regression | Partial Least Squares Regression |
|----------|---------------------|------------------|------------|--------------------------------|----------------------------------|
| Test MSE | 0.0303886           | 0.03202018       | 0.03716253 | 0.03367225                     | 0.03367225                       |
Of these models, the Backwards selection model provided the lowest test MSE; I therefore recommend the backwards selection model.