Assignment 5: Subset Selection, Shrinkage Methods, and Dimension Reduction

library(leaps)
library(boot)
library(MASS)
library(class)
library(ROCR)
library(e1071)
library(glmnet)
library(pls)

1: Reading the Data

First we will read and clean the King County House Sales data set.

House=read.csv("kc house sales.csv")
House$date=as.Date(House$date,format="%Y%m%d")
#Treat view, waterfront, and zipcode as categorical variables
House$view=as.factor(House$view)
House$waterfront=as.factor(House$waterfront)
House$zipcode=as.factor(House$zipcode)
#Manually revise a single value in row 15871 (data-cleaning step)
House[15871,3]=3
attach(House)

2: Forward Stepwise Selection

Now we attempt to find the best subset of a multiple regression model using the forward selection heuristic. Values for the lowest Cp and BIC (and the number of predictors in those models) are provided.

fwd.fit=regsubsets(log(price)~(.-sqft_basement-zipcode)^2+zipcode, data=House,nvmax=500, method="forward")
fwd.sum=summary(fwd.fit)

#Cp
mincp=which.min(fwd.sum$cp)
plot(fwd.sum$cp,type="b")
points(mincp,fwd.sum$cp[mincp],pch=4,cex=2,col=2)

#BIC
plot(fwd.sum$bic,type="b")
minbic=which.min(fwd.sum$bic)
points(minbic,fwd.sum$bic[minbic],pch=4,cex=2,col=2)

#Prepare Results
fres = data.frame(c(min(fwd.sum$cp),which.min(fwd.sum$cp)),c(min(fwd.sum$bic),which.min(fwd.sum$bic)))
colnames(fres) = c("Cp", "BIC")
rownames(fres) = c("Min Value", "#of Predictors")
#Display Results
knitr::kable(fres, align = 'l', format='simple', caption="Forwards step-wise selection results")
Forwards step-wise selection results

                 Cp         BIC
---------------  ---------  ----------
Min Value        174.4023   -46771.02
#of Predictors   197.0000   131.00
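For reference, the terms that make up the Cp- or BIC-optimal subset can be listed directly from the regsubsets fit rather than counted off the plots (a sketch; object names as defined above):

#Terms selected at the BIC minimum of the forward search (includes the intercept)
fwd.bic.terms=names(coef(fwd.fit,id=which.min(fwd.sum$bic)))
length(fwd.bic.terms)
head(fwd.bic.terms)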

3: Backwards Stepwise Selection

Now we attempt to find the best subset of a multiple regression model using the backward selection heuristic. Values for the lowest Cp and BIC (and the number of predictors in those models) are provided.

bck.fit=regsubsets(log(price)~(.-sqft_basement-zipcode)^2+zipcode, data=House,nvmax=500, method="backward")
bck.sum=summary(bck.fit)

#Cp
mincp=which.min(bck.sum$cp)
plot(bck.sum$cp,type="b")
points(mincp,bck.sum$cp[mincp],pch=4,cex=2,col=2)

#BIC
plot(bck.sum$bic,type="b")
minbic=which.min(bck.sum$bic)
points(minbic,bck.sum$bic[minbic],pch=4,cex=2,col=2)

#Prepare Results
bres = data.frame(c(min(bck.sum$cp),which.min(bck.sum$cp)),c(min(bck.sum$bic),which.min(bck.sum$bic)))
colnames(bres) = c("Cp", "BIC")
rownames(bres) = c("Best Value", "#of Predictors")
#Display Results
knitr::kable(bres, align = 'l', format='simple', caption='Backwards step-wise selection results')
Backwards step-wise selection results

                 Cp         BIC
---------------  ---------  ---------
Best Value       120.2565   -46976.7
#of Predictors   167.0000   120.0

4: Comparing Forward/Backwards Stepwise

This step compares the performance metrics of the forward and backward step-wise selection subsets.

#Prepare Results
options(scipen=999)
bckresults = c(min(bck.sum$cp),min(bck.sum$bic),max(bck.sum$adjr2))
fwdresults = c(min(fwd.sum$cp), min(fwd.sum$bic), max(fwd.sum$adjr2))
fbckresults = data.frame(fwdresults, bckresults)
rownames(fbckresults) = c("Cp","BIC","AdjR2")
colnames(fbckresults) = c("Forward Stepwise", "Backward Stepwise")

#Display Results
knitr::kable(fbckresults, align = 'l', format='simple', caption = 'Forward & Backward step-wise selection results')
Forward & Backward step-wise selection results

        Forward Stepwise   Backward Stepwise
------  -----------------  ------------------
Cp      174.4023407        120.2564845
BIC     -46771.0211901     -46976.6952475
AdjR2   0.8925686          0.8926839

The backward selection heuristic achieved a noticeably lower Cp and BIC along with a slightly higher adjusted R squared, so the backward step-wise selection model is the better choice.

5: Backwards Selection Model

Below is the model obtained from the backward stepwise selection according to BIC. The AIC of the model is provided in the output.

bwd.glm=glm(log(price)~.-sqft_basement+bedrooms:sqft_living+bedrooms:sqft_lot+bedrooms:floors+bedrooms:grade+bedrooms:sqft_above+bedrooms:long+bedrooms:sqft_lot15+bathrooms:sqft_lot+bathrooms:floors+bathrooms:sqft_above+bathrooms:yr_built+bathrooms:long+bathrooms:sqft_lot15+sqft_living:floors+sqft_living:sqft_above+sqft_living:yr_renovated+sqft_living:lat+sqft_living:long+sqft_living:sqft_living15+sqft_living:view+sqft_lot:yr_built+sqft_lot:sqft_lot15+floors:yr_built+floors:lat+condition:grade+condition:yr_built+condition:yr_renovated+condition:sqft_living+grade:long+grade:waterfront+sqft_above:yr_built+sqft_above:lat+yr_built:long+yr_built:sqft_living15+yr_renovated:lat+yr_renovated:view+lat:sqft_lot15+lat:waterfront+long:date+long:view+long:waterfront+view:waterfront, House, family=gaussian)

bwd.glm$aic
## [1] -14301.49
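The terms written out in the formula above correspond to the predictors chosen at the BIC minimum of the backward search; they can be recovered from the regsubsets object rather than typed from memory (a sketch; assumes bck.fit and bck.sum from question 3):

#Dummy-level coefficient names of the BIC-optimal backward model; these map back to
#the main effects, zipcode levels, and interactions encoded in the formula above
bck.bic.terms=names(coef(bck.fit,id=which.min(bck.sum$bic)))
head(bck.bic.terms,15)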

6: Backwards Stepwise Cross-Validation

In this step we will perform 10-fold cross-validation on the model from the previous question in order to estimate its test error. The test MSE is stored in a variable for later comparison.

set.seed(100)
bck.cv <- cv.glm(House,bwd.glm,K=10)

#Store MSE
bck.cv.mse = bck.cv$delta[2]
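Note that cv.glm() returns two error estimates in its delta component; the second is a bias-corrected version of the first, and that is the value stored above. A quick look (assumes the bck.cv object from the chunk above):

#First entry: raw 10-fold CV estimate of the MSE; second entry: bias-corrected estimate
bck.cv$delta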

7: Ridge Regression

Now we will consider the same set of predictors from question 2, this time using ridge regression to shrink the coefficients and reduce variance. The lambda selected by cross-validation is shown in the output, and the corresponding test MSE is stored for later comparison.

#Prepare model matrix, response, and a grid of 100 lambda values from e^10 down to e^-5
x=model.matrix(log(price)~(.-sqft_basement-zipcode)^2+zipcode, House)[,-1]
y=log(price)
grid=exp(seq(10,-5,length=100))

cv.out=cv.glmnet(x,y,lambda=grid,alpha=0)
plot(cv.out)

bestlam.r=cv.out$lambda.min
bestlam.r
## [1] 0.006737947
min.r=which.min(cv.out$cvm)

#Store MSE
ridge.mse <- cv.out$cvm[min.r]
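To see the effect of the shrinkage, the coefficients at the CV-selected lambda can be extracted from the cross-validated fit (a sketch; assumes cv.out and bestlam.r from above):

#Coefficients at the chosen lambda; ridge shrinks every term but drops none
ridge.coefs=as.matrix(coef(cv.out,s=bestlam.r))[-1,1]
sum(ridge.coefs!=0)
head(sort(abs(ridge.coefs),decreasing=TRUE))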

8: The LASSO

Now we will apply the LASSO shrinkage method to the same set of predictors from question 2. The lambda selected by cross-validation is shown in the output, and the test MSE is stored for later comparison.

cv.out2=cv.glmnet(x,y,lambda=grid,alpha=1)
plot(cv.out2)

bestlam.l=cv.out2$lambda.min
bestlam.l
## [1] 0.006737947
min.l=which.min(cv.out2$cvm)

#Store MSE
lasso.mse <- cv.out2$cvm[min.l]
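Unlike ridge, the LASSO sets some coefficients exactly to zero; how many terms survive at the chosen lambda can be checked directly (a sketch; assumes cv.out2 and bestlam.l from above):

#Count how many terms the LASSO keeps versus zeroes out at the chosen lambda
lasso.coefs=as.matrix(coef(cv.out2,s=bestlam.l))[-1,1]
sum(lasso.coefs!=0)
sum(lasso.coefs==0)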

9: Model Comparison

Here is a comparison of the performance of the Backwards selection, ridge regression, and LASSO models.

#Prepare Results
options(digits = 5)
bcklr.res = data.frame("Backwards Selection"=bck.cv.mse, "Ridge Regression"=ridge.mse, "LASSO"=lasso.mse)
rownames(bcklr.res) = c("Test MSE")

#Display Results
knitr::kable(bcklr.res, align = 'l', format = 'simple')
           Backwards.Selection   Ridge.Regression   LASSO
---------  --------------------  -----------------  --------
Test MSE   0.03039               0.03202            0.03716

Of these models, backward selection provided the smallest test MSE. I would not recommend ridge regression or the LASSO here, because the test MSE achieved with both methods is worse than without shrinkage. This is likely because the number of candidate predictors is small relative to the number of observations, so the least squares fit already has low variance and shrinkage buys little.
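A quick check of the dimensions bears this out (assumes the model matrix x built in question 7):

dim(x)   #number of observations versus number of candidate model-matrix columns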

10: Principal Component Regression

In this step we will perform principal component regression to predict log(price) using all other variables in the data.

options(digits = 8)
pcr.fit=pcr(log(price)~., data=House, scale=T, validation="CV")
summary(pcr.fit)
## Data:    X dimension: 21613 90 
##  Y dimension: 21613 1
## Fit method: svdpc
## Number of components considered: 90
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV          0.5268   0.3982   0.2924    0.292   0.2655   0.2649    0.264
## adjCV       0.5268   0.3982   0.2924    0.292   0.2655   0.2649    0.264
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV      0.2498   0.2469   0.2465    0.2397    0.2384    0.2384    0.2381
## adjCV   0.2496   0.2468   0.2464    0.2385    0.2382    0.2382    0.2381
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## CV       0.2348    0.2346    0.2340    0.2336    0.2335    0.2333    0.2325
## adjCV    0.2349    0.2348    0.2344    0.2340    0.2338    0.2337    0.2327
##        21 comps  22 comps  23 comps  24 comps  25 comps  26 comps  27 comps
## CV       0.2320    0.2319    0.2318    0.2309    0.2309    0.2305    0.2303
## adjCV    0.2323    0.2322    0.2322    0.2312    0.2312    0.2308    0.2305
##        28 comps  29 comps  30 comps  31 comps  32 comps  33 comps  34 comps
## CV       0.2302    0.2301    0.2300    0.2295    0.2281    0.2278    0.2270
## adjCV    0.2305    0.2305    0.2305    0.2300    0.2286    0.2283    0.2276
##        35 comps  36 comps  37 comps  38 comps  39 comps  40 comps  41 comps
## CV       0.2261    0.2260    0.2259    0.2256    0.2253    0.2248    0.2244
## adjCV    0.2262    0.2261    0.2260    0.2257    0.2250    0.2248    0.2243
##        42 comps  43 comps  44 comps  45 comps  46 comps  47 comps  48 comps
## CV       0.2240    0.2236    0.2236    0.2235    0.2228    0.2226    0.2225
## adjCV    0.2239    0.2238    0.2239    0.2240    0.2228    0.2227    0.2228
##        49 comps  50 comps  51 comps  52 comps  53 comps  54 comps  55 comps
## CV       0.2225    0.2223    0.2216    0.2209    0.2205    0.2203    0.2202
## adjCV    0.2227    0.2226    0.2218    0.2206    0.2204    0.2203    0.2204
##        56 comps  57 comps  58 comps  59 comps  60 comps  61 comps  62 comps
## CV       0.2200    0.2199    0.2194    0.2195    0.2194    0.2193    0.2194
## adjCV    0.2201    0.2201    0.2195    0.2196    0.2195    0.2195    0.2195
##        63 comps  64 comps  65 comps  66 comps  67 comps  68 comps  69 comps
## CV       0.2192    0.2192    0.2192    0.2191    0.2190    0.2188    0.2185
## adjCV    0.2193    0.2193    0.2193    0.2193    0.2192    0.2190    0.2186
##        70 comps  71 comps  72 comps  73 comps  74 comps  75 comps  76 comps
## CV       0.2175    0.2156    0.2029    0.1975     0.196    0.1960    0.1942
## adjCV    0.2176    0.2160    0.2028    0.1963     0.196    0.1959    0.1942
##        77 comps  78 comps  79 comps  80 comps  81 comps  82 comps  83 comps
## CV       0.1924     0.192    0.1918    0.1916    0.1911    0.1903    0.1880
## adjCV    0.1923     0.192    0.1917    0.1915    0.1910    0.1903    0.1879
##        84 comps  85 comps  86 comps  87 comps  88 comps  89 comps  90 comps
## CV       0.1879    0.1878    0.1871    0.1844    0.1842    0.1835    0.1835
## adjCV    0.1879    0.1877    0.1871    0.1843    0.1842    0.1835    0.1834
## 
## TRAINING: % variance explained
##             1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X             6.078    8.947    11.29    13.36    15.14    16.75    18.11
## log(price)   42.886   69.201    69.27    74.61    74.72    74.91    77.53
##             8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X             19.41    20.67     21.89     23.12     24.31      25.5     26.66
## log(price)    78.07    78.14     79.49     79.56     79.56      79.6     80.19
##             15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## X              27.81     28.96     30.10     31.24     32.38     33.52
## log(price)     80.19     80.27     80.32     80.36     80.40     80.58
##             21 comps  22 comps  23 comps  24 comps  25 comps  26 comps
## X              34.65     35.79     36.93     38.06     39.20     40.33
## log(price)     80.66     80.68     80.68     80.83     80.84     80.91
##             27 comps  28 comps  29 comps  30 comps  31 comps  32 comps
## X              41.47     42.60     43.73     44.86     46.00     47.13
## log(price)     80.97     80.97     80.99     81.00     81.08     81.31
##             33 comps  34 comps  35 comps  36 comps  37 comps  38 comps
## X              48.26     49.39     50.52     51.64     52.77     53.90
## log(price)     81.36     81.48     81.70     81.71     81.73     81.79
##             39 comps  40 comps  41 comps  42 comps  43 comps  44 comps
## X              55.03     56.15     57.28     58.41     59.53     60.66
## log(price)     81.91     81.93     82.03     82.09     82.09     82.09
##             45 comps  46 comps  47 comps  48 comps  49 comps  50 comps
## X              61.79     62.91     64.04     65.16     66.29     67.41
## log(price)     82.10     82.28     82.30     82.30     82.31     82.33
##             51 comps  52 comps  53 comps  54 comps  55 comps  56 comps
## X              68.54     69.66     70.78     71.91     73.03     74.15
## log(price)     82.44     82.62     82.66     82.67     82.67     82.71
##             57 comps  58 comps  59 comps  60 comps  61 comps  62 comps
## X              75.27     76.39     77.51     78.63     79.75     80.87
## log(price)     82.71     82.79     82.79     82.81     82.81     82.81
##             63 comps  64 comps  65 comps  66 comps  67 comps  68 comps
## X              81.99     83.11     84.22     85.34     86.46     87.57
## log(price)     82.86     82.86     82.86     82.87     82.89     82.92
##             69 comps  70 comps  71 comps  72 comps  73 comps  74 comps
## X              88.66     89.72     90.77     91.81     92.81     93.79
## log(price)     83.00     83.13     83.37     85.31     86.25     86.27
##             75 comps  76 comps  77 comps  78 comps  79 comps  80 comps
## X              94.74     95.56     96.36     97.12     97.71     98.24
## log(price)     86.27     86.53     86.78     86.83     86.86     86.89
##             81 comps  82 comps  83 comps  84 comps  85 comps  86 comps
## X              98.65     98.96     99.25     99.51     99.75     99.92
## log(price)     86.97     87.07     87.39     87.40     87.42     87.50
##             87 comps  88 comps  89 comps  90 comps
## X              99.98     100.0    100.00    100.00
## log(price)     87.88      87.9     87.99     87.99
validationplot(pcr.fit, val.type="MSEP", legendpos="topright")

#Minimum CV RMSEP read off the pcr summary above: 0.1835, at 89 components
pcr.mse = 0.1835^2   #squared to give the CV estimate of test MSE
pcr.min = 89
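The minimum can also be located programmatically instead of being read off the printed summary, using the pls package's MSEP() (a sketch; assumes pcr.fit from above):

#CV mean squared error by number of components; the first entry is the
#intercept-only model, so subtract 1 to convert an index into a component count
pcr.cv.msep=MSEP(pcr.fit,estimate="CV")$val[1,1,]
which.min(pcr.cv.msep)-1
min(pcr.cv.msep)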

11: Partial Least Squares Regression

Now we will repeat question 10 using partial least squares regression.

pls.fit=plsr(log(price)~., data=House, scale=T, validation="CV")
summary(pls.fit)
## Data:    X dimension: 21613 90 
##  Y dimension: 21613 1
## Fit method: kernelpls
## Number of components considered: 90
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV          0.5268   0.2832   0.2071   0.1898   0.1878   0.1863   0.1856
## adjCV       0.5268   0.2832   0.2071   0.1897   0.1878   0.1863   0.1855
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV      0.1852   0.1848   0.1844    0.1843    0.1842    0.1842    0.1841
## adjCV   0.1852   0.1847   0.1844    0.1842    0.1842    0.1841    0.1841
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## CV       0.1841    0.1841    0.1840    0.1838    0.1838    0.1837    0.1837
## adjCV    0.1841    0.1840    0.1839    0.1838    0.1837    0.1837    0.1837
##        21 comps  22 comps  23 comps  24 comps  25 comps  26 comps  27 comps
## CV       0.1837    0.1836    0.1835    0.1835    0.1835    0.1835    0.1835
## adjCV    0.1836    0.1836    0.1835    0.1835    0.1835    0.1835    0.1835
##        28 comps  29 comps  30 comps  31 comps  32 comps  33 comps  34 comps
## CV       0.1835    0.1835    0.1835    0.1835    0.1835    0.1835    0.1835
## adjCV    0.1835    0.1835    0.1835    0.1835    0.1835    0.1835    0.1835
##        35 comps  36 comps  37 comps  38 comps  39 comps  40 comps  41 comps
## CV       0.1835    0.1835    0.1835    0.1835    0.1835    0.1835    0.1835
## adjCV    0.1835    0.1835    0.1835    0.1835    0.1835    0.1835    0.1835
##        42 comps  43 comps  44 comps  45 comps  46 comps  47 comps  48 comps
## CV       0.1835    0.1835    0.1843    0.1843    0.1843    0.1843    0.1843
## adjCV    0.1835    0.1834    0.1838    0.1839    0.1838    0.1838    0.1838
##        49 comps  50 comps  51 comps  52 comps  53 comps  54 comps  55 comps
## CV       0.1843    0.1843    0.1843    0.1843    0.1843    0.1843    0.1843
## adjCV    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838
##        56 comps  57 comps  58 comps  59 comps  60 comps  61 comps  62 comps
## CV       0.1843    0.1843    0.1843    0.1843    0.1843    0.1843    0.1843
## adjCV    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838
##        63 comps  64 comps  65 comps  66 comps  67 comps  68 comps  69 comps
## CV       0.1843    0.1843    0.1843    0.1843    0.1843    0.1843    0.1843
## adjCV    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838
##        70 comps  71 comps  72 comps  73 comps  74 comps  75 comps  76 comps
## CV       0.1843    0.1843    0.1843    0.1843    0.1843    0.1843    0.1843
## adjCV    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838
##        77 comps  78 comps  79 comps  80 comps  81 comps  82 comps  83 comps
## CV       0.1843    0.1843    0.1843    0.1843    0.1843    0.1843    0.1843
## adjCV    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838
##        84 comps  85 comps  86 comps  87 comps  88 comps  89 comps  90 comps
## CV       0.1843    0.1843    0.1843    0.1843    0.1843    0.1843    0.1843
## adjCV    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838    0.1838
## 
## TRAINING: % variance explained
##             1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X             5.605     8.61    10.23    11.94    12.79    13.74    15.24
## log(price)   71.129    84.61    87.14    87.41    87.62    87.72    87.77
##             8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X             16.47    17.43     18.56     19.46     20.44     21.42     22.17
## log(price)    87.82    87.87     87.89     87.90     87.90     87.91     87.91
##             15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## X              22.95     23.62     24.02     24.90     25.62     26.32
## log(price)     87.92     87.93     87.95     87.96     87.96     87.96
##             21 comps  22 comps  23 comps  24 comps  25 comps  26 comps
## X              26.99     27.49     28.02     28.86     29.39     30.33
## log(price)     87.97     87.98     87.99     87.99     87.99     87.99
##             27 comps  28 comps  29 comps  30 comps  31 comps  32 comps
## X              31.24     31.94     32.56     33.58     34.69     35.82
## log(price)     87.99     87.99     87.99     87.99     87.99     87.99
##             33 comps  34 comps  35 comps  36 comps  37 comps  38 comps
## X              36.93     38.01     39.09     40.20     41.34     42.48
## log(price)     87.99     87.99     87.99     87.99     87.99     87.99
##             39 comps  40 comps  41 comps  42 comps  43 comps  44 comps
## X              43.61     44.75     45.88     47.01     48.14     48.22
## log(price)     87.99     87.99     87.99     87.99     87.99     87.95
##             45 comps  46 comps  47 comps  48 comps  49 comps  50 comps
## X              49.12     49.28     49.45     49.62     49.79     49.96
## log(price)     87.94     87.94     87.94     87.94     87.94     87.94
##             51 comps  52 comps  53 comps  54 comps  55 comps  56 comps
## X              50.13     50.30     50.48     50.65     50.82     50.99
## log(price)     87.94     87.94     87.94     87.94     87.94     87.94
##             57 comps  58 comps  59 comps  60 comps  61 comps  62 comps
## X              51.16     51.33     51.51     51.68     51.85     52.02
## log(price)     87.94     87.94     87.94     87.94     87.94     87.94
##             63 comps  64 comps  65 comps  66 comps  67 comps  68 comps
## X              52.19     52.36     52.54     52.71     52.88     53.05
## log(price)     87.94     87.94     87.94     87.94     87.94     87.94
##             69 comps  70 comps  71 comps  72 comps  73 comps  74 comps
## X              53.22     53.40     53.57     53.74     53.91     54.08
## log(price)     87.94     87.94     87.94     87.94     87.94     87.94
##             75 comps  76 comps  77 comps  78 comps  79 comps  80 comps
## X              54.25     54.43     54.60     54.77     54.94     55.11
## log(price)     87.94     87.94     87.94     87.94     87.94     87.94
##             81 comps  82 comps  83 comps  84 comps  85 comps  86 comps
## X              55.29     55.46     55.63     55.80     55.97     56.14
## log(price)     87.94     87.94     87.94     87.94     87.94     87.94
##             87 comps  88 comps  89 comps  90 comps
## X              56.31     56.49     56.66     56.83
## log(price)     87.94     87.94     87.94     87.94
validationplot(pls.fit, val.type="MSEP", legendpos="topright")

#Minimum CV RMSEP read off the plsr summary above: 0.1835, first reached at 23 components
plsr.mse = 0.1835^2   #squared to give the CV estimate of test MSE
plsr.min = 23
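The same programmatic check works for the PLS fit; the printed summary shows the CV curve is essentially flat from about 23 components onward, so a relatively small model already attains the minimum error (a sketch; assumes pls.fit from above):

#CV RMSEP by number of components for the PLS fit
pls.cv.rmsep=RMSEP(pls.fit,estimate="CV")$val[1,1,]
pls.cv.rmsep[20:30]        #entries around the flat region (19 to 29 components)
which.min(pls.cv.rmsep)-1  #component count at the CV minimum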

12: Comparison of All the Models

Here is a comparison of all five models: backward selection, ridge regression, LASSO, principal component regression, and partial least squares.

#Prepare Results
bcklr.res["Principle Component Regression"] = pcr.mse
bcklr.res["Partial Least Squares Regression"] = plsr.mse

#Display Results
knitr::kable(bcklr.res, align = 'l', format='simple')
           Backwards.Selection   Ridge.Regression   LASSO        Principal Component Regression   Partial Least Squares Regression
---------  --------------------  -----------------  -----------  --------------------------------  ---------------------------------
Test MSE   0.0303886             0.03202018         0.03716253   0.03367225                         0.03367225

Of these models, the backward selection model provided the lowest test MSE, so I recommend the backward selection model.