Bias, variance: Reducible error
Because Ridge and Lasso only shrink the regression coefficients rather than removing predictors, their bias is larger than that of the selection methods covered in week 3; but if they can reduce variance by a greater amount, the result is a regression model with better predictive power.
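To make the shrinkage penalty concrete, recall the standard penalized least-squares objectives that ridge and lasso minimize (the usual textbook formulation; lambda is the tuning parameter that glmnet exposes as lambda):

$$\hat{\beta}^{\,ridge}=\underset{\beta}{\arg\min}\;\sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\Big)^2+\lambda\sum_{j=1}^{p}\beta_j^2$$

$$\hat{\beta}^{\,lasso}=\underset{\beta}{\arg\min}\;\sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\Big)^2+\lambda\sum_{j=1}^{p}|\beta_j|$$

A larger lambda shrinks the coefficients more strongly (more bias, less variance); lambda = 0 gives back ordinary least squares.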
library(MASS)
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
attach(Boston)
sum(is.na(medv))
## [1] 0
x<-model.matrix(medv~., data=Boston)[,-1]
y<-Boston$medv
set.seed(2013122032)
train=sample(1:nrow(Boston), nrow(Boston)/2)
test=(-train)
y.test=y[test]
library(leaps)
lm.fit.full<-regsubsets(medv~crim+zn+chas+nox+rm+dis+rad+tax+ptratio+black+lstat*rm+poly(lstat,2),data=Boston, nvmax=13)
## Reordering variables and trying again:
(reg.summary<-summary(lm.fit.full))
## Subset selection object
## Call: regsubsets.formula(medv ~ crim + zn + chas + nox + rm + dis +
## rad + tax + ptratio + black + lstat * rm + poly(lstat, 2),
## data = Boston, nvmax = 13)
## 14 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## poly(lstat, 2)2 FALSE FALSE
## rm:lstat FALSE FALSE
## poly(lstat, 2)1 FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## crim zn chas nox rm dis rad tax ptratio black lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " "*" " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " "*" " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " "*" " " " " " " "*" " " "*"
## 5 ( 1 ) " " " " " " " " "*" "*" " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " " " "*" "*" " " " " "*" " " " "
## 7 ( 1 ) "*" " " " " "*" "*" "*" " " " " "*" " " " "
## 8 ( 1 ) "*" " " " " "*" "*" "*" "*" " " "*" " " " "
## 9 ( 1 ) "*" " " " " "*" "*" "*" "*" "*" "*" " " "*"
## 10 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" " " " "
## 11 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## poly(lstat, 2)1 poly(lstat, 2)2 rm:lstat
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " "*"
## 3 ( 1 ) "*" " " "*"
## 4 ( 1 ) " " " " "*"
## 5 ( 1 ) " " " " "*"
## 6 ( 1 ) "*" " " "*"
## 7 ( 1 ) "*" " " "*"
## 8 ( 1 ) "*" " " "*"
## 9 ( 1 ) " " " " "*"
## 10 ( 1 ) "*" " " "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) " " "*" "*"
(bestsub.no.var<-which.min(reg.summary$cp))
## [1] 13
plot(reg.summary$cp, main="Relation between Cp and number of variables", xlab="Number of Variables", ylab="Cp", type="l", xlim=c(0,13.5))
points(bestsub.no.var, reg.summary$cp[bestsub.no.var], col="green",cex=2, pch=20)
bestsub<-lm(medv~crim+zn+chas+nox+rm+dis+rad+tax+ptratio+black+poly(lstat,2)+rm*lstat, data=Boston)
(mean((medv-predict(bestsub, newdata=Boston))[test]^2))
## [1] 147.9016
Boston.test=Boston[test,]
(best.subset.variance<-var(predict(bestsub, Boston.test,type="response")))
## [1] 83.0654
grid<-10^seq(10,-2,length=100)
ridge.mod<-glmnet(x,y,alpha=0, lambda=grid)
cv.out1<-cv.glmnet(x[train,],medv[train],alpha=0)
(bestlamb1<-cv.out1$lambda.min)
## [1] 0.7170866
ridge.mod<-glmnet(x[train,],y[train],alpha=0, lambda=grid)
ridge.pred<-predict(ridge.mod, s=bestlamb1, newx=x[test,])
out1<-glmnet(x,y, alpha=0, lambda=grid)
(ridge.coef<-predict(out1, type="coefficients",s=bestlamb1))
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 27.787117301
## crim -0.087241905
## zn 0.032331960
## indus -0.039606003
## chas 2.903460658
## nox -11.764352214
## rm 4.011966332
## age -0.003850166
## dis -1.106601138
## rad 0.150455495
## tax -0.005623005
## ptratio -0.851483398
## black 0.009050531
## lstat -0.469882889
(mean((ridge.pred-y.test)^2))
## [1] 30.31254
(var(predict(ridge.mod, s=bestlamb1, type="response", newx=x[test,])))
## 1
## 1 68.61045
lasso.mod<-glmnet(x[train,],y[train], alpha=1, lambda=grid)
cv.out2<-cv.glmnet(x[train,],medv[train],alpha=1)
(bestlamb2<-cv.out2$lambda.min)
## [1] 0.03916926
out2<-glmnet(x,y, alpha=1, lambda=grid)
(lasso.coef<-predict(out2, type="coefficients",s=bestlamb2))
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 33.682988194
## crim -0.094680931
## zn 0.039626479
## indus .
## chas 2.669592508
## nox -15.861315375
## rm 3.890988277
## age .
## dis -1.356020807
## rad 0.235652236
## tax -0.009111526
## ptratio -0.924153390
## black 0.008928566
## lstat -0.522530747
lasso.pred<-predict(lasso.mod, s=bestlamb2, newx=x[test,])
(mean((lasso.pred-y.test)^2))
## [1] 29.91682
(lasso.variance<-var(predict(lasso.mod, s=bestlamb2, type="response", newx=x[test,])))
## 1
## 1 73.51259
(best.subset.testMSE<-(mean((medv-predict(bestsub, newdata=Boston))[test]^2)))
## [1] 147.9016
(ridge.testMSE<-(mean((ridge.pred-y.test)^2)))
## [1] 30.31254
(lasso.testMSE<-(mean((lasso.pred-y.test)^2)))
## [1] 29.91682
which.max(c(best.subset.testMSE, ridge.testMSE, lasso.testMSE))
## [1] 1
The regression model built with Best Subset Selection had the largest test MSE. For Ridge and Lasso, the variance reduction from shrinkage outweighed the increase in bias, so the resulting regression models ended up with better predictive power.
Note, however, that the best lambda for ridge regression was 0.717 and the best lambda for the lasso was 0.039; both values are close to 0, so even though the shrinkage penalty was not large, it still reduced the test MSE substantially.
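To see where these chosen values fall on the grid of penalties that cross-validation searched, the cv.glmnet objects fitted above can be plotted directly (a minimal sketch; the plots show CV error against log(lambda), with dotted lines at lambda.min and lambda.1se):

plot(cv.out1)  # ridge: CV MSE vs log(lambda)
plot(cv.out2)  # lasso: CV MSE vs log(lambda)
c(ridge=log(cv.out1$lambda.min), lasso=log(cv.out2$lambda.min))  # positions of the chosen lambdas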
Ridge regression never shrinks coefficients all the way to 0, whereas the lasso produces some coefficients that are exactly 0. When most predictors are useful for predicting y, ridge tends to be better than the lasso at improving predictive power. In the case above, however, the lasso's test MSE is slightly smaller, because there are variables such as indus and age that are not meaningful for predicting medv.
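Which predictors the lasso dropped can be read off the sparse coefficient matrix computed above (a small check using lasso.coef at s = bestlamb2; per the printout it should return indus and age):

lc<-as.matrix(lasso.coef)      # dense copy of the sparse coefficient matrix
rownames(lc)[lc[,1]==0]        # predictors whose lasso coefficient is exactly zero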
(best.subset.variance<-var(predict(bestsub, Boston.test,type="response")))
## [1] 83.0654
(ridge.variance<-var(predict(ridge.mod, s=bestlamb1, type="response", newx=x[test,])))
## 1
## 1 68.61045
(lasso.variance<-var(predict(lasso.mod, s=bestlamb2, type="response", newx=x[test,])))
## 1
## 1 73.51259
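The prediction variances line up with the argument above: the best subset fit has the largest variance (83.07), while lasso (73.51) and ridge (68.61) trade a little bias for noticeably lower variance, which is where their improvement in test MSE comes from.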