Bias, variance: Reducible error
Because Ridge and Lasso only shrink the regression coefficients rather than removing predictors, their bias is larger than that of the selection methods covered in week 3; but if they can reduce variance by a greater amount, the result is a regression model with better predictive power.
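To make the shrinkage penalty concrete, recall the standard penalized least-squares objectives that ridge and lasso minimize (the usual textbook formulation; lambda is the tuning parameter that glmnet exposes as lambda):

$$\hat{\beta}^{\,ridge}=\underset{\beta}{\arg\min}\;\sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\Big)^2+\lambda\sum_{j=1}^{p}\beta_j^2$$

$$\hat{\beta}^{\,lasso}=\underset{\beta}{\arg\min}\;\sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\Big)^2+\lambda\sum_{j=1}^{p}|\beta_j|$$

A larger lambda shrinks the coefficients more strongly (more bias, less variance); lambda = 0 gives back ordinary least squares.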
library(MASS)
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
attach(Boston)
sum(is.na(medv))
## [1] 0
x<-model.matrix(medv~., data=Boston)[,-1]
y<-Boston$medv
set.seed(2013122032)
train=sample(1:nrow(Boston), nrow(Boston)/2)
test=(-train)
y.test=y[test]
library(leaps)
lm.fit.full<-regsubsets(medv~crim+zn+chas+nox+rm+dis+rad+tax+ptratio+black+lstat*rm+poly(lstat,2),data=Boston, nvmax=13)
## Reordering variables and trying again:
(reg.summary<-summary(lm.fit.full))
## Subset selection object
## Call: regsubsets.formula(medv ~ crim + zn + chas + nox + rm + dis +
## rad + tax + ptratio + black + lstat * rm + poly(lstat, 2),
## data = Boston, nvmax = 13)
## 14 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## poly(lstat, 2)2 FALSE FALSE
## rm:lstat FALSE FALSE
## poly(lstat, 2)1 FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## crim zn chas nox rm dis rad tax ptratio black lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " "*" " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " "*" " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " "*" " " " " " " "*" " " "*"
## 5 ( 1 ) " " " " " " " " "*" "*" " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " " " "*" "*" " " " " "*" " " " "
## 7 ( 1 ) "*" " " " " "*" "*" "*" " " " " "*" " " " "
## 8 ( 1 ) "*" " " " " "*" "*" "*" "*" " " "*" " " " "
## 9 ( 1 ) "*" " " " " "*" "*" "*" "*" "*" "*" " " "*"
## 10 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" " " " "
## 11 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## poly(lstat, 2)1 poly(lstat, 2)2 rm:lstat
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " "*"
## 3 ( 1 ) "*" " " "*"
## 4 ( 1 ) " " " " "*"
## 5 ( 1 ) " " " " "*"
## 6 ( 1 ) "*" " " "*"
## 7 ( 1 ) "*" " " "*"
## 8 ( 1 ) "*" " " "*"
## 9 ( 1 ) " " " " "*"
## 10 ( 1 ) "*" " " "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) " " "*" "*"
(bestsub.no.var<-which.min(reg.summary$cp))
## [1] 13
plot(reg.summary$cp, main="Relation between Cp and number of variables", xlab="Number of Variables", ylab="Cp", type="l", xlim=c(0,13.5))
points(bestsub.no.var, reg.summary$cp[bestsub.no.var], col="green",cex=2, pch=20)
bestsub<-lm(medv~crim+zn+chas+nox+rm+dis+rad+tax+ptratio+black+poly(lstat,2)+rm*lstat, data=Boston)
(mean((medv-predict(bestsub, newdata=Boston))[test]^2))
## [1] 147.9016
Boston.test=Boston[test,]
(best.subset.variance<-var(predict(bestsub, Boston.test,type="response")))
## [1] 83.0654
grid<-10^seq(10,-2,length=100)
ridge.mod<-glmnet(x,y,alpha=0, lambda=grid)
cv.out1<-cv.glmnet(x[train,],medv[train],alpha=0)
(bestlamb1<-cv.out1$lambda.min)
## [1] 0.7170866
ridge.mod<-glmnet(x[train,],y[train],alpha=0, lambda=grid)
ridge.pred<-predict(ridge.mod, s=bestlamb1, newx=x[test,])
out1<-glmnet(x,y, alpha=0, lambda=grid)
(ridge.coef<-predict(out1, type="coefficients",s=bestlamb1))
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 27.787117301
## crim -0.087241905
## zn 0.032331960
## indus -0.039606003
## chas 2.903460658
## nox -11.764352214
## rm 4.011966332
## age -0.003850166
## dis -1.106601138
## rad 0.150455495
## tax -0.005623005
## ptratio -0.851483398
## black 0.009050531
## lstat -0.469882889
(mean((ridge.pred-y.test)^2))
## [1] 30.31254
(var(predict(ridge.mod, s=bestlamb1, type="response", newx=x[test,])))
## 1
## 1 68.61045
lasso.mod<-glmnet(x[train,],y[train], alpha=1, lambda=grid)
cv.out2<-cv.glmnet(x[train,],medv[train],alpha=1)
(bestlamb2<-cv.out2$lambda.min)
## [1] 0.03916926
out2<-glmnet(x,y, alpha=1, lambda=grid)
(lasso.coef<-predict(out2, type="coefficients",s=bestlamb2))
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 33.682988194
## crim -0.094680931
## zn 0.039626479
## indus .
## chas 2.669592508
## nox -15.861315375
## rm 3.890988277
## age .
## dis -1.356020807
## rad 0.235652236
## tax -0.009111526
## ptratio -0.924153390
## black 0.008928566
## lstat -0.522530747
lasso.pred<-predict(lasso.mod, s=bestlamb2, newx=x[test,])
(mean((lasso.pred-y.test)^2))
## [1] 29.91682
(lasso.variance<-var(predict(lasso.mod, s=bestlamb2, type="response", newx=x[test,])))
## 1
## 1 73.51259
(best.subset.testMSE<-(mean((medv-predict(bestsub, newdata=Boston))[test]^2)))
## [1] 147.9016
(ridge.testMSE<-(mean((ridge.pred-y.test)^2)))
## [1] 30.31254
(lasso.testMSE<-(mean((lasso.pred-y.test)^2)))
## [1] 29.91682
which.max(c(best.subset.testMSE, ridge.testMSE, lasso.testMSE))
## [1] 1
The regression model built with Best Subset Selection had the largest test MSE. For Ridge and Lasso, the variance reduction from shrinkage outweighed the increase in bias, so the resulting regression models ended up with better predictive power.
Note, however, that the best lambda for ridge regression was 0.717 and the best lambda for the lasso was 0.039; both values are close to 0, so even though the shrinkage penalty was not large, it still reduced the test MSE substantially.
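To see where these chosen values fall on the grid of penalties that cross-validation searched, the cv.glmnet objects fitted above can be plotted directly (a minimal sketch; the plots show CV error against log(lambda), with dotted lines at lambda.min and lambda.1se):

plot(cv.out1)  # ridge: CV MSE vs log(lambda)
plot(cv.out2)  # lasso: CV MSE vs log(lambda)
c(ridge=log(cv.out1$lambda.min), lasso=log(cv.out2$lambda.min))  # positions of the chosen lambdas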
Ridge regression never shrinks coefficients all the way to 0, whereas the lasso produces some coefficients that are exactly 0. When most predictors are useful for predicting y, ridge tends to be better than the lasso at improving predictive power. In the case above, however, the lasso's test MSE is slightly smaller, because there are variables such as indus and age that are not meaningful for predicting medv.
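Which predictors the lasso dropped can be read off the sparse coefficient matrix computed above (a small check using lasso.coef at s = bestlamb2; per the printout it should return indus and age):

lc<-as.matrix(lasso.coef)      # dense copy of the sparse coefficient matrix
rownames(lc)[lc[,1]==0]        # predictors whose lasso coefficient is exactly zero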
(best.subset.variance<-var(predict(bestsub, Boston.test,type="response")))
## [1] 83.0654
(ridge.variance<-var(predict(ridge.mod, s=bestlamb1, type="response", newx=x[test,])))
## 1
## 1 68.61045
(lasso.variance<-var(predict(lasso.mod, s=bestlamb2, type="response", newx=x[test,])))
## 1
## 1 73.51259
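The prediction variances line up with the argument above: the best subset fit has the largest variance (83.07), while lasso (73.51) and ridge (68.61) trade a little bias for noticeably lower variance, which is where their improvement in test MSE comes from.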