Study Notes
Chapter 6 Lab 1: Subset Selection Methods
1. Best Subset Selection
## [1] "AtBat" "Hits" "HmRun" "Runs" "RBI"
## [6] "Walks" "Years" "CAtBat" "CHits" "CHmRun"
## [11] "CRuns" "CRBI" "CWalks" "League" "Division"
## [16] "PutOuts" "Assists" "Errors" "Salary" "NewLeague"
## [1] 322 20
## [1] 59
## [1] 263 20
## [1] 0
library(leaps)
#by default regsubsets() reports up to the best eight-variable model
regfit.full=regsubsets(Salary~.,Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " " " "*" "*" " " " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " " " "
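In the selection table above, an asterisk marks a variable's inclusion in the best model of a given size: for example, the best two-variable model contains Hits and CRBI, the three-variable model adds PutOuts, and the four-variable model adds DivisionW.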
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
#R-squared increases monotonically with model size (first six of 19 values shown)
reg.summary$rsq
## [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab = "Number of Variables",ylab = "RSS",type = "l")
plot(reg.summary$adjr2,xlab = "Number of Variables",ylab = "Adjusted RSq",type = "l")
which.max(reg.summary$adjr2)
## [1] 11
points(11,reg.summary$adjr2[11],col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab = "Number of Variables",ylab = "Cp",type = "l")
which.min(reg.summary$cp)
## [1] 10
points(10,reg.summary$cp[10],col="red",cex=2,pch=20)
plot(reg.summary$bic,xlab = "Number of Variables",ylab = "BIC",type = "l")
which.min(reg.summary$bic)
## [1] 6
#the six-variable model has the lowest BIC; examine its coefficient estimates
coef(regfit.full,6)
## (Intercept) AtBat Hits Walks CRBI
## 91.5117981 -1.8685892 7.6043976 3.6976468 0.6430169
## DivisionW PutOuts
## -122.9515338 0.2643076
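leaps also provides a built-in plot() method for regsubsets objects that shows, for a chosen criterion, which variables enter the best model of each size; it is not shown in these notes, but would look like:
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp")
plot(regfit.full,scale="bic")   #top row: the model with the lowest BIC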
2. Forward and Backward Stepwise Selection
#forward stepwise starts from the null model and adds one variable at a time
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
#summary(regfit.fwd)
#backward stepwise starts from the full model and removes one variable at a time
regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="backward")
#summary(regfit.bwd)
coef(regfit.full,7)
## (Intercept) Hits Walks CAtBat CHits
## 79.4509472 1.2833513 3.2274264 -0.3752350 1.4957073
## CHmRun DivisionW PutOuts
## 1.4420538 -129.9866432 0.2366813
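For this data the best seven-variable models found by forward and backward stepwise selection differ from the best-subset model above and from each other; they can be inspected the same way:
coef(regfit.fwd,7)
coef(regfit.bwd,7)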
3. Choosing Among Models
set.seed(1)
train=sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test=(!train)
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)
#build the test-set model (X) matrix
test.mat=model.matrix(Salary~.,data=Hitters[test,])
val.errors=rep(NA,19)
for(i in 1:19){
coefi=coef(regfit.best,id=i)
pred=test.mat[,names(coefi)]%*%coefi
#Test Set MSE
val.errors[i]=mean((Hitters$Salary[test]-pred)^2)
}
val.errors
## [1] 164377.3 144405.5 152175.7 145198.4 137902.1 139175.7 126849.0
## [8] 136191.4 132889.6 135434.9 136963.3 140694.9 140690.9 141951.2
## [15] 141508.2 142164.4 141767.4 142339.6 142238.2
which.min(val.errors)
## [1] 7
#the output below lists ten variables, so it presumably comes from coef(regfit.best,10)
coef(regfit.best,10)
## (Intercept) AtBat Hits HmRun Walks
## 71.8074075 -1.5038124 5.9130470 -11.5241809 8.4349759
## CAtBat CRuns CRBI CWalks DivisionW
## -0.1654850 1.7064330 0.7903694 -0.9107515 -109.5616997
## PutOuts
## 0.2426078
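leaps supplies no predict() method for regsubsets objects, so the notes define one below: it extracts the formula stored in the fitted object's call, rebuilds the model matrix for the new data, and multiplies the matching columns by the coefficients of the requested model size.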
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
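Because the function is named predict.regsubsets, S3 dispatch lets plain predict() work on regsubsets objects; the cross-validation loop below relies on this when it calls predict(best.fit, ...).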
#finally, run best subset selection on the full data set; the best ten-variable
#model here need not match the one found on the training data alone
regfit.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit.best,10)
## (Intercept) AtBat Hits Walks CAtBat
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798
## CRuns CRBI CWalks DivisionW PutOuts
## 1.4082490 0.7743122 -0.8308264 -112.3800575 0.2973726
## Assists
## 0.2831680
k=10
set.seed(1)
#randomly assign each observation to one of the k folds
folds=sample(1:k,nrow(Hitters),replace=TRUE)
#create a k x 19 matrix of NAs (one row per fold, one column per subset size 1-19),
#with row names NULL and column names "1" through "19"
cv.errors=matrix(NA,k,19, dimnames=list(NULL, paste(1:19)))
for(j in 1:k){
best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
for(i in 1:19){
pred=predict(best.fit,Hitters[folds==j,],id=i)
cv.errors[j,i]=mean( (Hitters$Salary[folds==j]-pred)^2)
}
}
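Note that best subset selection is rerun inside the loop on the k-1 training folds, so the variable selection itself is cross-validated; selecting variables once on the full data and then cross-validating only the final model would bias the error estimates downward.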
#average the columns
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 149821.1 130922.0 139127.0 131028.8 131050.2 119538.6 124286.1 113580.0
## 9 10 11 12 13 14 15 16
## 115556.5 112216.7 113251.2 115755.9 117820.8 119481.2 120121.6 120074.3
## 17 18 19
## 120084.8 120085.8 120403.5
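The code producing the coefficients below is not shown; since the output lists eleven variables, following the ISLR lab it is presumably a full-data refit at the CV-chosen model size:
par(mfrow=c(1,1))
plot(mean.cv.errors,type="b")  #CV error bottoms out around 10-11 variables
reg.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(reg.best,11)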
## (Intercept) AtBat Hits Walks CAtBat
## 135.7512195 -2.1277482 6.9236994 5.6202755 -0.1389914
## CRuns CRBI CWalks LeagueN DivisionW
## 1.4553310 0.7852528 -0.8228559 43.1116152 -111.1460252
## PutOuts Assists
## 0.2894087 0.2688277
Chapter 6 Lab 2: Ridge Regression and the Lasso
#model.matrix() converts categorical variables into dummy variables
x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary
# Ridge Regression(alpha parameter=0)
library(glmnet)
#lambda grid: 10^10 down to 10^-2, spanning the null model through (near) least
#squares; glmnet() standardizes the variables by default
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
#coefficient matrix: 20 rows (19 predictors + intercept) x 100 columns (one per lambda)
dim(coef(ridge.mod))
## [1] 20 100
#inspect the fit at a large lambda (code for this output reconstructed from the ISLR lab)
ridge.mod$lambda[50]
## [1] 11497.57
coef(ridge.mod)[,50]  #coefficients are heavily shrunken toward zero
## (Intercept) AtBat Hits HmRun Runs
## 407.356050200 0.036957182 0.138180344 0.524629976 0.230701523
## RBI Walks Years CAtBat CHits
## 0.239841459 0.289618741 1.107702929 0.003131815 0.011653637
## CHmRun CRuns CRBI CWalks LeagueN
## 0.087545670 0.023379882 0.024138320 0.025015421 0.085028114
## DivisionW PutOuts Assists Errors NewLeagueN
## -6.215440973 0.016482577 0.002612988 -0.020502690 0.301433531
sqrt(sum(coef(ridge.mod)[-1,50]^2))  #small l2 norm at large lambda
## [1] 6.360612
#a smaller lambda yields larger coefficients
ridge.mod$lambda[60]
## [1] 705.4802
coef(ridge.mod)[,60]
## (Intercept) AtBat Hits HmRun Runs
## 54.32519950 0.11211115 0.65622409 1.17980910 0.93769713
## RBI Walks Years CAtBat CHits
## 0.84718546 1.31987948 2.59640425 0.01083413 0.04674557
## CHmRun CRuns CRBI CWalks LeagueN
## 0.33777318 0.09355528 0.09780402 0.07189612 13.68370191
## DivisionW PutOuts Assists Errors NewLeagueN
## -54.65877750 0.11852289 0.01606037 -0.70358655 8.61181213
sqrt(sum(coef(ridge.mod)[-1,60]^2))  #much larger l2 norm
## [1] 57.11001
#predict() can also compute ridge coefficients at a lambda not in the grid, e.g. s=50
predict(ridge.mod,s=50,type="coefficients")[1:20,]
## (Intercept) AtBat Hits HmRun Runs
## 4.876610e+01 -3.580999e-01 1.969359e+00 -1.278248e+00 1.145892e+00
## RBI Walks Years CAtBat CHits
## 8.038292e-01 2.716186e+00 -6.218319e+00 5.447837e-03 1.064895e-01
## CHmRun CRuns CRBI CWalks LeagueN
## 6.244860e-01 2.214985e-01 2.186914e-01 -1.500245e-01 4.592589e+01
## DivisionW PutOuts Assists Errors NewLeagueN
## -1.182011e+02 2.502322e-01 1.215665e-01 -3.278600e+00 -9.496680e+00
set.seed(1)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
#fit the model on the training set only
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
ridge.pred=predict(ridge.mod,s=4,newx=x[test,])
mean((ridge.pred-y.test)^2)  #test set MSE at lambda=4
## [1] 142199.2
#baseline: the null model predicts mean(y[train]) for every test observation
mean((mean(y[train])-y.test)^2)
## [1] 224669.9
#a very large lambda (10^10) shrinks all coefficients to essentially zero, matching the null model
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 224669.8
#lambda=0 gives unpenalized least squares; exact=T refits at s=0 instead of
#interpolating the lambda grid (newer glmnet versions require x and y to be resupplied)
ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T,x=x[train,],y=y[train])
mean((ridge.pred-y.test)^2)
## [1] 168588.6
#compare with an unpenalized least squares fit on the training set
lm(y~x,subset=train)
##
## Call:
## lm(formula = y ~ x, subset = train)
##
## Coefficients:
## (Intercept) xAtBat xHits xHmRun xRuns
## 274.0145 -0.3521 -1.6377 5.8145 1.5424
## xRBI xWalks xYears xCAtBat xCHits
## 1.1243 3.7287 -16.3773 -0.6412 3.1632
## xCHmRun xCRuns xCRBI xCWalks xLeagueN
## 3.4008 -0.9739 -0.6005 0.3379 119.1486
## xDivisionW xPutOuts xAssists xErrors xNewLeagueN
## -144.0831 0.1976 0.6804 -4.7128 -71.0951
#the glmnet fit at s=0 reproduces the lm() coefficients up to rounding
predict(ridge.mod,s=0,exact=T,type="coefficients",x=x[train,],y=y[train])[1:20,]
## (Intercept) AtBat Hits HmRun Runs
## 274.0200994 -0.3521900 -1.6371383 5.8146692 1.5423361
## RBI Walks Years CAtBat CHits
## 1.1241837 3.7288406 -16.3795195 -0.6411235 3.1629444
## CHmRun CRuns CRBI CWalks LeagueN
## 3.4005281 -0.9739405 -0.6003976 0.3378422 119.1434637
## DivisionW PutOuts Assists Errors NewLeagueN
## -144.0853061 0.1976300 0.6804200 -4.7127879 -71.0898914
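In general, when an unpenalized least squares fit is what is wanted, lm() is the better tool: it reports useful quantities such as standard errors and p-values that glmnet does not.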
#use cross-validation to choose the tuning parameter lambda (cv.glmnet performs 10-fold CV by default)
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam  #the lambda with the smallest CV error
## [1] 326.0828
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)  #test MSE at the CV-chosen lambda
## [1] 139856.6
#refit on the full data set and examine the coefficients at the chosen lambda;
#none are exactly zero -- ridge regression does not perform variable selection
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:20,]
## (Intercept) AtBat Hits HmRun Runs
## 15.44383135 0.07715547 0.85911581 0.60103107 1.06369007
## RBI Walks Years CAtBat CHits
## 0.87936105 1.62444616 1.35254780 0.01134999 0.05746654
## CHmRun CRuns CRBI CWalks LeagueN
## 0.40680157 0.11456224 0.12116504 0.05299202 22.09143189
## DivisionW PutOuts Assists Errors NewLeagueN
## -79.04032637 0.16619903 0.02941950 -1.36092945 9.12487767
The Lasso
#fit the lasso (alpha=1) on the training set
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
#cross-validation to find the best lambda
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)  #test set MSE
## [1] 143673.6
#refit on the full data set and extract the coefficients at the chosen lambda
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
## (Intercept) AtBat Hits HmRun Runs
## 1.27479059 -0.05497143 2.18034583 0.00000000 0.00000000
## RBI Walks Years CAtBat CHits
## 0.00000000 2.29192406 -0.33806109 0.00000000 0.00000000
## CHmRun CRuns CRBI CWalks LeagueN
## 0.02825013 0.21628385 0.41712537 0.00000000 20.28615023
## DivisionW PutOuts Assists Errors NewLeagueN
## -116.16755870 0.23752385 0.00000000 -0.85629148 0.00000000
#the lasso sets several coefficients exactly to zero; show only the nonzero ones
lasso.coef[lasso.coef!=0]
## (Intercept) AtBat Hits Walks Years
## 1.27479059 -0.05497143 2.18034583 2.29192406 -0.33806109
## CHmRun CRuns CRBI LeagueN DivisionW
## 0.02825013 0.21628385 0.41712537 20.28615023 -116.16755870
## PutOuts Errors
## 0.23752385 -0.85629148
Chapter 6 Lab 3: PCR and PLS Regression
Principal Components Regression
library(pls)
set.seed(2)
pcr.fit=pcr(Salary~., data=Hitters,scale=TRUE,validation="CV")
summary(pcr.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 452 351.9 353.2 355.0 352.8 348.4 343.6
## adjCV 452 351.6 352.7 354.4 352.1 347.6 342.7
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 345.5 347.7 349.6 351.4 352.1 353.5 358.2
## adjCV 344.7 346.7 348.5 350.1 350.7 352.0 356.5
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 349.7 349.4 339.9 341.6 339.2 339.6
## adjCV 348.0 347.7 338.2 339.7 337.2 337.6
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26
## Salary 40.63 41.58 42.17 43.22 44.90 46.48 46.69
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 94.96 96.28 97.26 97.98 98.65 99.15 99.47
## Salary 46.75 46.86 47.76 47.82 47.85 48.10 50.40
## 15 comps 16 comps 17 comps 18 comps 19 comps
## X 99.75 99.89 99.97 99.99 100.00
## Salary 50.55 53.01 53.85 54.61 54.61
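pcr() reports the cross-validated root mean squared prediction error (RMSEP); squaring puts it on the MSE scale used elsewhere in the lab (e.g. 352.8^2 = 124,468). Here the CV error is smallest around M = 16-18 components, but it is barely better than at much smaller M, so a more parsimonious model is attractive.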
set.seed(1)
pcr.fit=pcr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
#pcr.fit %>% summary()
#with M=1, the first component accounts for 39.32% of the variance
validationplot(pcr.fit,val.type="MSEP")
#the lowest CV error on the training data occurs around M=7; compute the test MSE
#there (code for the output below reconstructed from the ISLR lab)
pcr.pred=predict(pcr.fit,x[test,],ncomp=7)
mean((pcr.pred-y.test)^2)
## [1] 140751.3
#refit PCR on the full data set with M=7
pcr.fit=pcr(y~x,scale=TRUE,ncomp=7)
summary(pcr.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26
## y 40.63 41.58 42.17 43.22 44.90 46.48 46.69
Partial Least Squares
set.seed(1)
pls.fit=plsr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
summary(pls.fit)
## Data: X dimension: 131 19
## Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 428.3 325.5 329.9 328.8 339.0 338.9 340.1
## adjCV 428.3 325.0 328.2 327.2 336.6 336.1 336.6
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 339.0 347.1 346.4 343.4 341.5 345.4 356.4
## adjCV 336.2 343.4 342.8 340.2 338.3 341.8 351.1
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 348.4 349.1 350.0 344.2 344.5 345.0
## adjCV 344.2 345.0 345.9 340.4 340.6 341.1
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 39.13 48.80 60.09 75.07 78.58 81.12 88.21
## Salary 46.36 50.72 52.23 53.03 54.07 54.77 55.05
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 90.71 93.17 96.05 97.08 97.61 97.97 98.70
## Salary 55.66 55.95 56.12 56.47 56.68 57.37 57.76
## 15 comps 16 comps 17 comps 18 comps 19 comps
## X 99.12 99.61 99.70 99.95 100.00
## Salary 58.08 58.17 58.49 58.56 58.62
#the lab uses M=2 (note the CV table above actually bottoms out at M=1 in this run);
#compute the test MSE at M=2 (code reconstructed from the ISLR lab)
pls.pred=predict(pls.fit,x[test,],ncomp=2)
mean((pls.pred-y.test)^2)
## [1] 145367.7
#refit PLS on the full data set using M=2
pls.fit=plsr(Salary~.,data=Hitters,scale=TRUE,ncomp=2)
summary(pls.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: kernelpls
## Number of components considered: 2
## TRAINING: % variance explained
## 1 comps 2 comps
## X 38.08 51.03
## Salary 43.05 46.40
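The two-component PLS fit explains 46.40% of the variance in Salary, nearly as much as the seven-component PCR fit above (46.69%). This is because PCR only seeks components that maximize the variance explained in the predictors, whereas PLS searches for directions that explain variance in both the predictors and the response.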