Study Notes
Chapter 6 Lab 1: Subset Selection Methods
1. Best Subset Selection
## [1] "AtBat" "Hits" "HmRun" "Runs" "RBI"
## [6] "Walks" "Years" "CAtBat" "CHits" "CHmRun"
## [11] "CRuns" "CRBI" "CWalks" "League" "Division"
## [16] "PutOuts" "Assists" "Errors" "Salary" "NewLeague"
## [1] 322 20
## [1] 59
## [1] 263 20
## [1] 0
library(leaps)
#by default regsubsets() reports up to the best eight-variable model
regfit.full=regsubsets(Salary~.,Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " " " "*" "*" " " " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " " " "
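In the selection table above, an asterisk marks a variable's inclusion in the best model of a given size: for example, the best two-variable model contains Hits and CRBI, the three-variable model adds PutOuts, and the four-variable model adds DivisionW.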
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
#R-squared increases monotonically with model size (first six of 19 values shown)
reg.summary$rsq
## [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab = "Number of Variables",ylab = "RSS",type = "l")
plot(reg.summary$adjr2,xlab = "Number of Variables",ylab = "Adjusted RSq",type = "l")
which.max(reg.summary$adjr2)
## [1] 11
points(11,reg.summary$adjr2[11],col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab = "Number of Variables",ylab = "Cp",type = "l")
which.min(reg.summary$cp)
## [1] 10
points(10,reg.summary$cp[10],col="red",cex=2,pch=20)
plot(reg.summary$bic,xlab = "Number of Variables",ylab = "BIC",type = "l")
which.min(reg.summary$bic)
## [1] 6
#the six-variable model has the lowest BIC; examine its coefficient estimates
coef(regfit.full,6)
## (Intercept) AtBat Hits Walks CRBI
## 91.5117981 -1.8685892 7.6043976 3.6976468 0.6430169
## DivisionW PutOuts
## -122.9515338 0.2643076
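leaps also provides a built-in plot() method for regsubsets objects that shows, for a chosen criterion, which variables enter the best model of each size; it is not shown in these notes, but would look like:
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp")
plot(regfit.full,scale="bic")   #top row: the model with the lowest BIC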
2. Forward and Backward Stepwise Selection
#forward stepwise starts from the null model and adds one variable at a time
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
#summary(regfit.fwd)
#backward stepwise starts from the full model and removes one variable at a time
regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="backward")
#summary(regfit.bwd)
coef(regfit.full,7)
## (Intercept) Hits Walks CAtBat CHits
## 79.4509472 1.2833513 3.2274264 -0.3752350 1.4957073
## CHmRun DivisionW PutOuts
## 1.4420538 -129.9866432 0.2366813
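For this data the best seven-variable models found by forward and backward stepwise selection differ from the best-subset model above and from each other; they can be inspected the same way:
coef(regfit.fwd,7)
coef(regfit.bwd,7)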
3. Choosing Among Models
set.seed(1)
train=sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test=(!train)
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)
#build the test-set model (X) matrix
test.mat=model.matrix(Salary~.,data=Hitters[test,])
val.errors=rep(NA,19)
for(i in 1:19){
coefi=coef(regfit.best,id=i)
pred=test.mat[,names(coefi)]%*%coefi
#Test Set MSE
val.errors[i]=mean((Hitters$Salary[test]-pred)^2)
}
val.errors
## [1] 164377.3 144405.5 152175.7 145198.4 137902.1 139175.7 126849.0
## [8] 136191.4 132889.6 135434.9 136963.3 140694.9 140690.9 141951.2
## [15] 141508.2 142164.4 141767.4 142339.6 142238.2
which.min(val.errors)
## [1] 7
#the output below lists ten variables, so it presumably comes from coef(regfit.best,10)
coef(regfit.best,10)
## (Intercept) AtBat Hits HmRun Walks
## 71.8074075 -1.5038124 5.9130470 -11.5241809 8.4349759
## CAtBat CRuns CRBI CWalks DivisionW
## -0.1654850 1.7064330 0.7903694 -0.9107515 -109.5616997
## PutOuts
## 0.2426078
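leaps supplies no predict() method for regsubsets objects, so the notes define one below: it extracts the formula stored in the fitted object's call, rebuilds the model matrix for the new data, and multiplies the matching columns by the coefficients of the requested model size.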
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
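Because the function is named predict.regsubsets, S3 dispatch lets plain predict() work on regsubsets objects; the cross-validation loop below relies on this when it calls predict(best.fit, ...).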
#finally, run best subset selection on the full data set; the best ten-variable
#model here need not match the one found on the training data alone
regfit.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit.best,10)
## (Intercept) AtBat Hits Walks CAtBat
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798
## CRuns CRBI CWalks DivisionW PutOuts
## 1.4082490 0.7743122 -0.8308264 -112.3800575 0.2973726
## Assists
## 0.2831680
k=10
set.seed(1)
#randomly assign each observation to one of the k folds
folds=sample(1:k,nrow(Hitters),replace=TRUE)
#create a k x 19 matrix of NAs (one row per fold, one column per subset size 1-19),
#with row names NULL and column names "1" through "19"
cv.errors=matrix(NA,k,19, dimnames=list(NULL, paste(1:19)))
for(j in 1:k){
best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
for(i in 1:19){
pred=predict(best.fit,Hitters[folds==j,],id=i)
cv.errors[j,i]=mean( (Hitters$Salary[folds==j]-pred)^2)
}
}
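Note that best subset selection is rerun inside the loop on the k-1 training folds, so the variable selection itself is cross-validated; selecting variables once on the full data and then cross-validating only the final model would bias the error estimates downward.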
#average the columns
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 149821.1 130922.0 139127.0 131028.8 131050.2 119538.6 124286.1 113580.0
## 9 10 11 12 13 14 15 16
## 115556.5 112216.7 113251.2 115755.9 117820.8 119481.2 120121.6 120074.3
## 17 18 19
## 120084.8 120085.8 120403.5
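The code producing the coefficients below is not shown; since the output lists eleven variables, following the ISLR lab it is presumably a full-data refit at the CV-chosen model size:
par(mfrow=c(1,1))
plot(mean.cv.errors,type="b")  #CV error bottoms out around 10-11 variables
reg.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(reg.best,11)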
## (Intercept) AtBat Hits Walks CAtBat
## 135.7512195 -2.1277482 6.9236994 5.6202755 -0.1389914
## CRuns CRBI CWalks LeagueN DivisionW
## 1.4553310 0.7852528 -0.8228559 43.1116152 -111.1460252
## PutOuts Assists
## 0.2894087 0.2688277
Chapter 6 Lab 2: Ridge Regression and the Lasso
#model.matrix() converts categorical variables into dummy variables
x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary
# Ridge Regression(alpha parameter=0)
library(glmnet)
#lambda grid: 10^10 down to 10^-2, spanning the null model through (near) least
#squares; glmnet() standardizes the variables by default
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
#coefficient matrix: 20 rows (19 predictors + intercept) x 100 columns (one per lambda)
dim(coef(ridge.mod))
## [1] 20 100
#inspect the fit at a large lambda (code for this output reconstructed from the ISLR lab)
ridge.mod$lambda[50]
## [1] 11497.57
coef(ridge.mod)[,50]  #coefficients are heavily shrunken toward zero
## (Intercept) AtBat Hits HmRun Runs
## 407.356050200 0.036957182 0.138180344 0.524629976 0.230701523
## RBI Walks Years CAtBat CHits
## 0.239841459 0.289618741 1.107702929 0.003131815 0.011653637
## CHmRun CRuns CRBI CWalks LeagueN
## 0.087545670 0.023379882 0.024138320 0.025015421 0.085028114
## DivisionW PutOuts Assists Errors NewLeagueN
## -6.215440973 0.016482577 0.002612988 -0.020502690 0.301433531
sqrt(sum(coef(ridge.mod)[-1,50]^2))  #small l2 norm at large lambda
## [1] 6.360612
#a smaller lambda yields larger coefficients
ridge.mod$lambda[60]
## [1] 705.4802
coef(ridge.mod)[,60]
## (Intercept) AtBat Hits HmRun Runs
## 54.32519950 0.11211115 0.65622409 1.17980910 0.93769713
## RBI Walks Years CAtBat CHits
## 0.84718546 1.31987948 2.59640425 0.01083413 0.04674557
## CHmRun CRuns CRBI CWalks LeagueN
## 0.33777318 0.09355528 0.09780402 0.07189612 13.68370191
## DivisionW PutOuts Assists Errors NewLeagueN
## -54.65877750 0.11852289 0.01606037 -0.70358655 8.61181213
sqrt(sum(coef(ridge.mod)[-1,60]^2))  #much larger l2 norm
## [1] 57.11001
#predict() can also compute ridge coefficients at a lambda not in the grid, e.g. s=50
predict(ridge.mod,s=50,type="coefficients")[1:20,]
## (Intercept) AtBat Hits HmRun Runs
## 4.876610e+01 -3.580999e-01 1.969359e+00 -1.278248e+00 1.145892e+00
## RBI Walks Years CAtBat CHits
## 8.038292e-01 2.716186e+00 -6.218319e+00 5.447837e-03 1.064895e-01
## CHmRun CRuns CRBI CWalks LeagueN
## 6.244860e-01 2.214985e-01 2.186914e-01 -1.500245e-01 4.592589e+01
## DivisionW PutOuts Assists Errors NewLeagueN
## -1.182011e+02 2.502322e-01 1.215665e-01 -3.278600e+00 -9.496680e+00
set.seed(1)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
#fit the model on the training set only
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
ridge.pred=predict(ridge.mod,s=4,newx=x[test,])
mean((ridge.pred-y.test)^2)  #test set MSE at lambda=4
## [1] 142199.2
#baseline: the null model predicts mean(y[train]) for every test observation
mean((mean(y[train])-y.test)^2)
## [1] 224669.9
#a very large lambda (10^10) shrinks all coefficients to essentially zero, matching the null model
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 224669.8
#lambda=0 gives unpenalized least squares; exact=T refits at s=0 instead of
#interpolating the lambda grid (newer glmnet versions require x and y to be resupplied)
ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T,x=x[train,],y=y[train])
mean((ridge.pred-y.test)^2)
## [1] 168588.6
#compare with an unpenalized least squares fit on the training set
lm(y~x,subset=train)
##
## Call:
## lm(formula = y ~ x, subset = train)
##
## Coefficients:
## (Intercept) xAtBat xHits xHmRun xRuns
## 274.0145 -0.3521 -1.6377 5.8145 1.5424
## xRBI xWalks xYears xCAtBat xCHits
## 1.1243 3.7287 -16.3773 -0.6412 3.1632
## xCHmRun xCRuns xCRBI xCWalks xLeagueN
## 3.4008 -0.9739 -0.6005 0.3379 119.1486
## xDivisionW xPutOuts xAssists xErrors xNewLeagueN
## -144.0831 0.1976 0.6804 -4.7128 -71.0951
#the glmnet fit at s=0 reproduces the lm() coefficients up to rounding
predict(ridge.mod,s=0,exact=T,type="coefficients",x=x[train,],y=y[train])[1:20,]
## (Intercept) AtBat Hits HmRun Runs
## 274.0200994 -0.3521900 -1.6371383 5.8146692 1.5423361
## RBI Walks Years CAtBat CHits
## 1.1241837 3.7288406 -16.3795195 -0.6411235 3.1629444
## CHmRun CRuns CRBI CWalks LeagueN
## 3.4005281 -0.9739405 -0.6003976 0.3378422 119.1434637
## DivisionW PutOuts Assists Errors NewLeagueN
## -144.0853061 0.1976300 0.6804200 -4.7127879 -71.0898914
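In general, when an unpenalized least squares fit is what is wanted, lm() is the better tool: it reports useful quantities such as standard errors and p-values that glmnet does not.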
#use cross-validation to choose the tuning parameter lambda (cv.glmnet performs 10-fold CV by default)
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam  #the lambda with the smallest CV error
## [1] 326.0828
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)  #test MSE at the CV-chosen lambda
## [1] 139856.6
#refit on the full data set and examine the coefficients at the chosen lambda;
#none are exactly zero -- ridge regression does not perform variable selection
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:20,]
## (Intercept) AtBat Hits HmRun Runs
## 15.44383135 0.07715547 0.85911581 0.60103107 1.06369007
## RBI Walks Years CAtBat CHits
## 0.87936105 1.62444616 1.35254780 0.01134999 0.05746654
## CHmRun CRuns CRBI CWalks LeagueN
## 0.40680157 0.11456224 0.12116504 0.05299202 22.09143189
## DivisionW PutOuts Assists Errors NewLeagueN
## -79.04032637 0.16619903 0.02941950 -1.36092945 9.12487767
The Lasso
#fit the lasso (alpha=1) on the training set
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
#cross-validation to find the best lambda
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)  #test set MSE
## [1] 143673.6
#refit on the full data set and extract the coefficients at the chosen lambda
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
## (Intercept) AtBat Hits HmRun Runs
## 1.27479059 -0.05497143 2.18034583 0.00000000 0.00000000
## RBI Walks Years CAtBat CHits
## 0.00000000 2.29192406 -0.33806109 0.00000000 0.00000000
## CHmRun CRuns CRBI CWalks LeagueN
## 0.02825013 0.21628385 0.41712537 0.00000000 20.28615023
## DivisionW PutOuts Assists Errors NewLeagueN
## -116.16755870 0.23752385 0.00000000 -0.85629148 0.00000000
#the lasso sets several coefficients exactly to zero; show only the nonzero ones
lasso.coef[lasso.coef!=0]
## (Intercept) AtBat Hits Walks Years
## 1.27479059 -0.05497143 2.18034583 2.29192406 -0.33806109
## CHmRun CRuns CRBI LeagueN DivisionW
## 0.02825013 0.21628385 0.41712537 20.28615023 -116.16755870
## PutOuts Errors
## 0.23752385 -0.85629148
Chapter 6 Lab 3: PCR and PLS Regression
Principal Components Regression
library(pls)
set.seed(2)
pcr.fit=pcr(Salary~., data=Hitters,scale=TRUE,validation="CV")
summary(pcr.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 452 351.9 353.2 355.0 352.8 348.4 343.6
## adjCV 452 351.6 352.7 354.4 352.1 347.6 342.7
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 345.5 347.7 349.6 351.4 352.1 353.5 358.2
## adjCV 344.7 346.7 348.5 350.1 350.7 352.0 356.5
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 349.7 349.4 339.9 341.6 339.2 339.6
## adjCV 348.0 347.7 338.2 339.7 337.2 337.6
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26
## Salary 40.63 41.58 42.17 43.22 44.90 46.48 46.69
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 94.96 96.28 97.26 97.98 98.65 99.15 99.47
## Salary 46.75 46.86 47.76 47.82 47.85 48.10 50.40
## 15 comps 16 comps 17 comps 18 comps 19 comps
## X 99.75 99.89 99.97 99.99 100.00
## Salary 50.55 53.01 53.85 54.61 54.61
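pcr() reports the cross-validated root mean squared prediction error (RMSEP); squaring puts it on the MSE scale used elsewhere in the lab (e.g. 352.8^2 = 124,468). Here the CV error is smallest around M = 16-18 components, but it is barely better than at much smaller M, so a more parsimonious model is attractive.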
set.seed(1)
pcr.fit=pcr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
#pcr.fit %>% summary()
#with M=1, the first component accounts for 39.32% of the variance
validationplot(pcr.fit,val.type="MSEP")
#the lowest CV error on the training data occurs around M=7; compute the test MSE
#there (code for the output below reconstructed from the ISLR lab)
pcr.pred=predict(pcr.fit,x[test,],ncomp=7)
mean((pcr.pred-y.test)^2)
## [1] 140751.3
#refit PCR on the full data set with M=7
pcr.fit=pcr(y~x,scale=TRUE,ncomp=7)
summary(pcr.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26
## y 40.63 41.58 42.17 43.22 44.90 46.48 46.69
Partial Least Squares
set.seed(1)
pls.fit=plsr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
summary(pls.fit)
## Data: X dimension: 131 19
## Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 428.3 325.5 329.9 328.8 339.0 338.9 340.1
## adjCV 428.3 325.0 328.2 327.2 336.6 336.1 336.6
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 339.0 347.1 346.4 343.4 341.5 345.4 356.4
## adjCV 336.2 343.4 342.8 340.2 338.3 341.8 351.1
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 348.4 349.1 350.0 344.2 344.5 345.0
## adjCV 344.2 345.0 345.9 340.4 340.6 341.1
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 39.13 48.80 60.09 75.07 78.58 81.12 88.21
## Salary 46.36 50.72 52.23 53.03 54.07 54.77 55.05
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 90.71 93.17 96.05 97.08 97.61 97.97 98.70
## Salary 55.66 55.95 56.12 56.47 56.68 57.37 57.76
## 15 comps 16 comps 17 comps 18 comps 19 comps
## X 99.12 99.61 99.70 99.95 100.00
## Salary 58.08 58.17 58.49 58.56 58.62
#the lab uses M=2 (note the CV table above actually bottoms out at M=1 in this run);
#compute the test MSE at M=2 (code reconstructed from the ISLR lab)
pls.pred=predict(pls.fit,x[test,],ncomp=2)
mean((pls.pred-y.test)^2)
## [1] 145367.7
#refit PLS on the full data set using M=2
pls.fit=plsr(Salary~.,data=Hitters,scale=TRUE,ncomp=2)
summary(pls.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: kernelpls
## Number of components considered: 2
## TRAINING: % variance explained
## 1 comps 2 comps
## X 38.08 51.03
## Salary 43.05 46.40
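The two-component PLS fit explains 46.40% of the variance in Salary, nearly as much as the seven-component PCR fit above (46.69%). This is because PCR only seeks components that maximize the variance explained in the predictors, whereas PLS searches for directions that explain variance in both the predictors and the response.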