Tools for Big Data Analysis

Throughout this lab we will use the Hitters dataset. This dataset is from the 1986 and 1987 seasons of the Major League Baseball (MLR).

library(ISLR)
#names(Hitters)

Hitters=na.omit(Hitters)

PART I: Shrinkage/Regularization

In class we discussed the two most popular methods used for shrinkage and regularization:

Exercise #1: Ridge Regression

Recall that ridge regression uses L2 to penalize and create different coefficient estimates. This shrinks coefficients to zero (but doesnt obtain exactly zero). We will need to use the glmnet package.

# Ridge Regression 
# needs to input a matrix
x=model.matrix(Salary~., Hitters)[,-1]
y=Hitters$Salary

#install.packages("glmnet")
library(glmnet)

# CREATE A GRID OF LAMBDAS
grid=10^seq(10, -2, length=100)

# ALPHA=0 IS RIDGE
ridge.mod=glmnet(x, y, alpha=0, lambda=grid) #default is to standardize
dim(coef(ridge.mod))
## [1]  20 100
par(mfrow=c(1,1))
plot(ridge.mod)

Grid of \(\lambda\)’s

Now lets look at results for different lambda values.

# LOOK AT THE 50TH LAMBDA
ridge.mod$lambda[50]
## [1] 11497.57
coef(ridge.mod)[,50]
##   (Intercept)         AtBat          Hits         HmRun          Runs 
## 407.356050200   0.036957182   0.138180344   0.524629976   0.230701523 
##           RBI         Walks         Years        CAtBat         CHits 
##   0.239841459   0.289618741   1.107702929   0.003131815   0.011653637 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##   0.087545670   0.023379882   0.024138320   0.025015421   0.085028114 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
##  -6.215440973   0.016482577   0.002612988  -0.020502690   0.301433531
sqrt(sum(coef(ridge.mod)[-1,50]^2))
## [1] 6.360612

OR

# LOOK AT THE 60TH LAMBDA
ridge.mod$lambda[60]
## [1] 705.4802
coef(ridge.mod)[,60]
##  (Intercept)        AtBat         Hits        HmRun         Runs 
##  54.32519950   0.11211115   0.65622409   1.17980910   0.93769713 
##          RBI        Walks        Years       CAtBat        CHits 
##   0.84718546   1.31987948   2.59640425   0.01083413   0.04674557 
##       CHmRun        CRuns         CRBI       CWalks      LeagueN 
##   0.33777318   0.09355528   0.09780402   0.07189612  13.68370191 
##    DivisionW      PutOuts      Assists       Errors   NewLeagueN 
## -54.65877750   0.11852289   0.01606037  -0.70358655   8.61181213
sqrt(sum(coef(ridge.mod)[-1,60]^2))
## [1] 57.11001

Prediction for a New \(\lambda\)

We can also do prediction for a whole new value of \(\lambda=50\)

# NEW LAMBDA=50
predict(ridge.mod, s=50, type="coefficients")[1:20,]
##   (Intercept)         AtBat          Hits         HmRun          Runs 
##  4.876610e+01 -3.580999e-01  1.969359e+00 -1.278248e+00  1.145892e+00 
##           RBI         Walks         Years        CAtBat         CHits 
##  8.038292e-01  2.716186e+00 -6.218319e+00  5.447837e-03  1.064895e-01 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##  6.244860e-01  2.214985e-01  2.186914e-01 -1.500245e-01  4.592589e+01 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
## -1.182011e+02  2.502322e-01  1.215665e-01 -3.278600e+00 -9.496680e+00

Test vs Train

Realisticaly, we should partition our data into test and training sets (a validation approach). We will need to first set the seed, so that we can have consistant results.

set.seed(1)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]

ridge.mod=glmnet(x[train,], y[train], alpha=0, lambda=grid,
                 thresh = 1e-12)

# LOOK LAMBDA = 4
ridge.pred4=predict(ridge.mod, s=4, newx=x[test,])
mean((ridge.pred4-y.test)^2)
## [1] 142199.2

Large Lambda

What happens when lambda goes to infinity? Looking at the equation from the Lecture 13B slides this heavily penalizes the sum of the squared coefficients, resulting in all the coefficients shrinking toward zero. So the model should be close to the null model.

#recall lambda=infinity
mean((mean(y[train])-y.test)^2)
## [1] 224669.9
ridge.predInfty=predict(ridge.mod, s=1e10, newx=x[test,])
mean((ridge.predInfty-y.test)^2)
## [1] 224669.8

Recall that LASSO with \(\lambda=0\) is the same as least squares regresion. These are very very close!

ridge.pred<-predict(ridge.mod, s=0, newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 167789.8
lm(y~x, subset = train)
## 
## Call:
## lm(formula = y ~ x, subset = train)
## 
## Coefficients:
## (Intercept)       xAtBat        xHits       xHmRun        xRuns  
##    274.0145      -0.3521      -1.6377       5.8145       1.5424  
##        xRBI       xWalks       xYears      xCAtBat       xCHits  
##      1.1243       3.7287     -16.3773      -0.6412       3.1632  
##     xCHmRun       xCRuns        xCRBI      xCWalks     xLeagueN  
##      3.4008      -0.9739      -0.6005       0.3379     119.1486  
##  xDivisionW     xPutOuts     xAssists      xErrors  xNewLeagueN  
##   -144.0831       0.1976       0.6804      -4.7128     -71.0951
predict(ridge.mod, s=0, type="coefficients")[1:20,]
##  (Intercept)        AtBat         Hits        HmRun         Runs 
##  274.2089049   -0.3699455   -1.5370022    5.9129307    1.4811980 
##          RBI        Walks        Years       CAtBat        CHits 
##    1.0772844    3.7577989  -16.5600387   -0.6313336    3.1115575 
##       CHmRun        CRuns         CRBI       CWalks      LeagueN 
##    3.3297885   -0.9496641   -0.5694414    0.3300136  118.4000592 
##    DivisionW      PutOuts      Assists       Errors   NewLeagueN 
## -144.2867510    0.1971770    0.6775088   -4.6833775  -70.1616132

Cross-Validation to Choose \(\lambda\)

set.seed(1)
cv.out<-cv.glmnet(x[train, ], y[train], alpha=0)

plot(cv.out)

bestlam<-cv.out$lambda.min
bestlam
## [1] 326.0828
ridge.pred<-predict(ridge.mod, s=bestlam, newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 139856.6

Now fit on full data with the best lambda

out<-glmnet(x,y, alpha=0)
predict(out, type="coefficients", s=bestlam)[1:20, ]
##  (Intercept)        AtBat         Hits        HmRun         Runs 
##  15.44383135   0.07715547   0.85911581   0.60103107   1.06369007 
##          RBI        Walks        Years       CAtBat        CHits 
##   0.87936105   1.62444616   1.35254780   0.01134999   0.05746654 
##       CHmRun        CRuns         CRBI       CWalks      LeagueN 
##   0.40680157   0.11456224   0.12116504   0.05299202  22.09143189 
##    DivisionW      PutOuts      Assists       Errors   NewLeagueN 
## -79.04032637   0.16619903   0.02941950  -1.36092945   9.12487767

Exercise #2: LASSO

LASSO uses L1 to penalize and create different coefficient estimates. One of the assumptions of this method is that some of the coefficients are exactly zero. This does effectively perform model select.

# ALPHA=1 is LASSO
lasso.mod=glmnet(x[train,], y[train], alpha=1, lambda=grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to
## unique 'x' values

Cross-Validation

set.seed(1)
cv.out<-cv.glmnet(x[train, ], y[train], alpha=1)
plot(cv.out)

bestlam<-cv.out$lambda.min
bestlam
## [1] 9.286955
lasso.pred<-predict(lasso.mod, s=bestlam, newx=x[test,])
mean((lasso.pred-y.test)^2)
## [1] 143673.6

LASSO for Variable Selection

out<-glmnet(x, y, alpha=1, lambda=grid)
lasso.coef<-predict(out, type="coefficients", s=bestlam)[1:20,]
lasso.coef
##   (Intercept)         AtBat          Hits         HmRun          Runs 
##    1.27479059   -0.05497143    2.18034583    0.00000000    0.00000000 
##           RBI         Walks         Years        CAtBat         CHits 
##    0.00000000    2.29192406   -0.33806109    0.00000000    0.00000000 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##    0.02825013    0.21628385    0.41712537    0.00000000   20.28615023 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
## -116.16755870    0.23752385    0.00000000   -0.85629148    0.00000000

PART II: Dimension Reduction

We’ve learned about two dimension reduction techniques:

Exercise #3: PCR

The pls package does all the heavy lifting for us! It will find the principal components and perform regression with M of them. It will also do cross-validation for us!

#install.packages("pls")
library(pls)

set.seed(2)
pcr.fit=pcr(Salary~., data=Hitters, scale=TRUE, 
            validation="CV")

summary(pcr.fit)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV             452    351.9    353.2    355.0    352.8    348.4    343.6
## adjCV          452    351.6    352.7    354.4    352.1    347.6    342.7
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       345.5    347.7    349.6     351.4     352.1     353.5     358.2
## adjCV    344.7    346.7    348.5     350.1     350.7     352.0     356.5
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## CV        349.7     349.4     339.9     341.6     339.2     339.6
## adjCV     348.0     347.7     338.2     339.7     337.2     337.6
## 
## TRAINING: % variance explained
##         1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X         38.31    60.16    70.84    79.03    84.29    88.63    92.26
## Salary    40.63    41.58    42.17    43.22    44.90    46.48    46.69
##         8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X         94.96    96.28     97.26     97.98     98.65     99.15     99.47
## Salary    46.75    46.86     47.76     47.82     47.85     48.10     50.40
##         15 comps  16 comps  17 comps  18 comps  19 comps
## X          99.75     99.89     99.97     99.99    100.00
## Salary     50.55     53.01     53.85     54.61     54.61
validationplot(pcr.fit, val.type="MSEP")

Again, we should split into testing and training sets. How many principle components should we include?

set.seed(1)
pcr.fit=pcr(Salary~., data=Hitters, subset=train, scale=TRUE, 
            validation="CV")
validationplot(pcr.fit, val.type="MSEP")

It appears that M=7 is best!

pcr.pred7=predict(pcr.fit, x[test,], ncomp=7)
mean((pcr.pred7-y.test)^2)
## [1] 140751.3
pcr.fit7=pcr(y~x, scale=TRUE, ncomp=7)
summary(pcr.fit7)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 7
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X    38.31    60.16    70.84    79.03    84.29    88.63    92.26
## y    40.63    41.58    42.17    43.22    44.90    46.48    46.69

Exercise #4: Partial LS

The partial ls method is supervised because it also uses information about the response. How many of the linear combinations should we use to make our LS model?

set.seed(1)
pls.fit=plsr(Salary~., data=Hitters, subset=train, 
             scale=TRUE, validation="CV")
summary(pls.fit)
## Data:    X dimension: 131 19 
##  Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           428.3    325.5    329.9    328.8    339.0    338.9    340.1
## adjCV        428.3    325.0    328.2    327.2    336.6    336.1    336.6
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       339.0    347.1    346.4     343.4     341.5     345.4     356.4
## adjCV    336.2    343.4    342.8     340.2     338.3     341.8     351.1
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## CV        348.4     349.1     350.0     344.2     344.5     345.0
## adjCV     344.2     345.0     345.9     340.4     340.6     341.1
## 
## TRAINING: % variance explained
##         1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X         39.13    48.80    60.09    75.07    78.58    81.12    88.21
## Salary    46.36    50.72    52.23    53.03    54.07    54.77    55.05
##         8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X         90.71    93.17     96.05     97.08     97.61     97.97     98.70
## Salary    55.66    55.95     56.12     56.47     56.68     57.37     57.76
##         15 comps  16 comps  17 comps  18 comps  19 comps
## X          99.12     99.61     99.70     99.95    100.00
## Salary     58.08     58.17     58.49     58.56     58.62
validationplot(pls.fit, val.type="MSEP")

It looks like 2 is best!

pls.pred2=predict(pls.fit, x[test,], ncomp=2)
mean((pls.pred2-y.test)^2)
## [1] 145367.7
pls.fit2=plsr(Salary~., data=Hitters, scale=TRUE, ncomp=2)
summary(pls.fit2)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: kernelpls
## Number of components considered: 2
## TRAINING: % variance explained
##         1 comps  2 comps
## X         38.08    51.03
## Salary    43.05    46.40