## From Johannes Ledolter, Data Mining
# Chapter 6: Penalty-Based Variable Selection in Regression Models
# with Many Parameters (LASSO)
# One needs to simplify models, as the estimation of unneeded coefficients
# degrades forecasting performance.
# The LASSO (least absolute shrinkage and selection operator) algorithm finds
# a least squares solution under the constraint that the L1 norm of the
# estimated parameter vector is no greater than a given value; that is, the
# LASSO estimate of beta = (beta1, beta2, ..., betak)' minimizes the residual
# sum of squares subject to sum(|beta_j|) <= s.
# see page 71 of the text for explanation
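# A minimal sketch (our addition, not from the text; the helper name
# lasso.objective is ours for illustration): the constrained problem above is
# equivalent to minimizing the penalized residual sum of squares
# sum((y - X beta)^2) + lambda*sum(|beta_j|) for a suitable penalty weight
# lambda. Written as a plain base R function, the criterion looks like this.
lasso.objective <- function(beta, y, X, lambda) {
  sum((y - X %*% beta)^2) + lambda * sum(abs(beta))
}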
setwd("C:\\Users\\Luis\\Desktop\\johannes")
# 6.1 EXAMPLE 1: PROSTATE CANCER
# Let us consider an example on prostate cancer. The data, taken from Stamey et al.
# (1989), contains the results of biopsies on 97 men of various ages. This data set
# has been analyzed by Tibshirani (1996) in his introduction of the LASSO and by
# many texts on data mining. The biopsy information includes:
# Gleason score (gleason): scores are assigned to the two most common tumor
# patterns ranging from 2 to 10; in this data set, the range is from 6 to 9.
# Prostate-specific antigen (psa): laboratory results on protein production.
# Capsular penetration (cp): reach of cancer into the gland lining.
# Benign prostatic hyperplasia amount (bph): size of the prostate.
# The goal is to predict the tumor log volume (which measures the tumor's size or
# spread). We try to predict this variable from five covariates (age; logarithms of bph,
# cp, and psa; and the Gleason score). The predicted size of the tumor has important
# implications for the subsequent treatment options, which include chemotherapy,
# radiation treatment, and surgical removal of the prostate.
prostate <- read.csv("C:\\Users\\Luis\\Desktop\\johannes\\prostate.csv")
prostate[1:5,]
## lcavol age lbph lcp gleason lpsa
## 1 -0.5798185 50 -1.386294 -1.386294 6 -0.4307829
## 2 -0.9942523 58 -1.386294 -1.386294 6 -0.1625189
## 3 -0.5108256 74 -1.386294 -1.386294 7 -0.1625189
## 4 -1.2039728 58 -1.386294 -1.386294 6 -0.1625189
## 5 0.7514161 62 -1.386294 -1.386294 6 0.3715636
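# A quick check (our addition, not in the text): confirm the sample size of
# 97 men and the variables described above.
dim(prostate)
str(prostate)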
# The output of the standard regression analysis and of the LASSO estimation
# is shown below:
m1 <- lm(lcavol~.,data = prostate)
summary(m1)
##
## Call:
## lm(formula = lcavol ~ ., data = prostate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88964 -0.52719 -0.07263 0.57834 1.98728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.49371 0.94261 -1.585 0.1165
## age 0.01902 0.01063 1.789 0.0769 .
## lbph -0.08918 0.05376 -1.659 0.1006
## lcp 0.29727 0.06762 4.396 2.98e-05 ***
## gleason 0.05240 0.11965 0.438 0.6625
## lpsa 0.53955 0.07648 7.054 3.30e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7015 on 91 degrees of freedom
## Multiple R-squared: 0.6642, Adjusted R-squared: 0.6457
## F-statistic: 36 on 5 and 91 DF, p-value: < 2.2e-16
x <- model.matrix(lcavol ~ age + lbph + lcp + gleason + lpsa,
                  data = prostate)
x <- x[, -1]  # drop the intercept column; lars() takes a predictor matrix only
library(lars)
## Loaded lars 1.2
lasso <- lars(x=x,y=prostate$lcavol,trace = TRUE)
## LASSO sequence
## Computing X'X .....
## LARS Step 1 : Variable 5 added
## LARS Step 2 : Variable 3 added
## LARS Step 3 : Variable 1 added
## LARS Step 4 : Variable 4 added
## LARS Step 5 : Variable 2 added
## Computing residuals, RSS etc .....
plot(lasso)

lasso
##
## Call:
## lars(x = x, y = prostate$lcavol, trace = TRUE)
## R-squared: 0.664
## Sequence of LASSO moves:
## lpsa lcp age gleason lbph
## Var 5 3 1 4 2
## Step 1 2 3 4 5
# The graph of the LASSO estimates as a function of the shrinkage illustrates
# the order in which variables enter the model as one relaxes the constraint on the
# L1 norm of their estimates. Initially there is nothing in the model (look to the
# left of the graph, where s = 0). Moving to the right on this graph, one finds that
# the first variable to enter is variable 5 (lpsa); then variable 3 (lcp) enters; then
# variable 1 (age). Or, scanning the graph from the right-hand side to the left, we
# notice that variables 2 (lbph), 4 (gleason), and 1 (age) are "zeroed out" in that
# order. Below, we list the parameter estimates for selected values of shrinkage:
# s = 0.25, 0.50, 0.75, and 1.00. The LASSO estimates for s = 1 are the ordinary
# least squares estimates (see the prior regression output).
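# As a quick check (a sketch, our addition): the shrinkage fraction s at each
# step is the L1 norm of the coefficients at that step divided by the L1 norm
# of the full least squares fit, which coef.lars() lets us verify directly.
b <- coef.lars(lasso)                       # one row of coefficients per step
rowSums(abs(b)) / sum(abs(b[nrow(b), ]))    # runs from 0 (empty model) to 1 (OLS)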
objects(grep("lars",search()))
## [1] "backsolvet" "betabreaker" "coef.lars" "cv.folds"
## [5] "cv.lars" "delcol" "downdateR" "error.bars"
## [9] "lars" "nnls.lars" "plot.lars" "plotCVLars"
## [13] "predict.lars" "print.lars" "summary.lars" "updateR"
coef.lars(lasso)
## age lbph lcp gleason lpsa
## [1,] 0.000000000 0.00000000 0.0000000 0.00000000 0.0000000
## [2,] 0.000000000 0.00000000 0.0000000 0.00000000 0.1338576
## [3,] 0.000000000 0.00000000 0.2698036 0.00000000 0.4606733
## [4,] 0.003157103 0.00000000 0.2819937 0.00000000 0.4735472
## [5,] 0.005151903 0.00000000 0.2880676 0.01070794 0.4818743
## [6,] 0.019023772 -0.08918256 0.2972721 0.05239529 0.5395488
summary.lars(lasso)
## LARS/LASSO
## Call: lars(x = x, y = prostate$lcavol, trace = TRUE)
## Df Rss Cp
## 0 1 133.359 175.9821
## 1 2 109.970 130.4556
## 2 3 49.273 9.1207
## 3 4 48.073 8.6834
## 4 5 47.379 9.2735
## 5 6 44.784 6.0000
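# Sketch (our addition): the Cp column of the lars summary can also be used to
# pick a step programmatically; the rows in the table above are labelled
# 0, 1, ..., 5, so we shift the row position by one. Here the full model
# (step 5) has the smallest Cp.
cp <- summary(lasso)$Cp
which.min(cp) - 1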
coef(lasso,s=c(0.25,0.50,0.75,1.0),
mode="fraction")
## age lbph lcp gleason lpsa
## [1,] 0.000000000 0.000000000 0.06519506 0.00000000 0.2128290
## [2,] 0.000000000 0.000000000 0.18564339 0.00000000 0.3587292
## [3,] 0.005369985 -0.001402051 0.28821232 0.01136331 0.4827810
## [4,] 0.019023772 -0.089182565 0.29727207 0.05239529 0.5395488
cv.lars(x=x,y=prostate$lcavol,K=10)

# The output of cross-validation (average mean square errors and their associated
# standard error bounds) shows that the mean square error increases quite rapidly if
# we shrink the coefficients too aggressively. The mean square curve is smallest for
# s = 1 (the least squares solution), but is actually quite flat for all values of s larger
# than 0.6. The present example does not call for much shrinkage. This is not too
# surprising as the number of estimated coefficients (five coefficients, excluding the
# intercept) is rather small relative to the size of the sample (n = 97).
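# Sketch (our addition; assumes we rerun cv.lars and store its value, so the
# random folds may differ slightly from the plot above): the returned list
# holds the grid of fractions ($index) and the cross-validated MSEs ($cv), so
# the minimizing fraction can be found programmatically rather than read off
# the plot.
cv.out <- cv.lars(x = x, y = prostate$lcavol, K = 10, plot.it = FALSE)
cv.out$index[which.min(cv.out$cv)]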
# A similar conclusion about the required amount of shrinkage is obtained when
# selecting at random 80 of the 97 men for the training data set, and applying the
# LASSO estimates to predict the log volume of the remaining 17 men. Repeating the
# random sampling 10 times (each time leaving out 17 different randomly selected
# men) leads to the mean square prediction errors that are shown in the box plot.
# Again, one is best off staying with the least squares estimates.
MSElasso25  <- numeric(10)   # storage for the 10 out-of-sample MSEs per value of s
MSElasso50  <- numeric(10)
MSElasso75  <- numeric(10)
MSElasso100 <- numeric(10)
set.seed(1)
for (i in 1:10) {
  train <- sample(1:nrow(prostate), 80)      # 80 men for training, 17 held out
  lasso <- lars(x = x[train, ], y = prostate$lcavol[train])
  MSElasso25[i]  <- mean((predict(lasso, x[-train, ], s = 0.25,
                                  mode = "fraction")$fit -
                            prostate$lcavol[-train])^2)
  MSElasso50[i]  <- mean((predict(lasso, x[-train, ], s = 0.50,
                                  mode = "fraction")$fit -
                            prostate$lcavol[-train])^2)
  MSElasso75[i]  <- mean((predict(lasso, x[-train, ], s = 0.75,
                                  mode = "fraction")$fit -
                            prostate$lcavol[-train])^2)
  MSElasso100[i] <- mean((predict(lasso, x[-train, ], s = 1.00,
                                  mode = "fraction")$fit -
                            prostate$lcavol[-train])^2)
}
mean(MSElasso25)
## [1] 1.021938
mean(MSElasso50)
## [1] 0.6723226
mean(MSElasso75)
## [1] 0.5410033
mean(MSElasso100)
## [1] 0.5352386
boxplot(MSElasso25, MSElasso50, MSElasso75, MSElasso100,
        ylab = "MSE", sub = "Lasso Model",
        xlab = "s=0.25   s=0.50   s=0.75   s=1.00(LS)")
