## From Johannes Ledolter, Data Mining
# Penalty-Based Variable Selection in Regression Models
# with Many Parameters (LASSO) - Chapter 6
#
# One needs to simplify models, as the estimation of unneeded
# coefficients degrades forecasting performance.
# The LASSO (least absolute shrinkage and selection operator) algorithm finds
# a least squares solution under the constraint that the L1 norm of the
# estimated parameter vector is no greater than a certain given value; that is,
# the LASSO estimate of beta = (beta_1, beta_2, ..., beta_k)' minimizes the
# residual sum of squares subject to sum(|beta_j|) <= s.
# See page 71 of the text for the full explanation.
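# As a minimal illustration of the criterion (not the fitting algorithm that
# the lars package uses, and the function name is ours), the penalized form of
# the LASSO objective can be written directly; lambda is the penalty weight
# that corresponds to the constraint bound s:
lasso_objective <- function(beta, X, y, lambda) {
  # residual sum of squares plus the L1 penalty on the coefficients
  sum((y - X %*% beta)^2) + lambda * sum(abs(beta))
}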
setwd("C:\\Users\\Luis\\Desktop\\johannes")
# 6.1 EXAMPLE 1: PROSTATE CANCER
# Let us consider an example on prostate cancer. The data, taken from Stamey et al.
# (1989), contains the results of biopsies on 97 men of various ages. This data set
# has been analyzed by Tibshirani (1996) in his introduction of the LASSO and by
# many texts on data mining. The biopsy information includes:
# - Gleason score (gleason): scores are assigned to the two most common tumor
#   patterns, ranging from 2 to 10; in this data set, the range is from 6 to 9.
# - Prostate-specific antigen (psa): laboratory results on protein production.
# - Capsular penetration (cp): reach of cancer into the gland lining.
# - Benign prostatic hyperplasia amount (bph): size of the prostate.
# The goal is to predict the tumor log volume (which measures the tumor's size or
# spread). We try to predict this variable from five covariates (age; logarithms of bph,
# cp, and psa; and the Gleason score). The predicted size of the tumor has important
# implications for the subsequent treatment options, which include chemotherapy,
# radiation treatment, and surgical removal of the prostate.
prostate <- read.csv("C:\\Users\\Luis\\Desktop\\johannes\\prostate.csv")
prostate[1:5,]
##       lcavol age      lbph       lcp gleason       lpsa
## 1 -0.5798185  50 -1.386294 -1.386294       6 -0.4307829
## 2 -0.9942523  58 -1.386294 -1.386294       6 -0.1625189
## 3 -0.5108256  74 -1.386294 -1.386294       7 -0.1625189
## 4 -1.2039728  58 -1.386294 -1.386294       6 -0.1625189
## 5  0.7514161  62 -1.386294 -1.386294       6  0.3715636
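# A quick structural check of the data frame (an optional step, not in the
# text) confirms the variable types and the range of the response before
# modeling:
str(prostate)
summary(prostate$lcavol)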
# The outputs of both the standard regression analysis and the LASSO
# estimation are shown below:
m1 <- lm(lcavol~.,data = prostate)
summary(m1)
## 
## Call:
## lm(formula = lcavol ~ ., data = prostate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88964 -0.52719 -0.07263  0.57834  1.98728 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.49371    0.94261  -1.585   0.1165    
## age          0.01902    0.01063   1.789   0.0769 .  
## lbph        -0.08918    0.05376  -1.659   0.1006    
## lcp          0.29727    0.06762   4.396 2.98e-05 ***
## gleason      0.05240    0.11965   0.438   0.6625    
## lpsa         0.53955    0.07648   7.054 3.30e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7015 on 91 degrees of freedom
## Multiple R-squared:  0.6642, Adjusted R-squared:  0.6457 
## F-statistic:    36 on 5 and 91 DF,  p-value: < 2.2e-16
x <- model.matrix(lcavol ~ age + lbph + lcp + gleason + lpsa,
                  data = prostate)
x <- x[, -1]  # drop the intercept column; lars supplies its own intercept
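# A small sanity check (our addition): the design matrix should now hold the
# five covariates, without the intercept column, for all 97 men.
dim(x)        # 97 rows, 5 columns
colnames(x)   # "age" "lbph" "lcp" "gleason" "lpsa"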
library(lars)
## Loaded lars 1.2
lasso <- lars(x=x,y=prostate$lcavol,trace = TRUE)
## LASSO sequence
## Computing X'X .....
## LARS Step 1 :     Variable 5     added
## LARS Step 2 :     Variable 3     added
## LARS Step 3 :     Variable 1     added
## LARS Step 4 :     Variable 4     added
## LARS Step 5 :     Variable 2     added
## Computing residuals, RSS etc .....
plot(lasso)

lasso
## 
## Call:
## lars(x = x, y = prostate$lcavol, trace = TRUE)
## R-squared: 0.664 
## Sequence of LASSO moves:
##      lpsa lcp age gleason lbph
## Var     5   3   1       4    2
## Step    1   2   3       4    5
# The graph of the LASSO estimates as a function of the shrinkage illustrates
# the order in which variables enter the model as one relaxes the constraint on the
# L1 norm of their estimates. Initially there is nothing in the model (look to the
# left of the graph, where s = 0). Moving to the right on this graph, one finds that
# the first variable to enter is variable 5 (lpsa); then variable 3 (lcp) enters; then
# variable 1 (age). Or, scanning the graph from the right-hand side to the left, we
# notice that variables 2 (lbph), 4 (gleason), and 1 (age) are "zeroed out" in that
# order. Below, we list the parameter estimates for selected values of the
# shrinkage fraction: s = 0.25, 0.50, 0.75, and 1.00. The LASSO estimates for
# s = 1 are the ordinary least squares estimates (see the prior regression output).
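# A quick verification of that last claim (our addition): the slopes at s = 1
# should match the least squares slopes from m1 up to numerical tolerance.
all.equal(as.numeric(coef(lasso, s = 1, mode = "fraction")),
          as.numeric(coef(m1)[-1]))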
objects(grep("lars",search()))
##  [1] "backsolvet"   "betabreaker"  "coef.lars"    "cv.folds"    
##  [5] "cv.lars"      "delcol"       "downdateR"    "error.bars"  
##  [9] "lars"         "nnls.lars"    "plot.lars"    "plotCVLars"  
## [13] "predict.lars" "print.lars"   "summary.lars" "updateR"
coef.lars(lasso)
##              age        lbph       lcp    gleason      lpsa
## [1,] 0.000000000  0.00000000 0.0000000 0.00000000 0.0000000
## [2,] 0.000000000  0.00000000 0.0000000 0.00000000 0.1338576
## [3,] 0.000000000  0.00000000 0.2698036 0.00000000 0.4606733
## [4,] 0.003157103  0.00000000 0.2819937 0.00000000 0.4735472
## [5,] 0.005151903  0.00000000 0.2880676 0.01070794 0.4818743
## [6,] 0.019023772 -0.08918256 0.2972721 0.05239529 0.5395488
summary.lars(lasso)
## LARS/LASSO
## Call: lars(x = x, y = prostate$lcavol, trace = TRUE)
##   Df     Rss       Cp
## 0  1 133.359 175.9821
## 1  2 109.970 130.4556
## 2  3  49.273   9.1207
## 3  4  48.073   8.6834
## 4  5  47.379   9.2735
## 5  6  44.784   6.0000
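# The Cp column can also be used to pick a model size programmatically; a
# small convenience sketch (our addition), where the smallest Cp here points
# to the full five-variable model:
which.min(summary(lasso)$Cp)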
coef(lasso,s=c(0.25,0.50,0.75,1.0),
     mode="fraction")
##              age         lbph        lcp    gleason      lpsa
## [1,] 0.000000000  0.000000000 0.06519506 0.00000000 0.2128290
## [2,] 0.000000000  0.000000000 0.18564339 0.00000000 0.3587292
## [3,] 0.005369985 -0.001402051 0.28821232 0.01136331 0.4827810
## [4,] 0.019023772 -0.089182565 0.29727207 0.05239529 0.5395488
cv.lars(x=x,y=prostate$lcavol,K=10)

# The output of cross-validation (average mean square errors and their associated
# standard error bounds) shows that the mean square error increases quite rapidly if
# we shrink the coefficients too aggressively. The mean square error curve is
# smallest at s = 1 (the least squares solution), but is actually quite flat
# for all values of s larger than 0.6. The present example does not call for
# much shrinkage. This is not too
# surprising as the number of estimated coefficients (five coefficients, excluding the
# intercept) is rather small relative to the size of the sample (n = 97).
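# cv.lars returns its results invisibly, so the CV-optimal fraction can also
# be extracted programmatically (a sketch of our own, using the documented
# index, cv, and cv.error components of the returned list):
cvres <- cv.lars(x = x, y = prostate$lcavol, K = 10)
cvres$index[which.min(cvres$cv)]   # fraction s with the smallest CV error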
# A similar conclusion about the required amount of shrinkage is obtained when
# selecting at random 80 of the 97 men for the training data set, and applying the
# LASSO estimates to predict the log volume of the remaining 17 men. Repeating the
# random sampling 10 times (each time leaving out 17 different randomly selected
# men) leads to the mean square prediction errors that are shown in the box plot.
# Again, one is best off staying with the least squares estimates.
# Preallocate vectors to hold the ten out-of-sample MSEs for each value of s
MSElasso25 <- numeric(10)
MSElasso50 <- numeric(10)
MSElasso75 <- numeric(10)
MSElasso100 <- numeric(10)
set.seed(1)
for (i in 1:10) {
  # hold out 17 randomly chosen men; fit the LASSO path on the other 80
  train <- sample(1:nrow(prostate), 80)
  lasso <- lars(x = x[train, ], y = prostate$lcavol[train])
  MSElasso25[i] <-
    mean((predict(lasso, x[-train, ], s = 0.25,
                  mode = "fraction")$fit -
            prostate$lcavol[-train])^2)
  MSElasso50[i] <-
    mean((predict(lasso, x[-train, ], s = 0.50,
                  mode = "fraction")$fit -
            prostate$lcavol[-train])^2)
  MSElasso75[i] <-
    mean((predict(lasso, x[-train, ], s = 0.75,
                  mode = "fraction")$fit -
            prostate$lcavol[-train])^2)
  MSElasso100[i] <-
    mean((predict(lasso, x[-train, ], s = 1.00,
                  mode = "fraction")$fit -
            prostate$lcavol[-train])^2)
}
mean(MSElasso25)
## [1] 1.021938
mean(MSElasso50)
## [1] 0.6723226
mean(MSElasso75)
## [1] 0.5410033
mean(MSElasso100)
## [1] 0.5352386
boxplot(MSElasso25, MSElasso50, MSElasso75, MSElasso100,
        names = c("s=0.25", "s=0.50", "s=0.75", "s=1.00 (LS)"),
        ylab = "MSE", sub = "Lasso Model")