Wisconsin Breast Cancer Dataset

For this example, we will be working with the Wisconsin Breast Cancer Dataset. Each of the 569 observations in this dataset contains 30 measurements taken from images of cell nuclei drawn from a potentially cancerous breast mass. Each observation is labeled as being benign (B) or malignant (M).

wbc <- read.table('data/breast_cancer.csv', header=TRUE, sep=',')
summary(wbc)

##        id            diagnosis  radius_mean      texture_mean  
##  Min.   :     8670   B:357     Min.   : 6.981   Min.   : 9.71  
##  1st Qu.:   869218   M:212     1st Qu.:11.700   1st Qu.:16.17  
##  Median :   906024             Median :13.370   Median :18.84  
##  Mean   : 30371831             Mean   :14.127   Mean   :19.29  
##  3rd Qu.:  8813129             3rd Qu.:15.780   3rd Qu.:21.80  
##  Max.   :911320502             Max.   :28.110   Max.   :39.28  
##  perimeter_mean     area_mean      smoothness_mean   compactness_mean 
##  Min.   : 43.79   Min.   : 143.5   Min.   :0.05263   Min.   :0.01938  
##  1st Qu.: 75.17   1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492  
##  Median : 86.24   Median : 551.1   Median :0.09587   Median :0.09263  
##  Mean   : 91.97   Mean   : 654.9   Mean   :0.09636   Mean   :0.10434  
##  3rd Qu.:104.10   3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040  
##  Max.   :188.50   Max.   :2501.0   Max.   :0.16340   Max.   :0.34540  
##  concavity_mean    concave.points_mean symmetry_mean   
##  Min.   :0.00000   Min.   :0.00000     Min.   :0.1060  
##  1st Qu.:0.02956   1st Qu.:0.02031     1st Qu.:0.1619  
##  Median :0.06154   Median :0.03350     Median :0.1792  
##  Mean   :0.08880   Mean   :0.04892     Mean   :0.1812  
##  3rd Qu.:0.13070   3rd Qu.:0.07400     3rd Qu.:0.1957  
##  Max.   :0.42680   Max.   :0.20120     Max.   :0.3040  
##  fractal_dimension_mean   radius_se        texture_se      perimeter_se   
##  Min.   :0.04996        Min.   :0.1115   Min.   :0.3602   Min.   : 0.757  
##  1st Qu.:0.05770        1st Qu.:0.2324   1st Qu.:0.8339   1st Qu.: 1.606  
##  Median :0.06154        Median :0.3242   Median :1.1080   Median : 2.287  
##  Mean   :0.06280        Mean   :0.4052   Mean   :1.2169   Mean   : 2.866  
##  3rd Qu.:0.06612        3rd Qu.:0.4789   3rd Qu.:1.4740   3rd Qu.: 3.357  
##  Max.   :0.09744        Max.   :2.8730   Max.   :4.8850   Max.   :21.980  
##     area_se        smoothness_se      compactness_se      concavity_se    
##  Min.   :  6.802   Min.   :0.001713   Min.   :0.002252   Min.   :0.00000  
##  1st Qu.: 17.850   1st Qu.:0.005169   1st Qu.:0.013080   1st Qu.:0.01509  
##  Median : 24.530   Median :0.006380   Median :0.020450   Median :0.02589  
##  Mean   : 40.337   Mean   :0.007041   Mean   :0.025478   Mean   :0.03189  
##  3rd Qu.: 45.190   3rd Qu.:0.008146   3rd Qu.:0.032450   3rd Qu.:0.04205  
##  Max.   :542.200   Max.   :0.031130   Max.   :0.135400   Max.   :0.39600  
##  concave.points_se   symmetry_se       fractal_dimension_se
##  Min.   :0.000000   Min.   :0.007882   Min.   :0.0008948   
##  1st Qu.:0.007638   1st Qu.:0.015160   1st Qu.:0.0022480   
##  Median :0.010930   Median :0.018730   Median :0.0031870   
##  Mean   :0.011796   Mean   :0.020542   Mean   :0.0037949   
##  3rd Qu.:0.014710   3rd Qu.:0.023480   3rd Qu.:0.0045580   
##  Max.   :0.052790   Max.   :0.078950   Max.   :0.0298400   
##   radius_worst   texture_worst   perimeter_worst    area_worst    
##  Min.   : 7.93   Min.   :12.02   Min.   : 50.41   Min.   : 185.2  
##  1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11   1st Qu.: 515.3  
##  Median :14.97   Median :25.41   Median : 97.66   Median : 686.5  
##  Mean   :16.27   Mean   :25.68   Mean   :107.26   Mean   : 880.6  
##  3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40   3rd Qu.:1084.0  
##  Max.   :36.04   Max.   :49.54   Max.   :251.20   Max.   :4254.0  
##  smoothness_worst  compactness_worst concavity_worst  concave.points_worst
##  Min.   :0.07117   Min.   :0.02729   Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145   1st Qu.:0.06493     
##  Median :0.13130   Median :0.21190   Median :0.2267   Median :0.09993     
##  Mean   :0.13237   Mean   :0.25427   Mean   :0.2722   Mean   :0.11461     
##  3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829   3rd Qu.:0.16140     
##  Max.   :0.22260   Max.   :1.05800   Max.   :1.2520   Max.   :0.29100     
##  symmetry_worst   fractal_dimension_worst
##  Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.2822   Median :0.08004        
##  Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :0.6638   Max.   :0.20750

Drop ID Column

We do not need the ID column, so we will drop that from our dataframe.

wbc <- wbc[,-1]
names(wbc)

##  [1] "diagnosis"               "radius_mean"            
##  [3] "texture_mean"            "perimeter_mean"         
##  [5] "area_mean"               "smoothness_mean"        
##  [7] "compactness_mean"        "concavity_mean"         
##  [9] "concave.points_mean"     "symmetry_mean"          
## [11] "fractal_dimension_mean"  "radius_se"              
## [13] "texture_se"              "perimeter_se"           
## [15] "area_se"                 "smoothness_se"          
## [17] "compactness_se"          "concavity_se"           
## [19] "concave.points_se"       "symmetry_se"            
## [21] "fractal_dimension_se"    "radius_worst"           
## [23] "texture_worst"           "perimeter_worst"        
## [25] "area_worst"              "smoothness_worst"       
## [27] "compactness_worst"       "concavity_worst"        
## [29] "concave.points_worst"    "symmetry_worst"         
## [31] "fractal_dimension_worst"

Correlation Matrix

We will use the fuinction ggcorr() from GGally to explore the correlations between our predictors.

ggcorr(wbc[,-1], palette = "RdBu", label = TRUE)

Create Training and Validation Sets

The classes are slightly imbalanced, so we will use createDataPartition() from the carat library to created a stratified training/validation split.

set.seed(1)
train.index <- createDataPartition(wbc$diagnosis, p = .8, list=FALSE)
train <- wbc[ train.index,]
valid  <- wbc[-train.index,]

summary(train$diagnosis)

##   B   M 
## 286 170

summary(valid$diagnosis)

##  B  M 
## 71 42

Create Logistic Regression Model

We will now create a logistic regression model to predict the diagnosis based on all 30 predictors.

lr_mod <- glm(diagnosis ~ ., train, family=binomial)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(lr_mod)

## 
## Call:
## glm(formula = diagnosis ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.092e-04  -2.100e-08  -2.100e-08   2.100e-08   1.166e-04  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)             -1.127e+03  1.151e+06  -0.001    0.999
## radius_mean             -1.363e+02  3.711e+05   0.000    1.000
## texture_mean            -3.301e+00  5.884e+03  -0.001    1.000
## perimeter_mean           2.250e+01  4.020e+04   0.001    1.000
## area_mean                3.005e-01  1.218e+03   0.000    1.000
## smoothness_mean          6.912e+01  1.867e+06   0.000    1.000
## compactness_mean        -1.443e+03  8.906e+05  -0.002    0.999
## concavity_mean          -3.363e+02  1.415e+06   0.000    1.000
## concave.points_mean      1.779e+03  1.378e+06   0.001    0.999
## symmetry_mean           -5.830e+02  4.782e+05  -0.001    0.999
## fractal_dimension_mean   6.908e+03  7.975e+06   0.001    0.999
## radius_se                4.309e+02  9.838e+05   0.000    1.000
## texture_se              -5.049e+01  3.899e+04  -0.001    0.999
## perimeter_se             1.731e+01  1.059e+05   0.000    1.000
## area_se                  7.539e-01  5.528e+03   0.000    1.000
## smoothness_se            2.972e+03  9.424e+06   0.000    1.000
## compactness_se          -3.616e+03  2.744e+06  -0.001    0.999
## concavity_se            -1.486e+03  1.031e+06  -0.001    0.999
## concave.points_se        7.884e+03  8.423e+06   0.001    0.999
## symmetry_se             -6.114e+03  2.151e+06  -0.003    0.998
## fractal_dimension_se     3.202e+03  8.623e+06   0.000    1.000
## radius_worst            -2.559e+01  8.205e+04   0.000    1.000
## texture_worst            1.052e+01  6.199e+03   0.002    0.999
## perimeter_worst          4.934e+00  7.484e+03   0.001    0.999
## area_worst              -2.154e-01  5.481e+02   0.000    1.000
## smoothness_worst         4.372e+01  1.724e+06   0.000    1.000
## compactness_worst       -4.695e+02  3.544e+05  -0.001    0.999
## concavity_worst          7.415e+02  3.316e+05   0.002    0.998
## concave.points_worst    -3.576e+02  8.622e+05   0.000    1.000
## symmetry_worst           1.065e+03  2.953e+05   0.004    0.997
## fractal_dimension_worst  9.989e+01  1.481e+06   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6.0231e+02  on 455  degrees of freedom
## Residual deviance: 1.6098e-07  on 425  degrees of freedom
## AIC: 62
## 
## Number of Fisher Scoring iterations: 25

Accuracy of Unregularized Model

train_prob <- predict(lr_mod, train, type="response")
train_pred <- ifelse(train_prob < 0.5, "B", "M")
train_acc <- mean(train_pred == train$diagnosis)

valid_prob <- predict(lr_mod, valid, type="response")
valid_pred <- ifelse(valid_prob < 0.5, "B", "M")
valid_acc <- mean(valid_pred == valid$diagnosis)

cat("Training Accuracy:   ", train_acc, "\n",
    "Validation Accuracy: ", valid_acc, sep="")

## Training Accuracy:   1
## Validation Accuracy: 0.9292035

Create Models with L1 Regularization

Xtrain <- as.matrix(train[,-1])
Xvalid <- as.matrix(valid[,-1])

log_lambda_grid <- seq(0, -4, length=100) 
lambda_grid <- 10^log_lambda_grid

L1_models <- glmnet(Xtrain, train$diagnosis, alpha=1, lambda=lambda_grid, family="binomial")

L1_coef <- coef(L1_models)

round(L1_coef[, c(1:3, 98:100)], 6)

## 31 x 6 sparse Matrix of class "dgCMatrix"
##                                s0        s1        s2         s97
## (Intercept)             -0.520193 -0.520193 -0.520193  -91.641623
## radius_mean              .         .         .           .       
## texture_mean             .         .         .          -0.376121
## perimeter_mean           .         .         .           .       
## area_mean                .         .         .           0.011231
## smoothness_mean          .         .         .           .       
## compactness_mean         .         .         .         -92.773666
## concavity_mean           .         .         .          29.561628
## concave.points_mean      .         .         .         206.421648
## symmetry_mean            .         .         .         -77.354320
## fractal_dimension_mean   .         .         .         445.169965
## radius_se                .         .         .           .       
## texture_se               .         .         .          -5.018798
## perimeter_se             .         .         .           4.721202
## area_se                  .         .         .           0.415790
## smoothness_se            .         .         .         404.254357
## compactness_se           .         .         .        -497.626249
## concavity_se             .         .         .         -88.423060
## concave.points_se        .         .         .         729.155984
## symmetry_se              .         .         .        -531.724530
## fractal_dimension_se     .         .         .           .       
## radius_worst             .         .         .           .       
## texture_worst            .         .         .           1.093473
## perimeter_worst          .         .         .           .       
## area_worst               .         .         .           0.003872
## smoothness_worst         .         .         .           0.056776
## compactness_worst        .         .         .          -7.156507
## concavity_worst          .         .         .          45.403565
## concave.points_worst     .         .         .           .       
## symmetry_worst           .         .         .          93.199865
## fractal_dimension_worst  .         .         .           .       
##                                 s98         s99
## (Intercept)              -95.596570  -99.530232
## radius_mean                .           .       
## texture_mean              -0.405276   -0.434595
## perimeter_mean             .           .       
## area_mean                  0.012729    0.014180
## smoothness_mean            .           .       
## compactness_mean         -96.308426  -99.845583
## concavity_mean            31.346292   33.085163
## concave.points_mean      212.977381  219.576766
## symmetry_mean            -80.875055  -84.354974
## fractal_dimension_mean   469.254407  492.509249
## radius_se                  .           .       
## texture_se                -5.335496   -5.650593
## perimeter_se               4.935302    5.147674
## area_se                    0.437377    0.458953
## smoothness_se            416.649278  428.243694
## compactness_se          -518.694896 -539.685298
## concavity_se             -92.982936  -97.433039
## concave.points_se        763.086267  797.221263
## symmetry_se             -559.875791 -587.472266
## fractal_dimension_se       .           .       
## radius_worst               .           .       
## texture_worst              1.150701    1.208161
## perimeter_worst            .           .       
## area_worst                 0.003022    0.002215
## smoothness_worst           0.375174    0.892680
## compactness_worst         -8.067822   -8.926024
## concavity_worst           47.442932   49.451673
## concave.points_worst       .           .       
## symmetry_worst            97.647607  102.009687
## fractal_dimension_worst    .           .

Plot Coefficient Estimates against Lambda

L1_coef_Df <- data.frame(t(as.matrix(L1_coef)[-1,]))
L1_coef_Df$grid <- log_lambda_grid

L1_melted <- melt(L1_coef_Df, id.vars="grid")

ggplot(L1_melted, aes(grid, value, col=variable)) + geom_line(size=1) +
  xlab('log(lambda)') + ylab('Coefficients') + theme(legend.position = "none")

Calculating Probability Estimates

valid_prob <- predict(L1_models, Xvalid, type='response')
train_prob <- predict(L1_models, Xtrain, type='response')

valid_prob[1:10, c(1:3, 98:100)]

##          s0       s1       s2          s97          s98          s99
## 14 0.372807 0.372807 0.372807 7.098823e-02 6.405827e-02 5.796044e-02
## 16 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
## 19 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
## 25 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
## 27 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
## 36 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
## 38 0.372807 0.372807 0.372807 5.608363e-16 1.192839e-16 2.536755e-17
## 41 0.372807 0.372807 0.372807 1.227445e-03 9.349827e-04 7.119527e-04
## 43 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
## 47 0.372807 0.372807 0.372807 5.277422e-18 1.090656e-18 2.236692e-19

Calculating Training and Validation Accuracies

train_acc_list <- c()
valid_acc_list <- c()

for(i in 1:100){
  train_pred <- ifelse(train_prob[,i] < 0.5, "B", "M")
  valid_pred <- ifelse(valid_prob[,i] < 0.5, "B", "M")
  
  train_acc <- mean(train_pred == train$diagnosis)
  valid_acc <- mean(valid_pred == valid$diagnosis)
  
  train_acc_list <- c(train_acc_list, train_acc)
  valid_acc_list <- c(valid_acc_list, valid_acc)
  
}

valid_acc_list[c(1:3, 98:100)]

## [1] 0.6283186 0.6283186 0.6283186 0.9292035 0.9292035 0.9292035

Plotting Training and Validation Curves

plot(log_lambda_grid, train_acc_list, ylim=c(0.9,1), pch=".", col="salmon", 
     xlab="ln(lambda)", ylab="r-Squared", main="Training and Validation Scores (L1 Regularization)")

lines(log_lambda_grid, train_acc_list, col="salmon", lwd=2)

lines(log_lambda_grid, valid_acc_list, col="cornflowerblue", lwd=2)

legend(-1, 1, legend=c("Training Acc", "Validation Acc"),
       col=c("salmon", "cornflowerblue"), lty=1, lwd=2, cex=0.8)

Finding Optimal Value of Lambda

best_valid_acc <- max(valid_acc_list)
best_valid_ix <- which.max(valid_acc_list)
best_log_lambda <- log_lambda_grid[best_valid_ix]

cat('Value of Optimal Accuracy:    ', best_valid_acc, '\n',
    'Index of Optimal Accuracy:    ', best_valid_ix, '\n',
    'Value of Optimal log(lambda): ', best_log_lambda, sep='')

## Value of Optimal Accuracy:    0.9734513
## Index of Optimal Accuracy:    61
## Value of Optimal log(lambda): -2.424242

Coefficients in Optimal Model

best_coef <- L1_coef[,best_valid_ix]
best_coef[best_coef > 0]

##         texture_mean  concave.points_mean            radius_se 
##          0.067624717         31.961681711          7.194423600 
##        smoothness_se         radius_worst        texture_worst 
##          6.149311710          0.346194962          0.171694441 
##      perimeter_worst           area_worst     smoothness_worst 
##          0.033123423          0.001727457         30.399453343 
##      concavity_worst concave.points_worst       symmetry_worst 
##          3.915570391         14.114810952          5.658251117

Lesson 6.2 - Regularized Logistic Regression

Load Packages