require(glmnet) # Provides lasso and ridge implementations
require(reshape)
require(ggplot2)
require(GGally)
library(caret)
For this example, we will be working with the Wisconsin Breast Cancer Dataset. Each of the 569 observations in this dataset contains 30 measurements taken from images of cell nuclei drawn from a potentially cancerous breast mass. Each observation is labeled as being benign (B) or malignant (M).
wbc <- read.table('data/breast_cancer.csv', header=TRUE, sep=',')
summary(wbc)
id diagnosis radius_mean texture_mean perimeter_mean
Min. : 8670 B:357 Min. : 6.981 Min. : 9.71 Min. : 43.79
1st Qu.: 869218 M:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
Median : 906024 Median :13.370 Median :18.84 Median : 86.24
Mean : 30371831 Mean :14.127 Mean :19.29 Mean : 91.97
3rd Qu.: 8813129 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
Max. :911320502 Max. :28.110 Max. :39.28 Max. :188.50
area_mean smoothness_mean compactness_mean concavity_mean concave.points_mean
Min. : 143.5 Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000
1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956 1st Qu.:0.02031
Median : 551.1 Median :0.09587 Median :0.09263 Median :0.06154 Median :0.03350
Mean : 654.9 Mean :0.09636 Mean :0.10434 Mean :0.08880 Mean :0.04892
3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070 3rd Qu.:0.07400
Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120
symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
Min. :0.1060 Min. :0.04996 Min. :0.1115 Min. :0.3602 Min. : 0.757
1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606
Median :0.1792 Median :0.06154 Median :0.3242 Median :1.1080 Median : 2.287
Mean :0.1812 Mean :0.06280 Mean :0.4052 Mean :1.2169 Mean : 2.866
3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357
Max. :0.3040 Max. :0.09744 Max. :2.8730 Max. :4.8850 Max. :21.980
area_se smoothness_se compactness_se concavity_se concave.points_se
Min. : 6.802 Min. :0.001713 Min. :0.002252 Min. :0.00000 Min. :0.000000
1st Qu.: 17.850 1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509 1st Qu.:0.007638
Median : 24.530 Median :0.006380 Median :0.020450 Median :0.02589 Median :0.010930
Mean : 40.337 Mean :0.007041 Mean :0.025478 Mean :0.03189 Mean :0.011796
3rd Qu.: 45.190 3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205 3rd Qu.:0.014710
Max. :542.200 Max. :0.031130 Max. :0.135400 Max. :0.39600 Max. :0.052790
symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
Min. :0.007882 Min. :0.0008948 Min. : 7.93 Min. :12.02 Min. : 50.41
1st Qu.:0.015160 1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11
Median :0.018730 Median :0.0031870 Median :14.97 Median :25.41 Median : 97.66
Mean :0.020542 Mean :0.0037949 Mean :16.27 Mean :25.68 Mean :107.26
3rd Qu.:0.023480 3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40
Max. :0.078950 Max. :0.0298400 Max. :36.04 Max. :49.54 Max. :251.20
area_worst smoothness_worst compactness_worst concavity_worst concave.points_worst
Min. : 185.2 Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
Median : 686.5 Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
Mean : 880.6 Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
Max. :4254.0 Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
symmetry_worst fractal_dimension_worst
Min. :0.1565 Min. :0.05504
1st Qu.:0.2504 1st Qu.:0.07146
Median :0.2822 Median :0.08004
Mean :0.2901
We do not need the ID column, so we will drop that from our dataframe.
wbc <- wbc[,-1]
names(wbc)
[1] "diagnosis" "radius_mean" "texture_mean"
[4] "perimeter_mean" "area_mean" "smoothness_mean"
[7] "compactness_mean" "concavity_mean" "concave.points_mean"
[10] "symmetry_mean" "fractal_dimension_mean" "radius_se"
[13] "texture_se" "perimeter_se" "area_se"
[16] "smoothness_se" "compactness_se" "concavity_se"
[19] "concave.points_se" "symmetry_se" "fractal_dimension_se"
[22] "radius_worst" "texture_worst" "perimeter_worst"
[25] "area_worst" "smoothness_worst" "compactness_worst"
[28] "concavity_worst" "concave.points_worst" "symmetry_worst"
[31] "fractal_dimension_worst"
We will use the fuinction ggcorr() from GGally to explore the correlations between our predictors.
ggcorr(wbc[,-1], palette = "RdBu", label = TRUE)
The classes are slightly imbalanced, so we will use createDataPartition() from the carat library to created a stratified training/validation split.
set.seed(1)
train.index <- createDataPartition(wbc$diagnosis, p = .8, list=FALSE)
train <- wbc[ train.index,]
valid <- wbc[-train.index,]
summary(train$diagnosis)
B M
286 170
summary(valid$diagnosis)
B M
71 42
We will now create a logistic regression model to predict the diagnosis based on all 30 predictors.
lr_mod <- glm(diagnosis ~ ., train, family=binomial)
glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
summary(lr_mod)
Call:
glm(formula = diagnosis ~ ., family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.092e-04 -2.100e-08 -2.100e-08 2.100e-08 1.166e-04
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.127e+03 1.151e+06 -0.001 0.999
radius_mean -1.363e+02 3.711e+05 0.000 1.000
texture_mean -3.301e+00 5.884e+03 -0.001 1.000
perimeter_mean 2.250e+01 4.020e+04 0.001 1.000
area_mean 3.005e-01 1.218e+03 0.000 1.000
smoothness_mean 6.912e+01 1.867e+06 0.000 1.000
compactness_mean -1.443e+03 8.906e+05 -0.002 0.999
concavity_mean -3.363e+02 1.415e+06 0.000 1.000
concave.points_mean 1.779e+03 1.378e+06 0.001 0.999
symmetry_mean -5.830e+02 4.782e+05 -0.001 0.999
fractal_dimension_mean 6.908e+03 7.975e+06 0.001 0.999
radius_se 4.309e+02 9.838e+05 0.000 1.000
texture_se -5.049e+01 3.899e+04 -0.001 0.999
perimeter_se 1.731e+01 1.059e+05 0.000 1.000
area_se 7.539e-01 5.528e+03 0.000 1.000
smoothness_se 2.972e+03 9.424e+06 0.000 1.000
compactness_se -3.616e+03 2.744e+06 -0.001 0.999
concavity_se -1.486e+03 1.031e+06 -0.001 0.999
concave.points_se 7.884e+03 8.423e+06 0.001 0.999
symmetry_se -6.114e+03 2.151e+06 -0.003 0.998
fractal_dimension_se 3.202e+03 8.623e+06 0.000 1.000
radius_worst -2.559e+01 8.205e+04 0.000 1.000
texture_worst 1.052e+01 6.199e+03 0.002 0.999
perimeter_worst 4.934e+00 7.484e+03 0.001 0.999
area_worst -2.154e-01 5.481e+02 0.000 1.000
smoothness_worst 4.372e+01 1.724e+06 0.000 1.000
compactness_worst -4.695e+02 3.544e+05 -0.001 0.999
concavity_worst 7.415e+02 3.316e+05 0.002 0.998
concave.points_worst -3.576e+02 8.622e+05 0.000 1.000
symmetry_worst 1.065e+03 2.953e+05 0.004 0.997
fractal_dimension_worst 9.989e+01 1.481e+06 0.000 1.000
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6.0231e+02 on 455 degrees of freedom
Residual deviance: 1.6098e-07 on 425 degrees of freedom
AIC: 62
Number of Fisher Scoring iterations: 25
train_prob <- predict(lr_mod, train, type="response")
train_pred <- ifelse(train_prob < 0.5, "B", "M")
train_acc <- mean(train_pred == train$diagnosis)
valid_prob <- predict(lr_mod, valid, type="response")
valid_pred <- ifelse(valid_prob < 0.5, "B", "M")
valid_acc <- mean(valid_pred == valid$diagnosis)
cat("Training Accuracy: ", train_acc, "\n",
"Validation Accuracy: ", valid_acc, sep="")
Training Accuracy: 1
Validation Accuracy: 0.9292035
Xtrain <- as.matrix(train[,-1])
Xvalid <- as.matrix(valid[,-1])
log_lambda_grid <- seq(0, -4, length=100)
lambda_grid <- 10^log_lambda_grid
L1_models <- glmnet(Xtrain, train$diagnosis, alpha=1, lambda=lambda_grid, family="binomial")
L1_coef <- coef(L1_models)
round(L1_coef[, c(1:3, 98:100)], 6)
31 x 6 sparse Matrix of class "dgCMatrix"
s0 s1 s2 s97 s98 s99
(Intercept) -0.520193 -0.520193 -0.520193 -91.641623 -95.596570 -99.530232
radius_mean . . . . . .
texture_mean . . . -0.376121 -0.405276 -0.434595
perimeter_mean . . . . . .
area_mean . . . 0.011231 0.012729 0.014180
smoothness_mean . . . . . .
compactness_mean . . . -92.773666 -96.308426 -99.845583
concavity_mean . . . 29.561628 31.346292 33.085163
concave.points_mean . . . 206.421648 212.977381 219.576766
symmetry_mean . . . -77.354320 -80.875055 -84.354974
fractal_dimension_mean . . . 445.169965 469.254407 492.509249
radius_se . . . . . .
texture_se . . . -5.018798 -5.335496 -5.650593
perimeter_se . . . 4.721202 4.935302 5.147674
area_se . . . 0.415790 0.437377 0.458953
smoothness_se . . . 404.254357 416.649278 428.243694
compactness_se . . . -497.626249 -518.694896 -539.685298
concavity_se . . . -88.423060 -92.982936 -97.433039
concave.points_se . . . 729.155984 763.086267 797.221263
symmetry_se . . . -531.724530 -559.875791 -587.472266
fractal_dimension_se . . . . . .
radius_worst . . . . . .
texture_worst . . . 1.093473 1.150701 1.208161
perimeter_worst . . . . . .
area_worst . . . 0.003872 0.003022 0.002215
smoothness_worst . . . 0.056776 0.375174 0.892680
compactness_worst . . . -7.156507 -8.067822 -8.926024
concavity_worst . . . 45.403565 47.442932 49.451673
concave.points_worst . . . . . .
symmetry_worst . . . 93.199865 97.647607 102.009687
fractal_dimension_worst . . . . . .
L1_coef_Df <- data.frame(t(as.matrix(L1_coef)[-1,]))
L1_coef_Df$grid <- log_lambda_grid
L1_melted <- melt(L1_coef_Df, id.vars="grid")
ggplot(L1_melted, aes(grid, value, col=variable)) + geom_line(size=1) +
xlab('log(lambda)') + ylab('Coefficients') + theme(legend.position = "none")
valid_prob <- predict(L1_models, Xvalid, type='response')
train_prob <- predict(L1_models, Xtrain, type='response')
valid_prob[1:10, c(1:3, 98:100)]
s0 s1 s2 s97 s98 s99
14 0.372807 0.372807 0.372807 7.098823e-02 6.405827e-02 5.796044e-02
16 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
19 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
25 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
27 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
36 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
38 0.372807 0.372807 0.372807 5.608363e-16 1.192839e-16 2.536755e-17
41 0.372807 0.372807 0.372807 1.227445e-03 9.349827e-04 7.119527e-04
43 0.372807 0.372807 0.372807 1.000000e+00 1.000000e+00 1.000000e+00
47 0.372807 0.372807 0.372807 5.277422e-18 1.090656e-18 2.236692e-19
train_acc_list <- c()
valid_acc_list <- c()
for(i in 1:100){
train_pred <- ifelse(train_prob[,i] < 0.5, "B", "M")
valid_pred <- ifelse(valid_prob[,i] < 0.5, "B", "M")
train_acc <- mean(train_pred == train$diagnosis)
valid_acc <- mean(valid_pred == valid$diagnosis)
train_acc_list <- c(train_acc_list, train_acc)
valid_acc_list <- c(valid_acc_list, valid_acc)
}
valid_acc_list[c(1:3, 98:100)]
[1] 0.6283186 0.6283186 0.6283186 0.9292035 0.9292035 0.9292035
plot(log_lambda_grid, train_acc_list, ylim=c(0.9,1), pch=".", col="salmon",
xlab="ln(lambda)", ylab="r-Squared", main="Training and Validation Scores (L1 Regularization)")
lines(log_lambda_grid, train_acc_list, col="salmon", lwd=2)
lines(log_lambda_grid, valid_acc_list, col="cornflowerblue", lwd=2)
legend(-1, 1, legend=c("Training Acc", "Validation Acc"),
col=c("salmon", "cornflowerblue"), lty=1, lwd=2, cex=0.8)
best_valid_acc <- max(valid_acc_list)
best_valid_ix <- which.max(valid_acc_list)
best_log_lambda <- log_lambda_grid[best_valid_ix]
cat('Value of Optimal Accuracy: ', best_valid_acc, '\n',
'Index of Optimal Accuracy: ', best_valid_ix, '\n',
'Value of Optimal log(lambda): ', best_log_lambda, sep='')
Value of Optimal Accuracy: 0.9734513
Index of Optimal Accuracy: 61
Value of Optimal log(lambda): -2.424242
best_coef <- L1_coef[,best_valid_ix]
best_coef[best_coef > 0]
texture_mean concave.points_mean radius_se smoothness_se
0.067624717 31.961681711 7.194423600 6.149311710
radius_worst texture_worst perimeter_worst area_worst
0.346194962 0.171694441 0.033123423 0.001727457
smoothness_worst concavity_worst concave.points_worst symmetry_worst
30.399453343 3.915570391 14.114810952 5.658251117
ggcorr(wbc[,best_coef != 0][,-1], palette = "RdBu", label = TRUE)