libraries
library(ggplot2)
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-10
library(class)
library(tree)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
wdbc<- read.csv("wdbc copy.csv", header = FALSE)
colnames(wdbc) <- c(
"ID",
"Diagnosis",
"radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
"compactness_mean", "concavity_mean", "concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
"radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
"compactness_se", "concavity_se", "concave_points_se", "symmetry_se", "fractal_dimension_se",
"radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
"compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst", "fractal_dimension_worst"
)
head(wdbc)
## ID Diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 842302 M 17.99 10.38 122.80 1001.0
## 2 842517 M 20.57 17.77 132.90 1326.0
## 3 84300903 M 19.69 21.25 130.00 1203.0
## 4 84348301 M 11.42 20.38 77.58 386.1
## 5 84358402 M 20.29 14.34 135.10 1297.0
## 6 843786 M 12.45 15.70 82.57 477.1
## smoothness_mean compactness_mean concavity_mean concave_points_mean
## 1 0.11840 0.27760 0.3001 0.14710
## 2 0.08474 0.07864 0.0869 0.07017
## 3 0.10960 0.15990 0.1974 0.12790
## 4 0.14250 0.28390 0.2414 0.10520
## 5 0.10030 0.13280 0.1980 0.10430
## 6 0.12780 0.17000 0.1578 0.08089
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.2419 0.07871 1.0950 0.9053 8.589
## 2 0.1812 0.05667 0.5435 0.7339 3.398
## 3 0.2069 0.05999 0.7456 0.7869 4.585
## 4 0.2597 0.09744 0.4956 1.1560 3.445
## 5 0.1809 0.05883 0.7572 0.7813 5.438
## 6 0.2087 0.07613 0.3345 0.8902 2.217
## area_se smoothness_se compactness_se concavity_se concave_points_se
## 1 153.40 0.006399 0.04904 0.05373 0.01587
## 2 74.08 0.005225 0.01308 0.01860 0.01340
## 3 94.03 0.006150 0.04006 0.03832 0.02058
## 4 27.23 0.009110 0.07458 0.05661 0.01867
## 5 94.44 0.011490 0.02461 0.05688 0.01885
## 6 27.19 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1 0.03003 0.006193 25.38 17.33 184.60
## 2 0.01389 0.003532 24.99 23.41 158.80
## 3 0.02250 0.004571 23.57 25.53 152.50
## 4 0.05963 0.009208 14.91 26.50 98.87
## 5 0.01756 0.005115 22.54 16.67 152.20
## 6 0.02165 0.005082 15.47 23.75 103.40
## area_worst smoothness_worst compactness_worst concavity_worst
## 1 2019.0 0.1622 0.6656 0.7119
## 2 1956.0 0.1238 0.1866 0.2416
## 3 1709.0 0.1444 0.4245 0.4504
## 4 567.7 0.2098 0.8663 0.6869
## 5 1575.0 0.1374 0.2050 0.4000
## 6 741.6 0.1791 0.5249 0.5355
## concave_points_worst symmetry_worst fractal_dimension_worst
## 1 0.2654 0.4601 0.11890
## 2 0.1860 0.2750 0.08902
## 3 0.2430 0.3613 0.08758
## 4 0.2575 0.6638 0.17300
## 5 0.1625 0.2364 0.07678
## 6 0.1741 0.3985 0.12440
clean data
wdbc$ID <- NULL
wdbc$Diagnosis <- factor(wdbc$Diagnosis, levels = c("B","M"))
summary(wdbc$Diagnosis)
## B M
## 357 212
head(wdbc)
## Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 M 17.99 10.38 122.80 1001.0 0.11840
## 2 M 20.57 17.77 132.90 1326.0 0.08474
## 3 M 19.69 21.25 130.00 1203.0 0.10960
## 4 M 11.42 20.38 77.58 386.1 0.14250
## 5 M 20.29 14.34 135.10 1297.0 0.10030
## 6 M 12.45 15.70 82.57 477.1 0.12780
## compactness_mean concavity_mean concave_points_mean symmetry_mean
## 1 0.27760 0.3001 0.14710 0.2419
## 2 0.07864 0.0869 0.07017 0.1812
## 3 0.15990 0.1974 0.12790 0.2069
## 4 0.28390 0.2414 0.10520 0.2597
## 5 0.13280 0.1980 0.10430 0.1809
## 6 0.17000 0.1578 0.08089 0.2087
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1 0.07871 1.0950 0.9053 8.589 153.40
## 2 0.05667 0.5435 0.7339 3.398 74.08
## 3 0.05999 0.7456 0.7869 4.585 94.03
## 4 0.09744 0.4956 1.1560 3.445 27.23
## 5 0.05883 0.7572 0.7813 5.438 94.44
## 6 0.07613 0.3345 0.8902 2.217 27.19
## smoothness_se compactness_se concavity_se concave_points_se symmetry_se
## 1 0.006399 0.04904 0.05373 0.01587 0.03003
## 2 0.005225 0.01308 0.01860 0.01340 0.01389
## 3 0.006150 0.04006 0.03832 0.02058 0.02250
## 4 0.009110 0.07458 0.05661 0.01867 0.05963
## 5 0.011490 0.02461 0.05688 0.01885 0.01756
## 6 0.007510 0.03345 0.03672 0.01137 0.02165
## fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1 0.006193 25.38 17.33 184.60 2019.0
## 2 0.003532 24.99 23.41 158.80 1956.0
## 3 0.004571 23.57 25.53 152.50 1709.0
## 4 0.009208 14.91 26.50 98.87 567.7
## 5 0.005115 22.54 16.67 152.20 1575.0
## 6 0.005082 15.47 23.75 103.40 741.6
## smoothness_worst compactness_worst concavity_worst concave_points_worst
## 1 0.1622 0.6656 0.7119 0.2654
## 2 0.1238 0.1866 0.2416 0.1860
## 3 0.1444 0.4245 0.4504 0.2430
## 4 0.2098 0.8663 0.6869 0.2575
## 5 0.1374 0.2050 0.4000 0.1625
## 6 0.1791 0.5249 0.5355 0.1741
## symmetry_worst fractal_dimension_worst
## 1 0.4601 0.11890
## 2 0.2750 0.08902
## 3 0.3613 0.08758
## 4 0.6638 0.17300
## 5 0.2364 0.07678
## 6 0.3985 0.12440
standardize predictors
predictor_names <- colnames(wdbc)[-1]
wdbc_scaled <- wdbc
wdbc_scaled[, predictor_names] <- scale(wdbc[, predictor_names])
head(wdbc_scaled)
## Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 M 1.0960995 -2.0715123 1.2688173 0.9835095 1.5670875
## 2 M 1.8282120 -0.3533215 1.6844726 1.9070303 -0.8262354
## 3 M 1.5784992 0.4557859 1.5651260 1.5575132 0.9413821
## 4 M -0.7682333 0.2535091 -0.5921661 -0.7637917 3.2806668
## 5 M 1.7487579 -1.1508038 1.7750113 1.8246238 0.2801253
## 6 M -0.4759559 -0.8346009 -0.3868077 -0.5052059 2.2354545
## compactness_mean concavity_mean concave_points_mean symmetry_mean
## 1 3.2806281 2.65054179 2.5302489 2.215565542
## 2 -0.4866435 -0.02382489 0.5476623 0.001391139
## 3 1.0519999 1.36227979 2.0354398 0.938858720
## 4 3.3999174 1.91421287 1.4504311 2.864862154
## 5 0.5388663 1.36980615 1.4272370 -0.009552062
## 6 1.2432416 0.86554001 0.8239307 1.004517928
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1 2.2537638 2.4875451 -0.5647681 2.8305403 2.4853907
## 2 -0.8678888 0.4988157 -0.8754733 0.2630955 0.7417493
## 3 -0.3976580 1.2275958 -0.7793976 0.8501802 1.1802975
## 4 4.9066020 0.3260865 -0.1103120 0.2863415 -0.2881246
## 5 -0.5619555 1.2694258 -0.7895490 1.2720701 1.1893103
## 6 1.8883435 -0.2548461 -0.5921406 -0.3210217 -0.2890039
## smoothness_se compactness_se concavity_se concave_points_se symmetry_se
## 1 -0.2138135 1.31570389 0.7233897 0.66023900 1.1477468
## 2 -0.6048187 -0.69231710 -0.4403926 0.25993335 -0.8047423
## 3 -0.2967439 0.81425704 0.2128891 1.42357487 0.2368272
## 4 0.6890953 2.74186785 0.8187979 1.11402678 4.7285198
## 5 1.4817634 -0.04847723 0.8277425 1.14319885 -0.3607748
## 6 0.1562093 0.44515196 0.1598845 -0.06906279 0.1340009
## fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1 0.90628565 1.8850310 -1.35809849 2.3015755 1.9994782
## 2 -0.09935632 1.8043398 -0.36887865 1.5337764 1.8888270
## 3 0.29330133 1.5105411 -0.02395331 1.3462906 1.4550043
## 4 2.04571087 -0.2812170 0.13386631 -0.2497196 -0.5495377
## 5 0.49888916 1.2974336 -1.46548091 1.3373627 1.2196511
## 6 0.48641784 -0.1653528 -0.31356043 -0.1149083 -0.2441054
## smoothness_worst compactness_worst concavity_worst concave_points_worst
## 1 1.3065367 2.6143647 2.1076718 2.2940576
## 2 -0.3752817 -0.4300658 -0.1466200 1.0861286
## 3 0.5269438 1.0819801 0.8542223 1.9532817
## 4 3.3912907 3.8899747 1.9878392 2.1738732
## 5 0.2203623 -0.3131190 0.6126397 0.7286181
## 6 2.0467119 1.7201029 1.2621327 0.9050914
## symmetry_worst fractal_dimension_worst
## 1 2.7482041 1.9353117
## 2 -0.2436753 0.2809428
## 3 1.1512420 0.2012142
## 4 6.0407261 4.9306719
## 5 -0.8675896 -0.3967505
## 6 1.7525273 2.2398308
train/test split
set.seed(4630)
n <- nrow(wdbc_scaled)
train_idx <- sample(1:n, size = floor(0.7 * n))
wdbc_train <- wdbc_scaled[train_idx, ]
wdbc_test <- wdbc_scaled[-train_idx, ]
table(wdbc_train$Diagnosis)
##
## B M
## 250 148
table(wdbc_test$Diagnosis)
##
## B M
## 107 64
EDA
PCA plot
# 1. Create matrix of predictors
X_pca <- wdbc %>%
dplyr::select(-Diagnosis) %>% # keep only numeric columns
as.matrix()
# 2. Run PCA on standardized predictors
wdbc_pca <- prcomp(X_pca, center = TRUE, scale. = TRUE)
# 3. PCA biplot: points = observations, arrows = variables
fviz_pca_biplot(
wdbc_pca,
geom = "point",
habillage = wdbc$Diagnosis, # color by diagnosis
addEllipses = TRUE, # optional: class ellipses
label = "var", # show variable names as arrows
col.var = "black",
alpha.ind = 0.6
) +
theme_minimal() +
labs(
title = "PCA Biplot of WDBC Predictors",
color = "Diagnosis"
)
class imbalance
ggplot(wdbc, aes(x = Diagnosis)) +
geom_bar() +
labs(title = "Class Distribution", y = "Count")
The dataset contains more observations in the benign category than the
malignant, which could potentially lead to problems in classification
where the model is better at recognizing cases of benign but not as good
at recognizing malignant. This could potentially lead to higher
specificity than sensitivity of the models.
summary stats
summary(wdbc[, predictor_names])
## radius_mean texture_mean perimeter_mean area_mean
## Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
## 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17 1st Qu.: 420.3
## Median :13.370 Median :18.84 Median : 86.24 Median : 551.1
## Mean :14.127 Mean :19.29 Mean : 91.97 Mean : 654.9
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10 3rd Qu.: 782.7
## Max. :28.110 Max. :39.28 Max. :188.50 Max. :2501.0
## smoothness_mean compactness_mean concavity_mean concave_points_mean
## Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956 1st Qu.:0.02031
## Median :0.09587 Median :0.09263 Median :0.06154 Median :0.03350
## Mean :0.09636 Mean :0.10434 Mean :0.08880 Mean :0.04892
## 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070 3rd Qu.:0.07400
## Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120
## symmetry_mean fractal_dimension_mean radius_se texture_se
## Min. :0.1060 Min. :0.04996 Min. :0.1115 Min. :0.3602
## 1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324 1st Qu.:0.8339
## Median :0.1792 Median :0.06154 Median :0.3242 Median :1.1080
## Mean :0.1812 Mean :0.06280 Mean :0.4052 Mean :1.2169
## 3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789 3rd Qu.:1.4740
## Max. :0.3040 Max. :0.09744 Max. :2.8730 Max. :4.8850
## perimeter_se area_se smoothness_se compactness_se
## Min. : 0.757 Min. : 6.802 Min. :0.001713 Min. :0.002252
## 1st Qu.: 1.606 1st Qu.: 17.850 1st Qu.:0.005169 1st Qu.:0.013080
## Median : 2.287 Median : 24.530 Median :0.006380 Median :0.020450
## Mean : 2.866 Mean : 40.337 Mean :0.007041 Mean :0.025478
## 3rd Qu.: 3.357 3rd Qu.: 45.190 3rd Qu.:0.008146 3rd Qu.:0.032450
## Max. :21.980 Max. :542.200 Max. :0.031130 Max. :0.135400
## concavity_se concave_points_se symmetry_se fractal_dimension_se
## Min. :0.00000 Min. :0.000000 Min. :0.007882 Min. :0.0008948
## 1st Qu.:0.01509 1st Qu.:0.007638 1st Qu.:0.015160 1st Qu.:0.0022480
## Median :0.02589 Median :0.010930 Median :0.018730 Median :0.0031870
## Mean :0.03189 Mean :0.011796 Mean :0.020542 Mean :0.0037949
## 3rd Qu.:0.04205 3rd Qu.:0.014710 3rd Qu.:0.023480 3rd Qu.:0.0045580
## Max. :0.39600 Max. :0.052790 Max. :0.078950 Max. :0.0298400
## radius_worst texture_worst perimeter_worst area_worst
## Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
## 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11 1st Qu.: 515.3
## Median :14.97 Median :25.41 Median : 97.66 Median : 686.5
## Mean :16.27 Mean :25.68 Mean :107.26 Mean : 880.6
## 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40 3rd Qu.:1084.0
## Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
## smoothness_worst compactness_worst concavity_worst concave_points_worst
## Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
## Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
## Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
## 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
## Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
## symmetry_worst fractal_dimension_worst
## Min. :0.1565 Min. :0.05504
## 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.2822 Median :0.08004
## Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.6638 Max. :0.20750
correlation plots (for the means of each feature)
mean_vars <- grep("_mean$", names(wdbc_scaled), value = TRUE)
mean_vars
## [1] "radius_mean" "texture_mean" "perimeter_mean"
## [4] "area_mean" "smoothness_mean" "compactness_mean"
## [7] "concavity_mean" "concave_points_mean" "symmetry_mean"
## [10] "fractal_dimension_mean"
cor_mean <- cor(wdbc_scaled[, mean_vars])
cor_mean
## radius_mean texture_mean perimeter_mean area_mean
## radius_mean 1.0000000 0.32378189 0.9978553 0.9873572
## texture_mean 0.3237819 1.00000000 0.3295331 0.3210857
## perimeter_mean 0.9978553 0.32953306 1.0000000 0.9865068
## area_mean 0.9873572 0.32108570 0.9865068 1.0000000
## smoothness_mean 0.1705812 -0.02338852 0.2072782 0.1770284
## compactness_mean 0.5061236 0.23670222 0.5569362 0.4985017
## concavity_mean 0.6767636 0.30241783 0.7161357 0.6859828
## concave_points_mean 0.8225285 0.29346405 0.8509770 0.8232689
## symmetry_mean 0.1477412 0.07140098 0.1830272 0.1512931
## fractal_dimension_mean -0.3116308 -0.07643718 -0.2614769 -0.2831098
## smoothness_mean compactness_mean concavity_mean
## radius_mean 0.17058119 0.5061236 0.6767636
## texture_mean -0.02338852 0.2367022 0.3024178
## perimeter_mean 0.20727816 0.5569362 0.7161357
## area_mean 0.17702838 0.4985017 0.6859828
## smoothness_mean 1.00000000 0.6591232 0.5219838
## compactness_mean 0.65912322 1.0000000 0.8831207
## concavity_mean 0.52198377 0.8831207 1.0000000
## concave_points_mean 0.55369517 0.8311350 0.9213910
## symmetry_mean 0.55777479 0.6026410 0.5006666
## fractal_dimension_mean 0.58479200 0.5653687 0.3367834
## concave_points_mean symmetry_mean fractal_dimension_mean
## radius_mean 0.8225285 0.14774124 -0.31163083
## texture_mean 0.2934641 0.07140098 -0.07643718
## perimeter_mean 0.8509770 0.18302721 -0.26147691
## area_mean 0.8232689 0.15129308 -0.28310981
## smoothness_mean 0.5536952 0.55777479 0.58479200
## compactness_mean 0.8311350 0.60264105 0.56536866
## concavity_mean 0.9213910 0.50066662 0.33678336
## concave_points_mean 1.0000000 0.46249739 0.16691738
## symmetry_mean 0.4624974 1.00000000 0.47992133
## fractal_dimension_mean 0.1669174 0.47992133 1.00000000
library(corrplot)
## corrplot 0.95 loaded
corrplot(cor_mean,
method = "circle",
type = "lower",
tl.col = "black",
tl.cex = 0.8)
for just the se’s of each predictor
Logistic Regression
fit_log <- glm(Diagnosis ~ .,
data = wdbc_train,
family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit_log)
##
## Call:
## glm(formula = Diagnosis ~ ., family = binomial, data = wdbc_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 164.426 32237.326 0.005 0.996
## radius_mean -1818.787 495097.811 -0.004 0.997
## texture_mean 114.368 16695.906 0.007 0.995
## perimeter_mean 660.558 294904.859 0.002 0.998
## area_mean 729.018 252887.644 0.003 0.998
## smoothness_mean -18.118 10295.461 -0.002 0.999
## compactness_mean -350.487 29475.178 -0.012 0.991
## concavity_mean 172.462 50796.003 0.003 0.997
## concave_points_mean 413.978 61856.969 0.007 0.995
## symmetry_mean -28.075 7851.281 -0.004 0.997
## fractal_dimension_mean 9.596 21439.885 0.000 1.000
## radius_se 63.627 188417.693 0.000 1.000
## texture_se 87.928 7404.713 0.012 0.991
## perimeter_se 119.091 86913.618 0.001 0.999
## area_se -173.212 224417.259 -0.001 0.999
## smoothness_se -80.479 14229.172 -0.006 0.995
## compactness_se 228.015 51837.798 0.004 0.996
## concavity_se -201.618 102725.466 -0.002 0.998
## concave_points_se 47.028 22467.813 0.002 0.998
## symmetry_se -61.114 12337.303 -0.005 0.996
## fractal_dimension_se -247.702 28016.501 -0.009 0.993
## radius_worst 453.117 505855.074 0.001 0.999
## texture_worst -60.789 13930.301 -0.004 0.997
## perimeter_worst -149.276 251486.916 -0.001 1.000
## area_worst 548.190 380001.247 0.001 0.999
## smoothness_worst 26.950 24617.224 0.001 0.999
## compactness_worst -275.046 46786.449 -0.006 0.995
## concavity_worst 175.137 71684.498 0.002 0.998
## concave_points_worst 83.082 39612.407 0.002 0.998
## symmetry_worst 143.219 11053.504 0.013 0.990
## fractal_dimension_worst 219.791 31661.056 0.007 0.994
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5.2531e+02 on 397 degrees of freedom
## Residual deviance: 1.0723e-06 on 367 degrees of freedom
## AIC: 62
##
## Number of Fisher Scoring iterations: 25
The logistic regression model (without regularization) does not perform variable selection, so all 30 predictors are included in the model making it relatively difficult to interpret. None of the variables have a p-value less than 0.05, suggesting that none of them significantly contribute to the model.
prediction on test set
log_prob <- predict(fit_log,
newdata = wdbc_test,
type = "response")
#head(log_prob)
model eval
log_pred <- ifelse(log_prob > 0.5, "M", "B")
log_pred <- factor(log_pred, levels = c("B","M"))
#Confusion Matrix
cm_log <- confusionMatrix(log_pred, wdbc_test$Diagnosis, positive = "M")
cm_log
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 104 4
## M 3 60
##
## Accuracy : 0.9591
## 95% CI : (0.9175, 0.9834)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9123
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9375
## Specificity : 0.9720
## Pos Pred Value : 0.9524
## Neg Pred Value : 0.9630
## Prevalence : 0.3743
## Detection Rate : 0.3509
## Detection Prevalence : 0.3684
## Balanced Accuracy : 0.9547
##
## 'Positive' Class : M
##
roc_log <- roc(wdbc_test$Diagnosis, log_prob, levels = c("B","M"))
## Setting direction: controls < cases
plot(roc_log, main = "ROC Curve - Logistic Regression")
auc_log <- auc(roc_log)
auc_log
## Area under the curve: 0.9745
Although none of the predictors were signficant, the baseline logistic regression model performs well on the test set with a 96% accuracy and and AUC of 0.9745.
Logistic Regression
X_train <- as.matrix(wdbc_train[, -1])
y_train <- wdbc_train$Diagnosis
y_train_bin <- ifelse(y_train == "M", 1, 0)
# Test set
X_test <- as.matrix(wdbc_test[, -1])
y_test <- wdbc_test$Diagnosis
y_test_bin <- ifelse(y_test == "M", 1, 0)
#Cross Validation
set.seed(4630)
cv_lasso <- cv.glmnet(
X_train,
y_train_bin,
alpha = 1,
family = "binomial",
nfolds = 10,
type.measure = "deviance"
)
plot(cv_lasso)
minimum lambda
lambda_min <- cv_lasso$lambda.min # λ that gives minimum CV error
lambda_1se <- cv_lasso$lambda.1se # more conservative (simpler model)
lambda_min
## [1] 0.004844294
lambda_1se
## [1] 0.01781915
The minimum lamda for LASSO as determined by 10-fold cross validation is 0.003, which is what we will use for the model on the training set.
LASSO coefficients
# Extract coefficients at lambda_min (your object)
coef_lasso <- coef(cv_lasso, s = lambda_min)
# Convert from sparse matrix to a regular matrix
coef_matrix <- as.matrix(coef_lasso)
# Pull only the non-zero coefficients
nonzero_coefs <- coef_matrix[coef_matrix[,1] != 0, , drop = FALSE]
nonzero_coefs
## s=0.004844294
## (Intercept) -0.7335676
## concave_points_mean 0.4832564
## radius_se 0.5834270
## radius_worst 4.1295741
## texture_worst 1.2001398
## smoothness_worst 0.7689339
## concavity_worst 0.2805019
## concave_points_worst 1.0198362
## symmetry_worst 0.1355000
Interpretation: Using the lambda selected by 10-fold CV, the LASSO model produced 12 predictors. This model is easier to interpret because LASSO performs variable selection, and since the predictors are standardized, we can compare the values to each other. The LASSO model predicted the standard error and the most extreme value of nuclei radius, the extreme values of nuclei texture, area, and concave points to be most predictive of malignancy in breast cancer cells. The most important predictors determined by the model also have positive values for their coefficients, indicating and increase in these values increases the probability of a cell being malignant. While the most extreme value for texture was an important indicator with a value of 1.405, the standard error of texture was retained as a predictor in the lasso model with a negative coefficient of -0.169, indicating that worse values of texture increase likelihood of malignancy whereas the model decreases the prediction probability of malignancy when there’s more variation between texture values in the data set. Out of the ten predictors that recorded the most extreme measurements of each feature, seven were retained suggesting that the extreme values provided the most diagnostic information compared to the mean and se values.
-how do i interpret the coefficient of the feature se’s in terms of how it effects a single nuclei’s likelihood of being malignant (considering that individual nuclei won’t have an se to plug into the equation)?
model eval
lasso_prob <- predict(cv_lasso, newx = X_test, s = lambda_min, type = "response")
lasso_pred <- ifelse(lasso_prob > 0.5, "M", "B")
lasso_pred <- factor(lasso_pred, levels = c("B","M"))
cm_lasso <- confusionMatrix(lasso_pred, y_test, positive = "M")
cm_lasso
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 105 4
## M 2 60
##
## Accuracy : 0.9649
## 95% CI : (0.9252, 0.987)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9246
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9375
## Specificity : 0.9813
## Pos Pred Value : 0.9677
## Neg Pred Value : 0.9633
## Prevalence : 0.3743
## Detection Rate : 0.3509
## Detection Prevalence : 0.3626
## Balanced Accuracy : 0.9594
##
## 'Positive' Class : M
##
library(pROC)
roc_lasso <- roc(y_test, as.numeric(lasso_prob))
## Setting levels: control = B, case = M
## Setting direction: controls < cases
plot(roc_lasso, main = "ROC Curve - LASSO Logistic Regression")
auc_lasso <- auc(roc_lasso)
auc_lasso
## Area under the curve: 0.9945
Both the accuracy and the AUC improved for the LASSO model compared to the basline logistic regression model.
KNN
X <- as.matrix(wdbc_train[, -1])
y <- ifelse(wdbc_train$Diagnosis == "M", 1, 0)
n <- nrow(X)
Kfold <- 10
set.seed(4630)
fold_id <- sample(rep(1:Kfold, length.out = n))
k_grid <- seq(1, 25, by = 2)
cv_auc <- numeric(length(k_grid))
for (i in seq_along(k_grid)) {
k <- k_grid[i]
fold_auc <- numeric(Kfold)
for (f in 1:Kfold) {
val_idx <- which(fold_id == f)
train_idx <- which(fold_id != f)
X_tr <- X[train_idx, , drop = FALSE]
y_tr <- y[train_idx]
X_val <- X[val_idx, , drop = FALSE]
y_val <- y[val_idx]
# kNN with probability output
pred_f <- knn(
train = X_tr,
test = X_val,
cl = factor(y_tr, levels = c(0,1), labels = c("B","M")),
k = k,
prob = TRUE
)
# proportion of votes for predicted class
p_win <- attr(pred_f, "prob")
# convert to P(M)
p_hat_M <- ifelse(pred_f == "M", p_win, 1 - p_win)
# AUC for this fold (M is positive class)
roc_f <- roc(
response = factor(y_val, levels = c(0,1), labels = c("B","M")),
predictor = p_hat_M,
levels = c("B","M"),
quiet = TRUE
)
fold_auc[f] <- auc(roc_f)
}
cv_auc[i] <- mean(fold_auc)
}
data.frame(k = k_grid, CV_AUC = cv_auc)
## k CV_AUC
## 1 1 0.9482415
## 2 3 0.9805070
## 3 5 0.9820192
## 4 7 0.9886477
## 5 9 0.9879004
## 6 11 0.9868566
## 7 13 0.9862328
## 8 15 0.9853127
## 9 17 0.9856022
## 10 19 0.9848013
## 11 21 0.9857586
## 12 23 0.9864944
## 13 25 0.9860223
plot(k_grid, cv_auc, type = "b",
xlab = "k (Number of Neighbors)",
ylab = "10-fold CV AUC",
main = "KNN Model Selection Using 10-fold CV (AUC)")
abline(v = k_grid[which.max(cv_auc)], lty = 2)
model
best_k <- k_grid[which.max(cv_auc)]
best_k
## [1] 7
knn_best <- knn(train = X_train,
test = X_test,
cl = y_train,
k = best_k)
knn_best
## [1] M M M M M M B M M M M B M M M B B M M M B M B M M M M B B M B M B B B B B
## [38] B B B B B M B M B B M B B M B M B M B B M B B B M M M M B B M B M B M B M
## [75] B B B B M M B M M M B M B B B B B B B B M M B B B M B B B B B B B M B B B
## [112] B B B B B B B B B B B B B B B M B M M M B B B B B B B B M M M B B B M B B
## [149] B B M B M M B B B B B M B B B B B B B B B M B
## Levels: B M
Based on 10-fold cross validation, the k value with the highest AUC is 7 so we will run our KNN model on the training set using k = 7.
model eval
mean(knn_best == y_test)
## [1] 0.9824561
cm_knn <- confusionMatrix(knn_best, y_test, positive = "M")
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 107 3
## M 0 61
##
## Accuracy : 0.9825
## 95% CI : (0.9496, 0.9964)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9622
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 0.9531
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9727
## Prevalence : 0.3743
## Detection Rate : 0.3567
## Detection Prevalence : 0.3567
## Balanced Accuracy : 0.9766
##
## 'Positive' Class : M
##
ROC and AUC (based on vote proportions?)
set.seed(4630)
knn_pred <- knn(
train = X_train,
test = X_test,
cl = y_train,
k = best_k, # or any chosen k
prob = TRUE
)
# predicted class labels
head(knn_pred)
## [1] M M M M M M
## Levels: B M
# proportion of votes for the predicted (winning) class
p_hat_win <- attr(knn_pred, "prob")
# convert to probability of "M"
# if predicted "M", use p_hat_win; if predicted "B", use 1 - p_hat_win
p_hat_M <- ifelse(knn_pred == "M", p_hat_win, 1 - p_hat_win)
head(p_hat_M)
## [1] 1.0000000 0.8571429 1.0000000 0.5714286 0.7142857 1.0000000
# y_test should be factor with levels c("B", "M")
roc_knn <- roc(y_test, p_hat_M, levels = c("B","M"))
## Setting direction: controls < cases
plot(roc_knn, main = "ROC Curve - KNN")
auc_knn <- auc(roc_knn)
Simple Decision Tree:
wdbc_train_raw <- wdbc[train_idx, ]
wdbc_test_raw <- wdbc[-train_idx, ]
table(wdbc_train_raw$Diagnosis)
##
## B M
## 202 157
table(wdbc_test_raw$Diagnosis)
##
## B M
## 155 55
set.seed(4630)
tree_bc <- tree(Diagnosis ~ ., data = wdbc_train_raw)
summary(tree_bc)
##
## Classification tree:
## tree(formula = Diagnosis ~ ., data = wdbc_train_raw)
## Variables actually used in tree construction:
## [1] "perimeter_worst" "concave_points_worst" "radius_se"
## [4] "texture_worst" "smoothness_worst"
## Number of terminal nodes: 8
## Residual mean deviance: 0.09148 = 32.11 / 351
## Misclassification error rate: 0.02228 = 8 / 359
plot(tree_bc)
text(tree_bc, pretty = 0)
model eval (unpruned tree)
tree_unpruned_class <- predict(tree_bc,
newdata = wdbc_test_raw,
type = "class")
cm_tree_unpruned <- confusionMatrix(tree_unpruned_class,
wdbc_test_raw$Diagnosis,
positive = "M")
cm_tree_unpruned
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 137 0
## M 18 55
##
## Accuracy : 0.9143
## 95% CI : (0.8679, 0.9484)
## No Information Rate : 0.7381
## P-Value [Acc > NIR] : 1.003e-10
##
## Kappa : 0.7995
##
## Mcnemar's Test P-Value : 6.151e-05
##
## Sensitivity : 1.0000
## Specificity : 0.8839
## Pos Pred Value : 0.7534
## Neg Pred Value : 1.0000
## Prevalence : 0.2619
## Detection Rate : 0.2619
## Detection Prevalence : 0.3476
## Balanced Accuracy : 0.9419
##
## 'Positive' Class : M
##
# Probabilities for M
# Probabilities matrix (columns: B, M)
tree_unpruned_probs <- predict(tree_bc,
newdata = wdbc_test_raw,
type = "vector")
# Extract probability for M
tree_unpruned_probM <- tree_unpruned_probs[, "M"]
roc_tree_unpruned <- roc(wdbc_test_raw$Diagnosis,
tree_unpruned_probM,
levels = c("B", "M"))
## Setting direction: controls < cases
plot(roc_tree_unpruned,
main = "ROC Curve – Unpruned Decision Tree")
auc_tree_unpruned <- auc(roc_tree_unpruned)
auc_tree_unpruned
## Area under the curve: 0.9864
cv for tree size
cv_bc <- cv.tree(tree_bc, FUN = prune.misclass)
plot(cv_bc$size, cv_bc$dev,
type = "b",
xlab = "Tree Size (Terminal Nodes)",
ylab = "CV Misclassification Error",
main = "CV Error vs Tree Size")
best_size <- cv_bc$size[which.min(cv_bc$dev)]
best_size
## [1] 8
pruned_bc <- prune.misclass(tree_bc, best = best_size)
plot(pruned_bc)
text(pruned_bc, pretty = 0)
model eval (pruned tree)
tree_pred <- predict(pruned_bc, newdata = wdbc_test_raw, type = "class")
cm_tree_pruned <- confusionMatrix(tree_pred,
wdbc_test_raw$Diagnosis,
positive = "M")
cm_tree_pruned
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 137 0
## M 18 55
##
## Accuracy : 0.9143
## 95% CI : (0.8679, 0.9484)
## No Information Rate : 0.7381
## P-Value [Acc > NIR] : 1.003e-10
##
## Kappa : 0.7995
##
## Mcnemar's Test P-Value : 6.151e-05
##
## Sensitivity : 1.0000
## Specificity : 0.8839
## Pos Pred Value : 0.7534
## Neg Pred Value : 1.0000
## Prevalence : 0.2619
## Detection Rate : 0.2619
## Detection Prevalence : 0.3476
## Balanced Accuracy : 0.9419
##
## 'Positive' Class : M
##
tree_prob <- predict(pruned_bc, newdata = wdbc_test_raw)[,"M"]
roc_tree <- roc(wdbc_test_raw$Diagnosis, tree_prob, levels = c("B","M"))
## Setting direction: controls < cases
plot(roc_tree, main = "ROC Curve — Decision Tree")
auc_tree_pruned <- auc(roc_tree)
auc_tree_pruned
## Area under the curve: 0.9864
While the AUC is slightly less for the pruned tree, it is more interpretable and has a higher accuracy than the unpruned tree because it is more generalizable.
Random Forest cross validation for mtry
# Make sure M is the positive (first) level for caret
wdbc_train_raw$Diagnosis <- relevel(wdbc_train_raw$Diagnosis, ref = "M")
wdbc_test_raw$Diagnosis <- relevel(wdbc_test_raw$Diagnosis, ref = "M")
ctrl_rf <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary, # returns ROC, Sens, Spec
savePredictions = "final"
)
set.seed(4630)
rf_tuned <- train(
Diagnosis ~ .,
data = wdbc_train_raw,
method = "rf",
metric = "ROC", # choose hyperparameters by CV AUC
trControl = ctrl_rf,
tuneGrid = data.frame(
mtry = c(4, 8, 12, 16) # you can tweak this grid
),
ntree = 500
)
rf_tuned
## Random Forest
##
## 359 samples
## 30 predictor
## 2 classes: 'M', 'B'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 286, 287, 287, 288, 288
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 4 0.9900880 0.9679435 0.955122
## 8 0.9892216 0.9616935 0.945122
## 12 0.9896001 0.9679435 0.945122
## 16 0.9880231 0.9616935 0.940122
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
rf_tuned$bestTune
## mtry
## 1 4
# Variable importance (caret style)
varImp(rf_tuned)
## rf variable importance
##
## only 20 most important variables shown (out of 30)
##
## Overall
## perimeter_worst 100.000
## radius_worst 97.897
## concave_points_worst 90.520
## area_worst 87.995
## concave_points_mean 74.235
## concavity_worst 44.606
## concavity_mean 39.463
## area_mean 39.409
## perimeter_mean 36.987
## area_se 30.868
## radius_mean 28.284
## texture_worst 22.521
## texture_mean 12.998
## radius_se 10.585
## smoothness_worst 10.260
## compactness_mean 8.641
## compactness_worst 7.158
## perimeter_se 5.884
## symmetry_worst 3.756
## concavity_se 3.415
# Test set predictions
rf_pred_class <- predict(rf_tuned, newdata = wdbc_test_raw)
cm_rf <- confusionMatrix(rf_pred_class,
wdbc_test_raw$Diagnosis,
positive = "M")
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction M B
## M 54 10
## B 1 145
##
## Accuracy : 0.9476
## 95% CI : (0.9082, 0.9736)
## No Information Rate : 0.7381
## P-Value [Acc > NIR] : 1.792e-15
##
## Kappa : 0.8713
##
## Mcnemar's Test P-Value : 0.01586
##
## Sensitivity : 0.9818
## Specificity : 0.9355
## Pos Pred Value : 0.8437
## Neg Pred Value : 0.9932
## Prevalence : 0.2619
## Detection Rate : 0.2571
## Detection Prevalence : 0.3048
## Balanced Accuracy : 0.9587
##
## 'Positive' Class : M
##
# Probabilities for M for ROC/AUC
rf_prob <- predict(rf_tuned, newdata = wdbc_test_raw, type = "prob")[, "M"]
roc_rf <- roc(wdbc_test_raw$Diagnosis, rf_prob, levels = c("B","M"))
## Setting direction: controls < cases
plot(roc_rf, main = "ROC Curve – Random Forest (tuned mtry)")
auc_rf <- auc(roc_rf)
auc_rf
## Area under the curve: 0.9982
Boosting (GBM)
ctrl_gbm <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
grid_gbm <- expand.grid(
interaction.depth = c(1, 3, 5), # tree depth
n.trees = c(1000, 2000, 3000), # number of trees
shrinkage = c(0.01, 0.05), # learning rate
n.minobsinnode = 10
)
set.seed(4630)
gbm_tuned <- train(
Diagnosis ~ .,
data = wdbc_train_raw,
method = "gbm",
distribution = "bernoulli",
trControl = ctrl_gbm,
tuneGrid = grid_gbm,
metric = "ROC",
verbose = FALSE
)
gbm_tuned
## Stochastic Gradient Boosting
##
## 359 samples
## 30 predictor
## 2 classes: 'M', 'B'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 286, 287, 287, 288, 288
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees ROC Sens Spec
## 0.01 1 1000 0.9936095 0.9425403 0.9650000
## 0.01 1 2000 0.9950688 0.9554435 0.9651220
## 0.01 1 3000 0.9960354 0.9554435 0.9801220
## 0.01 3 1000 0.9926694 0.9489919 0.9651220
## 0.01 3 2000 0.9931622 0.9554435 0.9503659
## 0.01 3 3000 0.9936472 0.9554435 0.9553659
## 0.01 5 1000 0.9931550 0.9552419 0.9452439
## 0.01 5 2000 0.9936300 0.9487903 0.9552439
## 0.01 5 3000 0.9936491 0.9487903 0.9652439
## 0.05 1 1000 0.9960303 0.9618952 0.9702439
## 0.05 1 2000 0.9961949 0.9556452 0.9751220
## 0.05 1 3000 0.9960297 0.9556452 0.9701220
## 0.05 3 1000 0.9945858 0.9616935 0.9553659
## 0.05 3 2000 0.9944386 0.9616935 0.9652439
## 0.05 3 3000 0.9942784 0.9616935 0.9652439
## 0.05 5 1000 0.9929899 0.9616935 0.9652439
## 0.05 5 2000 0.9926763 0.9616935 0.9602439
## 0.05 5 3000 0.9914066 0.9552419 0.9750000
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 2000, interaction.depth =
## 1, shrinkage = 0.05 and n.minobsinnode = 10.
gbm_tuned$bestTune
## n.trees interaction.depth shrinkage n.minobsinnode
## 11 2000 1 0.05 10
# Probabilities for M
boost_prob <- predict(gbm_tuned,
newdata = wdbc_test_raw,
type = "prob")[, "M"]
boost_pred_class <- ifelse(boost_prob > 0.5, "M", "B")
boost_pred_class <- factor(boost_pred_class, levels = c("B","M"))
cm_boost <- confusionMatrix(boost_pred_class,
wdbc_test_raw$Diagnosis,
positive = "M")
## Warning in confusionMatrix.default(boost_pred_class, wdbc_test_raw$Diagnosis, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
cm_boost
## Confusion Matrix and Statistics
##
## Reference
## Prediction M B
## M 54 10
## B 1 145
##
## Accuracy : 0.9476
## 95% CI : (0.9082, 0.9736)
## No Information Rate : 0.7381
## P-Value [Acc > NIR] : 1.792e-15
##
## Kappa : 0.8713
##
## Mcnemar's Test P-Value : 0.01586
##
## Sensitivity : 0.9818
## Specificity : 0.9355
## Pos Pred Value : 0.8437
## Neg Pred Value : 0.9932
## Prevalence : 0.2619
## Detection Rate : 0.2571
## Detection Prevalence : 0.3048
## Balanced Accuracy : 0.9587
##
## 'Positive' Class : M
##
roc_boost <- roc(wdbc_test_raw$Diagnosis, boost_prob, levels = c("B","M"))
## Setting direction: controls < cases
plot(roc_boost, main = "ROC Curve – Boosting (tuned)")
auc_boost <- auc(roc_boost)
auc_boost
## Area under the curve: 0.9972
Model comparisons:
model_results <- tibble(
Model = c(
"Logistic (no reg)",
"LASSO Logistic",
"kNN",
"Tree (Unpruned)",
"Tree (Pruned)",
"Random Forest",
"Boosting (GBM)"
),
Accuracy = c(
cm_log$overall["Accuracy"],
cm_lasso$overall["Accuracy"],
cm_knn$overall["Accuracy"],
cm_tree_unpruned$overall["Accuracy"],
cm_tree_pruned$overall["Accuracy"],
cm_rf$overall["Accuracy"],
cm_boost$overall["Accuracy"]
),
Sensitivity = c(
cm_log$byClass["Sensitivity"],
cm_lasso$byClass["Sensitivity"],
cm_knn$byClass["Sensitivity"],
cm_tree_unpruned$byClass["Sensitivity"],
cm_tree_pruned$byClass["Sensitivity"],
cm_rf$byClass["Sensitivity"],
cm_boost$byClass["Sensitivity"]
),
Specificity = c(
cm_log$byClass["Specificity"],
cm_lasso$byClass["Specificity"],
cm_knn$byClass["Specificity"],
cm_tree_unpruned$byClass["Specificity"],
cm_tree_pruned$byClass["Specificity"],
cm_rf$byClass["Specificity"],
cm_boost$byClass["Specificity"]
),
AUC = c(
as.numeric(auc_log),
as.numeric(auc_lasso),
as.numeric(auc_knn),
as.numeric(auc_tree_unpruned),
as.numeric(auc_tree_pruned),
as.numeric(auc_rf),
as.numeric(auc_boost)
)
)
model_results
## # A tibble: 7 × 5
## Model Accuracy Sensitivity Specificity AUC
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Logistic (no reg) 0.959 0.938 0.972 0.975
## 2 LASSO Logistic 0.965 0.938 0.981 0.994
## 3 kNN 0.982 0.953 1 0.997
## 4 Tree (Unpruned) 0.914 1 0.884 0.986
## 5 Tree (Pruned) 0.914 1 0.884 0.986
## 6 Random Forest 0.948 0.982 0.935 0.998
## 7 Boosting (GBM) 0.948 0.982 0.935 0.997