Machine Learning Project

libraries

library(ggplot2)
library(caret)

## Loading required package: lattice

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-10

library(class)
library(tree)
library(rpart)
library(rpart.plot)
library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(gbm)

## Loaded gbm 2.2.2

## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tibble)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()       masks randomForest::combine()
## ✖ tidyr::expand()        masks Matrix::expand()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ tidyr::pack()          masks Matrix::pack()
## ✖ tidyr::unpack()        masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

wdbc<- read.csv("wdbc copy.csv", header = FALSE)

colnames(wdbc) <- c(
  "ID",
  "Diagnosis",
  "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
  "compactness_mean", "concavity_mean", "concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
  
  "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
  "compactness_se", "concavity_se", "concave_points_se", "symmetry_se", "fractal_dimension_se",

  "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
  "compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst", "fractal_dimension_worst"
)

head(wdbc)

##         ID Diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1   842302         M       17.99        10.38         122.80    1001.0
## 2   842517         M       20.57        17.77         132.90    1326.0
## 3 84300903         M       19.69        21.25         130.00    1203.0
## 4 84348301         M       11.42        20.38          77.58     386.1
## 5 84358402         M       20.29        14.34         135.10    1297.0
## 6   843786         M       12.45        15.70          82.57     477.1
##   smoothness_mean compactness_mean concavity_mean concave_points_mean
## 1         0.11840          0.27760         0.3001             0.14710
## 2         0.08474          0.07864         0.0869             0.07017
## 3         0.10960          0.15990         0.1974             0.12790
## 4         0.14250          0.28390         0.2414             0.10520
## 5         0.10030          0.13280         0.1980             0.10430
## 6         0.12780          0.17000         0.1578             0.08089
##   symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1        0.2419                0.07871    1.0950     0.9053        8.589
## 2        0.1812                0.05667    0.5435     0.7339        3.398
## 3        0.2069                0.05999    0.7456     0.7869        4.585
## 4        0.2597                0.09744    0.4956     1.1560        3.445
## 5        0.1809                0.05883    0.7572     0.7813        5.438
## 6        0.2087                0.07613    0.3345     0.8902        2.217
##   area_se smoothness_se compactness_se concavity_se concave_points_se
## 1  153.40      0.006399        0.04904      0.05373           0.01587
## 2   74.08      0.005225        0.01308      0.01860           0.01340
## 3   94.03      0.006150        0.04006      0.03832           0.02058
## 4   27.23      0.009110        0.07458      0.05661           0.01867
## 5   94.44      0.011490        0.02461      0.05688           0.01885
## 6   27.19      0.007510        0.03345      0.03672           0.01137
##   symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1     0.03003             0.006193        25.38         17.33          184.60
## 2     0.01389             0.003532        24.99         23.41          158.80
## 3     0.02250             0.004571        23.57         25.53          152.50
## 4     0.05963             0.009208        14.91         26.50           98.87
## 5     0.01756             0.005115        22.54         16.67          152.20
## 6     0.02165             0.005082        15.47         23.75          103.40
##   area_worst smoothness_worst compactness_worst concavity_worst
## 1     2019.0           0.1622            0.6656          0.7119
## 2     1956.0           0.1238            0.1866          0.2416
## 3     1709.0           0.1444            0.4245          0.4504
## 4      567.7           0.2098            0.8663          0.6869
## 5     1575.0           0.1374            0.2050          0.4000
## 6      741.6           0.1791            0.5249          0.5355
##   concave_points_worst symmetry_worst fractal_dimension_worst
## 1               0.2654         0.4601                 0.11890
## 2               0.1860         0.2750                 0.08902
## 3               0.2430         0.3613                 0.08758
## 4               0.2575         0.6638                 0.17300
## 5               0.1625         0.2364                 0.07678
## 6               0.1741         0.3985                 0.12440

clean data

wdbc$ID <- NULL
wdbc$Diagnosis <- factor(wdbc$Diagnosis, levels = c("B","M"))
summary(wdbc$Diagnosis)

##   B   M 
## 357 212

head(wdbc)

##   Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1         M       17.99        10.38         122.80    1001.0         0.11840
## 2         M       20.57        17.77         132.90    1326.0         0.08474
## 3         M       19.69        21.25         130.00    1203.0         0.10960
## 4         M       11.42        20.38          77.58     386.1         0.14250
## 5         M       20.29        14.34         135.10    1297.0         0.10030
## 6         M       12.45        15.70          82.57     477.1         0.12780
##   compactness_mean concavity_mean concave_points_mean symmetry_mean
## 1          0.27760         0.3001             0.14710        0.2419
## 2          0.07864         0.0869             0.07017        0.1812
## 3          0.15990         0.1974             0.12790        0.2069
## 4          0.28390         0.2414             0.10520        0.2597
## 5          0.13280         0.1980             0.10430        0.1809
## 6          0.17000         0.1578             0.08089        0.2087
##   fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1                0.07871    1.0950     0.9053        8.589  153.40
## 2                0.05667    0.5435     0.7339        3.398   74.08
## 3                0.05999    0.7456     0.7869        4.585   94.03
## 4                0.09744    0.4956     1.1560        3.445   27.23
## 5                0.05883    0.7572     0.7813        5.438   94.44
## 6                0.07613    0.3345     0.8902        2.217   27.19
##   smoothness_se compactness_se concavity_se concave_points_se symmetry_se
## 1      0.006399        0.04904      0.05373           0.01587     0.03003
## 2      0.005225        0.01308      0.01860           0.01340     0.01389
## 3      0.006150        0.04006      0.03832           0.02058     0.02250
## 4      0.009110        0.07458      0.05661           0.01867     0.05963
## 5      0.011490        0.02461      0.05688           0.01885     0.01756
## 6      0.007510        0.03345      0.03672           0.01137     0.02165
##   fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1             0.006193        25.38         17.33          184.60     2019.0
## 2             0.003532        24.99         23.41          158.80     1956.0
## 3             0.004571        23.57         25.53          152.50     1709.0
## 4             0.009208        14.91         26.50           98.87      567.7
## 5             0.005115        22.54         16.67          152.20     1575.0
## 6             0.005082        15.47         23.75          103.40      741.6
##   smoothness_worst compactness_worst concavity_worst concave_points_worst
## 1           0.1622            0.6656          0.7119               0.2654
## 2           0.1238            0.1866          0.2416               0.1860
## 3           0.1444            0.4245          0.4504               0.2430
## 4           0.2098            0.8663          0.6869               0.2575
## 5           0.1374            0.2050          0.4000               0.1625
## 6           0.1791            0.5249          0.5355               0.1741
##   symmetry_worst fractal_dimension_worst
## 1         0.4601                 0.11890
## 2         0.2750                 0.08902
## 3         0.3613                 0.08758
## 4         0.6638                 0.17300
## 5         0.2364                 0.07678
## 6         0.3985                 0.12440

standardize predictors

predictor_names <- colnames(wdbc)[-1]   

wdbc_scaled <- wdbc
wdbc_scaled[, predictor_names] <- scale(wdbc[, predictor_names])

head(wdbc_scaled)

##   Diagnosis radius_mean texture_mean perimeter_mean  area_mean smoothness_mean
## 1         M   1.0960995   -2.0715123      1.2688173  0.9835095       1.5670875
## 2         M   1.8282120   -0.3533215      1.6844726  1.9070303      -0.8262354
## 3         M   1.5784992    0.4557859      1.5651260  1.5575132       0.9413821
## 4         M  -0.7682333    0.2535091     -0.5921661 -0.7637917       3.2806668
## 5         M   1.7487579   -1.1508038      1.7750113  1.8246238       0.2801253
## 6         M  -0.4759559   -0.8346009     -0.3868077 -0.5052059       2.2354545
##   compactness_mean concavity_mean concave_points_mean symmetry_mean
## 1        3.2806281     2.65054179           2.5302489   2.215565542
## 2       -0.4866435    -0.02382489           0.5476623   0.001391139
## 3        1.0519999     1.36227979           2.0354398   0.938858720
## 4        3.3999174     1.91421287           1.4504311   2.864862154
## 5        0.5388663     1.36980615           1.4272370  -0.009552062
## 6        1.2432416     0.86554001           0.8239307   1.004517928
##   fractal_dimension_mean  radius_se texture_se perimeter_se    area_se
## 1              2.2537638  2.4875451 -0.5647681    2.8305403  2.4853907
## 2             -0.8678888  0.4988157 -0.8754733    0.2630955  0.7417493
## 3             -0.3976580  1.2275958 -0.7793976    0.8501802  1.1802975
## 4              4.9066020  0.3260865 -0.1103120    0.2863415 -0.2881246
## 5             -0.5619555  1.2694258 -0.7895490    1.2720701  1.1893103
## 6              1.8883435 -0.2548461 -0.5921406   -0.3210217 -0.2890039
##   smoothness_se compactness_se concavity_se concave_points_se symmetry_se
## 1    -0.2138135     1.31570389    0.7233897        0.66023900   1.1477468
## 2    -0.6048187    -0.69231710   -0.4403926        0.25993335  -0.8047423
## 3    -0.2967439     0.81425704    0.2128891        1.42357487   0.2368272
## 4     0.6890953     2.74186785    0.8187979        1.11402678   4.7285198
## 5     1.4817634    -0.04847723    0.8277425        1.14319885  -0.3607748
## 6     0.1562093     0.44515196    0.1598845       -0.06906279   0.1340009
##   fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1           0.90628565    1.8850310   -1.35809849       2.3015755  1.9994782
## 2          -0.09935632    1.8043398   -0.36887865       1.5337764  1.8888270
## 3           0.29330133    1.5105411   -0.02395331       1.3462906  1.4550043
## 4           2.04571087   -0.2812170    0.13386631      -0.2497196 -0.5495377
## 5           0.49888916    1.2974336   -1.46548091       1.3373627  1.2196511
## 6           0.48641784   -0.1653528   -0.31356043      -0.1149083 -0.2441054
##   smoothness_worst compactness_worst concavity_worst concave_points_worst
## 1        1.3065367         2.6143647       2.1076718            2.2940576
## 2       -0.3752817        -0.4300658      -0.1466200            1.0861286
## 3        0.5269438         1.0819801       0.8542223            1.9532817
## 4        3.3912907         3.8899747       1.9878392            2.1738732
## 5        0.2203623        -0.3131190       0.6126397            0.7286181
## 6        2.0467119         1.7201029       1.2621327            0.9050914
##   symmetry_worst fractal_dimension_worst
## 1      2.7482041               1.9353117
## 2     -0.2436753               0.2809428
## 3      1.1512420               0.2012142
## 4      6.0407261               4.9306719
## 5     -0.8675896              -0.3967505
## 6      1.7525273               2.2398308

train/test split

set.seed(4630)
n <- nrow(wdbc_scaled)

train_idx <- sample(1:n, size = floor(0.7 * n))

wdbc_train <- wdbc_scaled[train_idx, ]
wdbc_test  <- wdbc_scaled[-train_idx, ]

table(wdbc_train$Diagnosis)

## 
##   B   M 
## 250 148

table(wdbc_test$Diagnosis)

## 
##   B   M 
## 107  64

EDA

PCA plot

# 1. Create matrix of predictors
X_pca <- wdbc %>%
  dplyr::select(-Diagnosis) %>%   # keep only numeric columns
  as.matrix()

# 2. Run PCA on standardized predictors
wdbc_pca <- prcomp(X_pca, center = TRUE, scale. = TRUE)

# 3. PCA biplot: points = observations, arrows = variables
fviz_pca_biplot(
  wdbc_pca,
  geom        = "point",
  habillage   = wdbc$Diagnosis,   # color by diagnosis
  addEllipses = TRUE,             # optional: class ellipses
  label       = "var",            # show variable names as arrows
  col.var     = "black",
  alpha.ind   = 0.6
) +
  theme_minimal() +
  labs(
    title = "PCA Biplot of WDBC Predictors",
    color = "Diagnosis"
  )

class imbalance

ggplot(wdbc, aes(x = Diagnosis)) +
  geom_bar() +
  labs(title = "Class Distribution", y = "Count")

The dataset contains more observations in the benign category than the malignant, which could potentially lead to problems in classification where the model is better at recognizing cases of benign but not as good at recognizing malignant. This could potentially lead to higher specificity than sensitivity of the models.

summary stats

summary(wdbc[, predictor_names])

##   radius_mean      texture_mean   perimeter_mean     area_mean     
##  Min.   : 6.981   Min.   : 9.71   Min.   : 43.79   Min.   : 143.5  
##  1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17   1st Qu.: 420.3  
##  Median :13.370   Median :18.84   Median : 86.24   Median : 551.1  
##  Mean   :14.127   Mean   :19.29   Mean   : 91.97   Mean   : 654.9  
##  3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10   3rd Qu.: 782.7  
##  Max.   :28.110   Max.   :39.28   Max.   :188.50   Max.   :2501.0  
##  smoothness_mean   compactness_mean  concavity_mean    concave_points_mean
##  Min.   :0.05263   Min.   :0.01938   Min.   :0.00000   Min.   :0.00000    
##  1st Qu.:0.08637   1st Qu.:0.06492   1st Qu.:0.02956   1st Qu.:0.02031    
##  Median :0.09587   Median :0.09263   Median :0.06154   Median :0.03350    
##  Mean   :0.09636   Mean   :0.10434   Mean   :0.08880   Mean   :0.04892    
##  3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.13070   3rd Qu.:0.07400    
##  Max.   :0.16340   Max.   :0.34540   Max.   :0.42680   Max.   :0.20120    
##  symmetry_mean    fractal_dimension_mean   radius_se        texture_se    
##  Min.   :0.1060   Min.   :0.04996        Min.   :0.1115   Min.   :0.3602  
##  1st Qu.:0.1619   1st Qu.:0.05770        1st Qu.:0.2324   1st Qu.:0.8339  
##  Median :0.1792   Median :0.06154        Median :0.3242   Median :1.1080  
##  Mean   :0.1812   Mean   :0.06280        Mean   :0.4052   Mean   :1.2169  
##  3rd Qu.:0.1957   3rd Qu.:0.06612        3rd Qu.:0.4789   3rd Qu.:1.4740  
##  Max.   :0.3040   Max.   :0.09744        Max.   :2.8730   Max.   :4.8850  
##   perimeter_se       area_se        smoothness_se      compactness_se    
##  Min.   : 0.757   Min.   :  6.802   Min.   :0.001713   Min.   :0.002252  
##  1st Qu.: 1.606   1st Qu.: 17.850   1st Qu.:0.005169   1st Qu.:0.013080  
##  Median : 2.287   Median : 24.530   Median :0.006380   Median :0.020450  
##  Mean   : 2.866   Mean   : 40.337   Mean   :0.007041   Mean   :0.025478  
##  3rd Qu.: 3.357   3rd Qu.: 45.190   3rd Qu.:0.008146   3rd Qu.:0.032450  
##  Max.   :21.980   Max.   :542.200   Max.   :0.031130   Max.   :0.135400  
##   concavity_se     concave_points_se   symmetry_se       fractal_dimension_se
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.007882   Min.   :0.0008948   
##  1st Qu.:0.01509   1st Qu.:0.007638   1st Qu.:0.015160   1st Qu.:0.0022480   
##  Median :0.02589   Median :0.010930   Median :0.018730   Median :0.0031870   
##  Mean   :0.03189   Mean   :0.011796   Mean   :0.020542   Mean   :0.0037949   
##  3rd Qu.:0.04205   3rd Qu.:0.014710   3rd Qu.:0.023480   3rd Qu.:0.0045580   
##  Max.   :0.39600   Max.   :0.052790   Max.   :0.078950   Max.   :0.0298400   
##   radius_worst   texture_worst   perimeter_worst    area_worst    
##  Min.   : 7.93   Min.   :12.02   Min.   : 50.41   Min.   : 185.2  
##  1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11   1st Qu.: 515.3  
##  Median :14.97   Median :25.41   Median : 97.66   Median : 686.5  
##  Mean   :16.27   Mean   :25.68   Mean   :107.26   Mean   : 880.6  
##  3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40   3rd Qu.:1084.0  
##  Max.   :36.04   Max.   :49.54   Max.   :251.20   Max.   :4254.0  
##  smoothness_worst  compactness_worst concavity_worst  concave_points_worst
##  Min.   :0.07117   Min.   :0.02729   Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145   1st Qu.:0.06493     
##  Median :0.13130   Median :0.21190   Median :0.2267   Median :0.09993     
##  Mean   :0.13237   Mean   :0.25427   Mean   :0.2722   Mean   :0.11461     
##  3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829   3rd Qu.:0.16140     
##  Max.   :0.22260   Max.   :1.05800   Max.   :1.2520   Max.   :0.29100     
##  symmetry_worst   fractal_dimension_worst
##  Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.2822   Median :0.08004        
##  Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :0.6638   Max.   :0.20750

correlation plots (for the means of each feature)

mean_vars <- grep("_mean$", names(wdbc_scaled), value = TRUE)
mean_vars

##  [1] "radius_mean"            "texture_mean"           "perimeter_mean"        
##  [4] "area_mean"              "smoothness_mean"        "compactness_mean"      
##  [7] "concavity_mean"         "concave_points_mean"    "symmetry_mean"         
## [10] "fractal_dimension_mean"

cor_mean <- cor(wdbc_scaled[, mean_vars])
cor_mean

##                        radius_mean texture_mean perimeter_mean  area_mean
## radius_mean              1.0000000   0.32378189      0.9978553  0.9873572
## texture_mean             0.3237819   1.00000000      0.3295331  0.3210857
## perimeter_mean           0.9978553   0.32953306      1.0000000  0.9865068
## area_mean                0.9873572   0.32108570      0.9865068  1.0000000
## smoothness_mean          0.1705812  -0.02338852      0.2072782  0.1770284
## compactness_mean         0.5061236   0.23670222      0.5569362  0.4985017
## concavity_mean           0.6767636   0.30241783      0.7161357  0.6859828
## concave_points_mean      0.8225285   0.29346405      0.8509770  0.8232689
## symmetry_mean            0.1477412   0.07140098      0.1830272  0.1512931
## fractal_dimension_mean  -0.3116308  -0.07643718     -0.2614769 -0.2831098
##                        smoothness_mean compactness_mean concavity_mean
## radius_mean                 0.17058119        0.5061236      0.6767636
## texture_mean               -0.02338852        0.2367022      0.3024178
## perimeter_mean              0.20727816        0.5569362      0.7161357
## area_mean                   0.17702838        0.4985017      0.6859828
## smoothness_mean             1.00000000        0.6591232      0.5219838
## compactness_mean            0.65912322        1.0000000      0.8831207
## concavity_mean              0.52198377        0.8831207      1.0000000
## concave_points_mean         0.55369517        0.8311350      0.9213910
## symmetry_mean               0.55777479        0.6026410      0.5006666
## fractal_dimension_mean      0.58479200        0.5653687      0.3367834
##                        concave_points_mean symmetry_mean fractal_dimension_mean
## radius_mean                      0.8225285    0.14774124            -0.31163083
## texture_mean                     0.2934641    0.07140098            -0.07643718
## perimeter_mean                   0.8509770    0.18302721            -0.26147691
## area_mean                        0.8232689    0.15129308            -0.28310981
## smoothness_mean                  0.5536952    0.55777479             0.58479200
## compactness_mean                 0.8311350    0.60264105             0.56536866
## concavity_mean                   0.9213910    0.50066662             0.33678336
## concave_points_mean              1.0000000    0.46249739             0.16691738
## symmetry_mean                    0.4624974    1.00000000             0.47992133
## fractal_dimension_mean           0.1669174    0.47992133             1.00000000

library(corrplot)

## corrplot 0.95 loaded

corrplot(cor_mean,
         method = "circle",
         type = "lower",
         tl.col = "black",
         tl.cex = 0.8)

for just the se’s of each predictor

Logistic Regression

fit_log <- glm(Diagnosis ~ ., 
               data = wdbc_train, 
               family = binomial)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(fit_log)

## 
## Call:
## glm(formula = Diagnosis ~ ., family = binomial, data = wdbc_train)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                164.426  32237.326   0.005    0.996
## radius_mean              -1818.787 495097.811  -0.004    0.997
## texture_mean               114.368  16695.906   0.007    0.995
## perimeter_mean             660.558 294904.859   0.002    0.998
## area_mean                  729.018 252887.644   0.003    0.998
## smoothness_mean            -18.118  10295.461  -0.002    0.999
## compactness_mean          -350.487  29475.178  -0.012    0.991
## concavity_mean             172.462  50796.003   0.003    0.997
## concave_points_mean        413.978  61856.969   0.007    0.995
## symmetry_mean              -28.075   7851.281  -0.004    0.997
## fractal_dimension_mean       9.596  21439.885   0.000    1.000
## radius_se                   63.627 188417.693   0.000    1.000
## texture_se                  87.928   7404.713   0.012    0.991
## perimeter_se               119.091  86913.618   0.001    0.999
## area_se                   -173.212 224417.259  -0.001    0.999
## smoothness_se              -80.479  14229.172  -0.006    0.995
## compactness_se             228.015  51837.798   0.004    0.996
## concavity_se              -201.618 102725.466  -0.002    0.998
## concave_points_se           47.028  22467.813   0.002    0.998
## symmetry_se                -61.114  12337.303  -0.005    0.996
## fractal_dimension_se      -247.702  28016.501  -0.009    0.993
## radius_worst               453.117 505855.074   0.001    0.999
## texture_worst              -60.789  13930.301  -0.004    0.997
## perimeter_worst           -149.276 251486.916  -0.001    1.000
## area_worst                 548.190 380001.247   0.001    0.999
## smoothness_worst            26.950  24617.224   0.001    0.999
## compactness_worst         -275.046  46786.449  -0.006    0.995
## concavity_worst            175.137  71684.498   0.002    0.998
## concave_points_worst        83.082  39612.407   0.002    0.998
## symmetry_worst             143.219  11053.504   0.013    0.990
## fractal_dimension_worst    219.791  31661.056   0.007    0.994
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5.2531e+02  on 397  degrees of freedom
## Residual deviance: 1.0723e-06  on 367  degrees of freedom
## AIC: 62
## 
## Number of Fisher Scoring iterations: 25

The logistic regression model (without regularization) does not perform variable selection, so all 30 predictors are included in the model making it relatively difficult to interpret. None of the variables have a p-value less than 0.05, suggesting that none of them significantly contribute to the model.

prediction on test set

log_prob <- predict(fit_log, 
                    newdata = wdbc_test, 
                    type = "response")
#head(log_prob)

model eval

log_pred <- ifelse(log_prob > 0.5, "M", "B")
log_pred <- factor(log_pred, levels = c("B","M"))

#Confusion Matrix 
cm_log <- confusionMatrix(log_pred, wdbc_test$Diagnosis, positive = "M")
cm_log

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 104   4
##          M   3  60
##                                           
##                Accuracy : 0.9591          
##                  95% CI : (0.9175, 0.9834)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9123          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9375          
##             Specificity : 0.9720          
##          Pos Pred Value : 0.9524          
##          Neg Pred Value : 0.9630          
##              Prevalence : 0.3743          
##          Detection Rate : 0.3509          
##    Detection Prevalence : 0.3684          
##       Balanced Accuracy : 0.9547          
##                                           
##        'Positive' Class : M               
##

roc_log <- roc(wdbc_test$Diagnosis, log_prob, levels = c("B","M"))

## Setting direction: controls < cases

plot(roc_log, main = "ROC Curve - Logistic Regression")

auc_log <- auc(roc_log)
auc_log

## Area under the curve: 0.9745

Although none of the predictors were signficant, the baseline logistic regression model performs well on the test set with a 96% accuracy and and AUC of 0.9745.

Logistic Regression

X_train <- as.matrix(wdbc_train[, -1]) 
y_train <- wdbc_train$Diagnosis        


y_train_bin <- ifelse(y_train == "M", 1, 0)

# Test set
X_test <- as.matrix(wdbc_test[, -1])
y_test <- wdbc_test$Diagnosis
y_test_bin <- ifelse(y_test == "M", 1, 0)


#Cross Validation
set.seed(4630)

cv_lasso <- cv.glmnet(
  X_train,
  y_train_bin,
  alpha = 1,
  family = "binomial",
  nfolds = 10,
  type.measure = "deviance"  
)

plot(cv_lasso)

minimum lambda

lambda_min <- cv_lasso$lambda.min   # λ that gives minimum CV error
lambda_1se <- cv_lasso$lambda.1se   # more conservative (simpler model)

lambda_min

## [1] 0.004844294

lambda_1se

## [1] 0.01781915

The minimum lamda for LASSO as determined by 10-fold cross validation is 0.003, which is what we will use for the model on the training set.

LASSO coefficients

# Extract coefficients at lambda_min (your object)
coef_lasso <- coef(cv_lasso, s = lambda_min)

# Convert from sparse matrix to a regular matrix
coef_matrix <- as.matrix(coef_lasso)

# Pull only the non-zero coefficients
nonzero_coefs <- coef_matrix[coef_matrix[,1] != 0, , drop = FALSE]

nonzero_coefs

##                      s=0.004844294
## (Intercept)             -0.7335676
## concave_points_mean      0.4832564
## radius_se                0.5834270
## radius_worst             4.1295741
## texture_worst            1.2001398
## smoothness_worst         0.7689339
## concavity_worst          0.2805019
## concave_points_worst     1.0198362
## symmetry_worst           0.1355000

Interpretation: Using the lambda selected by 10-fold CV, the LASSO model produced 12 predictors. This model is easier to interpret because LASSO performs variable selection, and since the predictors are standardized, we can compare the values to each other. The LASSO model predicted the standard error and the most extreme value of nuclei radius, the extreme values of nuclei texture, area, and concave points to be most predictive of malignancy in breast cancer cells. The most important predictors determined by the model also have positive values for their coefficients, indicating and increase in these values increases the probability of a cell being malignant. While the most extreme value for texture was an important indicator with a value of 1.405, the standard error of texture was retained as a predictor in the lasso model with a negative coefficient of -0.169, indicating that worse values of texture increase likelihood of malignancy whereas the model decreases the prediction probability of malignancy when there’s more variation between texture values in the data set. Out of the ten predictors that recorded the most extreme measurements of each feature, seven were retained suggesting that the extreme values provided the most diagnostic information compared to the mean and se values.

-how do i interpret the coefficient of the feature se’s in terms of how it effects a single nuclei’s likelihood of being malignant (considering that individual nuclei won’t have an se to plug into the equation)?

model eval

lasso_prob <- predict(cv_lasso, newx = X_test, s = lambda_min, type = "response")
lasso_pred <- ifelse(lasso_prob > 0.5, "M", "B")
lasso_pred <- factor(lasso_pred, levels = c("B","M"))

cm_lasso <- confusionMatrix(lasso_pred, y_test, positive = "M")
cm_lasso

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 105   4
##          M   2  60
##                                          
##                Accuracy : 0.9649         
##                  95% CI : (0.9252, 0.987)
##     No Information Rate : 0.6257         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9246         
##                                          
##  Mcnemar's Test P-Value : 0.6831         
##                                          
##             Sensitivity : 0.9375         
##             Specificity : 0.9813         
##          Pos Pred Value : 0.9677         
##          Neg Pred Value : 0.9633         
##              Prevalence : 0.3743         
##          Detection Rate : 0.3509         
##    Detection Prevalence : 0.3626         
##       Balanced Accuracy : 0.9594         
##                                          
##        'Positive' Class : M              
##

library(pROC)

roc_lasso <- roc(y_test, as.numeric(lasso_prob))

## Setting levels: control = B, case = M

## Setting direction: controls < cases

plot(roc_lasso, main = "ROC Curve - LASSO Logistic Regression")

auc_lasso <- auc(roc_lasso)
auc_lasso

## Area under the curve: 0.9945

Both the accuracy and the AUC improved for the LASSO model compared to the basline logistic regression model.

KNN

X <- as.matrix(wdbc_train[, -1])      
y <- ifelse(wdbc_train$Diagnosis == "M", 1, 0)   
n <- nrow(X)
Kfold <- 10

set.seed(4630)
fold_id <- sample(rep(1:Kfold, length.out = n))

k_grid <- seq(1, 25, by = 2)
cv_auc <- numeric(length(k_grid))

for (i in seq_along(k_grid)) {
  k <- k_grid[i]
  fold_auc <- numeric(Kfold)
  
  for (f in 1:Kfold) {
    val_idx   <- which(fold_id == f)
    train_idx <- which(fold_id != f)
    
    X_tr <- X[train_idx, , drop = FALSE]
    y_tr <- y[train_idx]
    
    X_val <- X[val_idx, , drop = FALSE]
    y_val <- y[val_idx]
    
    # kNN with probability output
    pred_f <- knn(
      train = X_tr,
      test  = X_val,
      cl    = factor(y_tr, levels = c(0,1), labels = c("B","M")),
      k     = k,
      prob  = TRUE
    )
    
    # proportion of votes for predicted class
    p_win <- attr(pred_f, "prob")
    # convert to P(M)
    p_hat_M <- ifelse(pred_f == "M", p_win, 1 - p_win)
    
    # AUC for this fold (M is positive class)
    roc_f <- roc(
      response = factor(y_val, levels = c(0,1), labels = c("B","M")),
      predictor = p_hat_M,
      levels = c("B","M"),
      quiet = TRUE
    )
    fold_auc[f] <- auc(roc_f)
  }
  
  cv_auc[i] <- mean(fold_auc)
}

data.frame(k = k_grid, CV_AUC = cv_auc)

##     k    CV_AUC
## 1   1 0.9482415
## 2   3 0.9805070
## 3   5 0.9820192
## 4   7 0.9886477
## 5   9 0.9879004
## 6  11 0.9868566
## 7  13 0.9862328
## 8  15 0.9853127
## 9  17 0.9856022
## 10 19 0.9848013
## 11 21 0.9857586
## 12 23 0.9864944
## 13 25 0.9860223

plot(k_grid, cv_auc, type = "b",
     xlab = "k (Number of Neighbors)",
     ylab = "10-fold CV AUC",
     main = "KNN Model Selection Using 10-fold CV (AUC)")

abline(v = k_grid[which.max(cv_auc)], lty = 2)

model

best_k <- k_grid[which.max(cv_auc)]
best_k

## [1] 7

knn_best <- knn(train = X_train,
                test  = X_test,
                cl    = y_train,
                k     = best_k)
knn_best

##   [1] M M M M M M B M M M M B M M M B B M M M B M B M M M M B B M B M B B B B B
##  [38] B B B B B M B M B B M B B M B M B M B B M B B B M M M M B B M B M B M B M
##  [75] B B B B M M B M M M B M B B B B B B B B M M B B B M B B B B B B B M B B B
## [112] B B B B B B B B B B B B B B B M B M M M B B B B B B B B M M M B B B M B B
## [149] B B M B M M B B B B B M B B B B B B B B B M B
## Levels: B M

Based on 10-fold cross validation, the k value with the highest AUC is 7 so we will run our KNN model on the training set using k = 7.

model eval

mean(knn_best == y_test)

## [1] 0.9824561

cm_knn <- confusionMatrix(knn_best, y_test, positive = "M")
cm_knn

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 107   3
##          M   0  61
##                                           
##                Accuracy : 0.9825          
##                  95% CI : (0.9496, 0.9964)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9622          
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 0.9531          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9727          
##              Prevalence : 0.3743          
##          Detection Rate : 0.3567          
##    Detection Prevalence : 0.3567          
##       Balanced Accuracy : 0.9766          
##                                           
##        'Positive' Class : M               
##

ROC and AUC (based on vote proportions?)

set.seed(4630)

knn_pred <- knn(
  train = X_train,
  test  = X_test,
  cl    = y_train,
  k     = best_k,    # or any chosen k
  prob  = TRUE
)

# predicted class labels
head(knn_pred)

## [1] M M M M M M
## Levels: B M

# proportion of votes for the predicted (winning) class
p_hat_win <- attr(knn_pred, "prob")

# convert to probability of "M"
# if predicted "M", use p_hat_win; if predicted "B", use 1 - p_hat_win
p_hat_M <- ifelse(knn_pred == "M", p_hat_win, 1 - p_hat_win)

head(p_hat_M)

## [1] 1.0000000 0.8571429 1.0000000 0.5714286 0.7142857 1.0000000

# y_test should be factor with levels c("B", "M")
roc_knn <- roc(y_test, p_hat_M, levels = c("B","M"))

## Setting direction: controls < cases

plot(roc_knn, main = "ROC Curve - KNN")

auc_knn <- auc(roc_knn)

Simple Decision Tree:

wdbc_train_raw <- wdbc[train_idx, ]
wdbc_test_raw  <- wdbc[-train_idx, ]

table(wdbc_train_raw$Diagnosis)

## 
##   B   M 
## 202 157

table(wdbc_test_raw$Diagnosis)

## 
##   B   M 
## 155  55

set.seed(4630)

tree_bc <- tree(Diagnosis ~ ., data = wdbc_train_raw)
summary(tree_bc)

## 
## Classification tree:
## tree(formula = Diagnosis ~ ., data = wdbc_train_raw)
## Variables actually used in tree construction:
## [1] "perimeter_worst"      "concave_points_worst" "radius_se"           
## [4] "texture_worst"        "smoothness_worst"    
## Number of terminal nodes:  8 
## Residual mean deviance:  0.09148 = 32.11 / 351 
## Misclassification error rate: 0.02228 = 8 / 359

plot(tree_bc)
text(tree_bc, pretty = 0)

model eval (unpruned tree)

tree_unpruned_class <- predict(tree_bc,
                               newdata = wdbc_test_raw,
                               type = "class")

cm_tree_unpruned <- confusionMatrix(tree_unpruned_class,
                wdbc_test_raw$Diagnosis,
                positive = "M")
cm_tree_unpruned

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 137   0
##          M  18  55
##                                           
##                Accuracy : 0.9143          
##                  95% CI : (0.8679, 0.9484)
##     No Information Rate : 0.7381          
##     P-Value [Acc > NIR] : 1.003e-10       
##                                           
##                   Kappa : 0.7995          
##                                           
##  Mcnemar's Test P-Value : 6.151e-05       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8839          
##          Pos Pred Value : 0.7534          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.2619          
##          Detection Rate : 0.2619          
##    Detection Prevalence : 0.3476          
##       Balanced Accuracy : 0.9419          
##                                           
##        'Positive' Class : M               
##

# Probabilities for M
# Probabilities matrix (columns: B, M)
tree_unpruned_probs <- predict(tree_bc,
                               newdata = wdbc_test_raw,
                               type = "vector")

# Extract probability for M
tree_unpruned_probM <- tree_unpruned_probs[, "M"]


roc_tree_unpruned <- roc(wdbc_test_raw$Diagnosis,
                         tree_unpruned_probM,
                         levels = c("B", "M"))

## Setting direction: controls < cases

plot(roc_tree_unpruned,
     main = "ROC Curve – Unpruned Decision Tree")

auc_tree_unpruned <- auc(roc_tree_unpruned)
auc_tree_unpruned

## Area under the curve: 0.9864

cv for tree size

cv_bc <- cv.tree(tree_bc, FUN = prune.misclass)

plot(cv_bc$size, cv_bc$dev,
     type = "b",
     xlab = "Tree Size (Terminal Nodes)",
     ylab = "CV Misclassification Error",
     main = "CV Error vs Tree Size")

best_size <- cv_bc$size[which.min(cv_bc$dev)]
best_size

## [1] 8

pruned_bc <- prune.misclass(tree_bc, best = best_size)
plot(pruned_bc)
text(pruned_bc, pretty = 0)

model eval (pruned tree)

tree_pred <- predict(pruned_bc, newdata = wdbc_test_raw, type = "class")
cm_tree_pruned <- confusionMatrix(tree_pred,
                wdbc_test_raw$Diagnosis,
                positive = "M")
cm_tree_pruned

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 137   0
##          M  18  55
##                                           
##                Accuracy : 0.9143          
##                  95% CI : (0.8679, 0.9484)
##     No Information Rate : 0.7381          
##     P-Value [Acc > NIR] : 1.003e-10       
##                                           
##                   Kappa : 0.7995          
##                                           
##  Mcnemar's Test P-Value : 6.151e-05       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8839          
##          Pos Pred Value : 0.7534          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.2619          
##          Detection Rate : 0.2619          
##    Detection Prevalence : 0.3476          
##       Balanced Accuracy : 0.9419          
##                                           
##        'Positive' Class : M               
##

tree_prob <- predict(pruned_bc, newdata = wdbc_test_raw)[,"M"]

roc_tree <- roc(wdbc_test_raw$Diagnosis, tree_prob, levels = c("B","M"))

## Setting direction: controls < cases

plot(roc_tree, main = "ROC Curve — Decision Tree")

auc_tree_pruned <- auc(roc_tree)
auc_tree_pruned

## Area under the curve: 0.9864

While the AUC is slightly less for the pruned tree, it is more interpretable and has a higher accuracy than the unpruned tree because it is more generalizable.

Random Forest cross validation for mtry

# Make sure M is the positive (first) level for caret
wdbc_train_raw$Diagnosis <- relevel(wdbc_train_raw$Diagnosis, ref = "M")
wdbc_test_raw$Diagnosis  <- relevel(wdbc_test_raw$Diagnosis,  ref = "M")

ctrl_rf <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,  # returns ROC, Sens, Spec
  savePredictions = "final"
)

set.seed(4630)
rf_tuned <- train(
  Diagnosis ~ .,
  data = wdbc_train_raw,
  method = "rf",
  metric = "ROC",   # choose hyperparameters by CV AUC
  trControl = ctrl_rf,
  tuneGrid = data.frame(
    mtry = c(4, 8, 12, 16)          # you can tweak this grid
  ),
  ntree = 500
)

rf_tuned

## Random Forest 
## 
## 359 samples
##  30 predictor
##   2 classes: 'M', 'B' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 286, 287, 287, 288, 288 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec    
##    4    0.9900880  0.9679435  0.955122
##    8    0.9892216  0.9616935  0.945122
##   12    0.9896001  0.9679435  0.945122
##   16    0.9880231  0.9616935  0.940122
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.

rf_tuned$bestTune

##   mtry
## 1    4

# Variable importance (caret style)
varImp(rf_tuned)

## rf variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                      Overall
## perimeter_worst      100.000
## radius_worst          97.897
## concave_points_worst  90.520
## area_worst            87.995
## concave_points_mean   74.235
## concavity_worst       44.606
## concavity_mean        39.463
## area_mean             39.409
## perimeter_mean        36.987
## area_se               30.868
## radius_mean           28.284
## texture_worst         22.521
## texture_mean          12.998
## radius_se             10.585
## smoothness_worst      10.260
## compactness_mean       8.641
## compactness_worst      7.158
## perimeter_se           5.884
## symmetry_worst         3.756
## concavity_se           3.415

# Test set predictions
rf_pred_class <- predict(rf_tuned, newdata = wdbc_test_raw)
cm_rf <- confusionMatrix(rf_pred_class,
                         wdbc_test_raw$Diagnosis,
                         positive = "M")
cm_rf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   M   B
##          M  54  10
##          B   1 145
##                                           
##                Accuracy : 0.9476          
##                  95% CI : (0.9082, 0.9736)
##     No Information Rate : 0.7381          
##     P-Value [Acc > NIR] : 1.792e-15       
##                                           
##                   Kappa : 0.8713          
##                                           
##  Mcnemar's Test P-Value : 0.01586         
##                                           
##             Sensitivity : 0.9818          
##             Specificity : 0.9355          
##          Pos Pred Value : 0.8437          
##          Neg Pred Value : 0.9932          
##              Prevalence : 0.2619          
##          Detection Rate : 0.2571          
##    Detection Prevalence : 0.3048          
##       Balanced Accuracy : 0.9587          
##                                           
##        'Positive' Class : M               
##

# Probabilities for M for ROC/AUC
rf_prob <- predict(rf_tuned, newdata = wdbc_test_raw, type = "prob")[, "M"]

roc_rf <- roc(wdbc_test_raw$Diagnosis, rf_prob, levels = c("B","M"))

## Setting direction: controls < cases

plot(roc_rf, main = "ROC Curve – Random Forest (tuned mtry)")

auc_rf <- auc(roc_rf)
auc_rf

## Area under the curve: 0.9982

Boosting (GBM)

ctrl_gbm <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = "final"
)

grid_gbm <- expand.grid(
  interaction.depth = c(1, 3, 5),          # tree depth
  n.trees           = c(1000, 2000, 3000), # number of trees
  shrinkage         = c(0.01, 0.05),       # learning rate
  n.minobsinnode    = 10                   
)

set.seed(4630)
gbm_tuned <- train(
  Diagnosis ~ .,
  data = wdbc_train_raw,
  method = "gbm",
  distribution = "bernoulli",
  trControl = ctrl_gbm,
  tuneGrid = grid_gbm,
  metric = "ROC",
  verbose = FALSE
)

gbm_tuned

## Stochastic Gradient Boosting 
## 
## 359 samples
##  30 predictor
##   2 classes: 'M', 'B' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 286, 287, 287, 288, 288 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  ROC        Sens       Spec     
##   0.01       1                  1000     0.9936095  0.9425403  0.9650000
##   0.01       1                  2000     0.9950688  0.9554435  0.9651220
##   0.01       1                  3000     0.9960354  0.9554435  0.9801220
##   0.01       3                  1000     0.9926694  0.9489919  0.9651220
##   0.01       3                  2000     0.9931622  0.9554435  0.9503659
##   0.01       3                  3000     0.9936472  0.9554435  0.9553659
##   0.01       5                  1000     0.9931550  0.9552419  0.9452439
##   0.01       5                  2000     0.9936300  0.9487903  0.9552439
##   0.01       5                  3000     0.9936491  0.9487903  0.9652439
##   0.05       1                  1000     0.9960303  0.9618952  0.9702439
##   0.05       1                  2000     0.9961949  0.9556452  0.9751220
##   0.05       1                  3000     0.9960297  0.9556452  0.9701220
##   0.05       3                  1000     0.9945858  0.9616935  0.9553659
##   0.05       3                  2000     0.9944386  0.9616935  0.9652439
##   0.05       3                  3000     0.9942784  0.9616935  0.9652439
##   0.05       5                  1000     0.9929899  0.9616935  0.9652439
##   0.05       5                  2000     0.9926763  0.9616935  0.9602439
##   0.05       5                  3000     0.9914066  0.9552419  0.9750000
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 2000, interaction.depth =
##  1, shrinkage = 0.05 and n.minobsinnode = 10.

gbm_tuned$bestTune

##    n.trees interaction.depth shrinkage n.minobsinnode
## 11    2000                 1      0.05             10

# Probabilities for M
boost_prob <- predict(gbm_tuned,
                      newdata = wdbc_test_raw,
                      type = "prob")[, "M"]

boost_pred_class <- ifelse(boost_prob > 0.5, "M", "B")
boost_pred_class <- factor(boost_pred_class, levels = c("B","M"))

cm_boost <- confusionMatrix(boost_pred_class,
                            wdbc_test_raw$Diagnosis,
                            positive = "M")

## Warning in confusionMatrix.default(boost_pred_class, wdbc_test_raw$Diagnosis, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.

cm_boost

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   M   B
##          M  54  10
##          B   1 145
##                                           
##                Accuracy : 0.9476          
##                  95% CI : (0.9082, 0.9736)
##     No Information Rate : 0.7381          
##     P-Value [Acc > NIR] : 1.792e-15       
##                                           
##                   Kappa : 0.8713          
##                                           
##  Mcnemar's Test P-Value : 0.01586         
##                                           
##             Sensitivity : 0.9818          
##             Specificity : 0.9355          
##          Pos Pred Value : 0.8437          
##          Neg Pred Value : 0.9932          
##              Prevalence : 0.2619          
##          Detection Rate : 0.2571          
##    Detection Prevalence : 0.3048          
##       Balanced Accuracy : 0.9587          
##                                           
##        'Positive' Class : M               
##

roc_boost <- roc(wdbc_test_raw$Diagnosis, boost_prob, levels = c("B","M"))

## Setting direction: controls < cases

plot(roc_boost, main = "ROC Curve – Boosting (tuned)")

auc_boost <- auc(roc_boost)
auc_boost

## Area under the curve: 0.9972

Model comparisons:

model_results <- tibble(
  Model = c(
    "Logistic (no reg)",
    "LASSO Logistic",
    "kNN",
    "Tree (Unpruned)",
    "Tree (Pruned)",
    "Random Forest",
    "Boosting (GBM)"
  ),
  Accuracy = c(
    cm_log$overall["Accuracy"],
    cm_lasso$overall["Accuracy"],
    cm_knn$overall["Accuracy"],
    cm_tree_unpruned$overall["Accuracy"],
    cm_tree_pruned$overall["Accuracy"],
    cm_rf$overall["Accuracy"],
    cm_boost$overall["Accuracy"]
  ),
  Sensitivity = c(
    cm_log$byClass["Sensitivity"],
    cm_lasso$byClass["Sensitivity"],
    cm_knn$byClass["Sensitivity"],
    cm_tree_unpruned$byClass["Sensitivity"],
    cm_tree_pruned$byClass["Sensitivity"],
    cm_rf$byClass["Sensitivity"],
    cm_boost$byClass["Sensitivity"]
  ),
  Specificity = c(
    cm_log$byClass["Specificity"],
    cm_lasso$byClass["Specificity"],
    cm_knn$byClass["Specificity"],
    cm_tree_unpruned$byClass["Specificity"],
    cm_tree_pruned$byClass["Specificity"],
    cm_rf$byClass["Specificity"],
    cm_boost$byClass["Specificity"]
  ),
  AUC = c(
    as.numeric(auc_log),
    as.numeric(auc_lasso),
    as.numeric(auc_knn),
    as.numeric(auc_tree_unpruned),
    as.numeric(auc_tree_pruned),
    as.numeric(auc_rf),
    as.numeric(auc_boost)
  )
)

model_results

## # A tibble: 7 × 5
##   Model             Accuracy Sensitivity Specificity   AUC
##   <chr>                <dbl>       <dbl>       <dbl> <dbl>
## 1 Logistic (no reg)    0.959       0.938       0.972 0.975
## 2 LASSO Logistic       0.965       0.938       0.981 0.994
## 3 kNN                  0.982       0.953       1     0.997
## 4 Tree (Unpruned)      0.914       1           0.884 0.986
## 5 Tree (Pruned)        0.914       1           0.884 0.986
## 6 Random Forest        0.948       0.982       0.935 0.998
## 7 Boosting (GBM)       0.948       0.982       0.935 0.997

Machine Learning Project

Medha Chhetri

2025-11-25