knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caTools)
library(caret)
## Loading required package: lattice
library(caretEnsemble)
## Warning: package 'caretEnsemble' was built under R version 4.3.2
## 
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
## 
##     autoplot

Breast Cancer Data Analysis

Analyze the Breast Cancer DataSet, and apply two machine learning classification models to compare their results and find the model which is best to predict breast cancer.

DataSet

I am considering Breast Cancer Wisconsin (Diagnostic) DataSet obtained from Kaggle. https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

# URL and CSV file names
url <- "https://raw.githubusercontent.com/jayatveluri/Data622/main/"
csv_name <- "data"
# 1. Read the CSV files into data frames
df <- read_csv(paste0(url, csv_name, ".csv"))
## New names:
## • `` -> `...33`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 568 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): diagnosis
## dbl (31): id, radius_mean, texture_mean, perimeter_mean, area_mean, smoothne...
## lgl  (1): ...33
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
## Rows: 568
## Columns: 33
## $ id                      <dbl> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ `concave points_mean`   <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ `concave points_se`     <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ `concave points_worst`  <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
## $ ...33                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
str(df)
## spc_tbl_ [568 × 33] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                     : num [1:568] 842302 842517 84300903 84348301 84358402 ...
##  $ diagnosis              : chr [1:568] "M" "M" "M" "M" ...
##  $ radius_mean            : num [1:568] 18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num [1:568] 10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num [1:568] 122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num [1:568] 1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num [1:568] 0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num [1:568] 0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num [1:568] 0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave points_mean    : num [1:568] 0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num [1:568] 0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num [1:568] 0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num [1:568] 1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num [1:568] 0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num [1:568] 8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num [1:568] 153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num [1:568] 0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num [1:568] 0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num [1:568] 0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave points_se      : num [1:568] 0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num [1:568] 0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num [1:568] 0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num [1:568] 25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num [1:568] 17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num [1:568] 184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num [1:568] 2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num [1:568] 0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num [1:568] 0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num [1:568] 0.712 0.242 0.45 0.687 0.4 ...
##  $ concave points_worst   : num [1:568] 0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num [1:568] 0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num [1:568] 0.1189 0.089 0.0876 0.173 0.0768 ...
##  $ ...33                  : logi [1:568] NA NA NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   diagnosis = col_character(),
##   ..   radius_mean = col_double(),
##   ..   texture_mean = col_double(),
##   ..   perimeter_mean = col_double(),
##   ..   area_mean = col_double(),
##   ..   smoothness_mean = col_double(),
##   ..   compactness_mean = col_double(),
##   ..   concavity_mean = col_double(),
##   ..   `concave points_mean` = col_double(),
##   ..   symmetry_mean = col_double(),
##   ..   fractal_dimension_mean = col_double(),
##   ..   radius_se = col_double(),
##   ..   texture_se = col_double(),
##   ..   perimeter_se = col_double(),
##   ..   area_se = col_double(),
##   ..   smoothness_se = col_double(),
##   ..   compactness_se = col_double(),
##   ..   concavity_se = col_double(),
##   ..   `concave points_se` = col_double(),
##   ..   symmetry_se = col_double(),
##   ..   fractal_dimension_se = col_double(),
##   ..   radius_worst = col_double(),
##   ..   texture_worst = col_double(),
##   ..   perimeter_worst = col_double(),
##   ..   area_worst = col_double(),
##   ..   smoothness_worst = col_double(),
##   ..   compactness_worst = col_double(),
##   ..   concavity_worst = col_double(),
##   ..   `concave points_worst` = col_double(),
##   ..   symmetry_worst = col_double(),
##   ..   fractal_dimension_worst = col_double(),
##   ..   ...33 = col_logical()
##   .. )
##  - attr(*, "problems")=<externalptr>

Including Plots

You can also embed plots, for example:

df$diagnosis <- as.factor(df$diagnosis)
# the 33 column is not right
df[,33] <- NULL
summary(df)
##        id            diagnosis  radius_mean      texture_mean  
##  Min.   :     8670   B:356     Min.   : 6.981   Min.   : 9.71  
##  1st Qu.:   869222   M:212     1st Qu.:11.707   1st Qu.:16.17  
##  Median :   906157             Median :13.375   Median :18.84  
##  Mean   : 30425140             Mean   :14.139   Mean   :19.28  
##  3rd Qu.:  8825022             3rd Qu.:15.797   3rd Qu.:21.79  
##  Max.   :911320502             Max.   :28.110   Max.   :39.28  
##  perimeter_mean     area_mean      smoothness_mean   compactness_mean 
##  Min.   : 43.79   Min.   : 143.5   Min.   :0.06251   Min.   :0.01938  
##  1st Qu.: 75.20   1st Qu.: 420.3   1st Qu.:0.08640   1st Qu.:0.06517  
##  Median : 86.29   Median : 551.4   Median :0.09589   Median :0.09312  
##  Mean   : 92.05   Mean   : 655.7   Mean   :0.09644   Mean   :0.10445  
##  3rd Qu.:104.15   3rd Qu.: 784.1   3rd Qu.:0.10533   3rd Qu.:0.13043  
##  Max.   :188.50   Max.   :2501.0   Max.   :0.16340   Max.   :0.34540  
##  concavity_mean    concave points_mean symmetry_mean    fractal_dimension_mean
##  Min.   :0.00000   Min.   :0.00000     Min.   :0.1060   Min.   :0.04996       
##  1st Qu.:0.02958   1st Qu.:0.02035     1st Qu.:0.1620   1st Qu.:0.05770       
##  Median :0.06155   Median :0.03360     Median :0.1792   Median :0.06155       
##  Mean   :0.08896   Mean   :0.04901     Mean   :0.1812   Mean   :0.06280       
##  3rd Qu.:0.13100   3rd Qu.:0.07401     3rd Qu.:0.1957   3rd Qu.:0.06613       
##  Max.   :0.42680   Max.   :0.20120     Max.   :0.3040   Max.   :0.09744       
##    radius_se        texture_se      perimeter_se       area_se       
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8331   1st Qu.: 1.605   1st Qu.: 17.850  
##  Median :0.3240   Median :1.1080   Median : 2.285   Median : 24.565  
##  Mean   :0.4052   Mean   :1.2165   Mean   : 2.867   Mean   : 40.374  
##  3rd Qu.:0.4798   3rd Qu.:1.4743   3rd Qu.: 3.360   3rd Qu.: 45.237  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness_se      compactness_se      concavity_se     concave points_se 
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.005166   1st Qu.:0.013133   1st Qu.:0.01510   1st Qu.:0.007663  
##  Median :0.006374   Median :0.020460   Median :0.02592   Median :0.010950  
##  Mean   :0.007041   Mean   :0.025515   Mean   :0.03195   Mean   :0.011817  
##  3rd Qu.:0.008151   3rd Qu.:0.032455   3rd Qu.:0.04212   3rd Qu.:0.014730  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600   Max.   :0.052790  
##   symmetry_se       fractal_dimension_se  radius_worst   texture_worst  
##  Min.   :0.007882   Min.   :0.0008948    Min.   : 7.93   Min.   :12.02  
##  1st Qu.:0.015128   1st Qu.:0.0022445    1st Qu.:13.03   1st Qu.:21.07  
##  Median :0.018725   Median :0.0031955    Median :14.97   Median :25.41  
##  Mean   :0.020531   Mean   :0.0037967    Mean   :16.28   Mean   :25.67  
##  3rd Qu.:0.023398   3rd Qu.:0.0045585    3rd Qu.:18.80   3rd Qu.:29.68  
##  Max.   :0.078950   Max.   :0.0298400    Max.   :36.04   Max.   :49.54  
##  perimeter_worst    area_worst     smoothness_worst  compactness_worst
##  Min.   : 50.41   Min.   : 185.2   Min.   :0.07117   Min.   :0.02729  
##  1st Qu.: 84.15   1st Qu.: 515.7   1st Qu.:0.11660   1st Qu.:0.14758  
##  Median : 97.67   Median : 686.5   Median :0.13135   Median :0.21300  
##  Mean   :107.35   Mean   : 881.7   Mean   :0.13244   Mean   :0.25460  
##  3rd Qu.:125.53   3rd Qu.:1085.0   3rd Qu.:0.14602   3rd Qu.:0.33930  
##  Max.   :251.20   Max.   :4254.0   Max.   :0.22260   Max.   :1.05800  
##  concavity_worst  concave points_worst symmetry_worst   fractal_dimension_worst
##  Min.   :0.0000   Min.   :0.00000      Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.1159   1st Qu.:0.06497      1st Qu.:0.2504   1st Qu.:0.07147        
##  Median :0.2275   Median :0.10002      Median :0.2821   Median :0.08005        
##  Mean   :0.2727   Mean   :0.11481      Mean   :0.2901   Mean   :0.08397        
##  3rd Qu.:0.3835   3rd Qu.:0.16168      3rd Qu.:0.3180   3rd Qu.:0.09208        
##  Max.   :1.2520   Max.   :0.29100      Max.   :0.6638   Max.   :0.20750

There aren’t NA values.

The dataset is a bit unbalanced:

prop.table(table(df$diagnosis))
## 
##         B         M 
## 0.6267606 0.3732394

Correlation between variables:

corr_mat <- cor(df[,3:ncol(df)])
corrplot(corr_mat, order = "hclust", tl.cex = 1, addrect = 9)

## There is a great correlation between some variables

Modelling

We are going to get a training and a testing set to use when building some models:

set.seed(1234)
df_index <- createDataPartition(df$diagnosis, p=0.7, list = FALSE)
train_df <- df[df_index, -1]
test_df <- df[-df_index, -1]

Data preprocessing

Because there is so much correlation, some machine learning models can fail. Let us create a PCA and LDA version of the data

PCA Data version

pca_res <- prcomp(df[,3:ncol(df)], center = TRUE, scale = TRUE)
plot(pca_res, type="l")

summary(pca_res)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5    PC6     PC7
## Standard deviation     3.6430 2.3887 1.67894 1.40544 1.28662 1.0982 0.81949
## Proportion of Variance 0.4424 0.1902 0.09396 0.06584 0.05518 0.0402 0.02239
## Cumulative Proportion  0.4424 0.6326 0.72653 0.79237 0.84755 0.8878 0.91013
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.68973 0.64618 0.59266 0.54282 0.51175 0.49126 0.39418
## Proportion of Variance 0.01586 0.01392 0.01171 0.00982 0.00873 0.00804 0.00518
## Cumulative Proportion  0.92599 0.93991 0.95162 0.96144 0.97017 0.97821 0.98339
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     0.30696 0.28022 0.24367 0.22980 0.22256 0.17656 0.1729
## Proportion of Variance 0.00314 0.00262 0.00198 0.00176 0.00165 0.00104 0.0010
## Cumulative Proportion  0.98653 0.98915 0.99113 0.99289 0.99454 0.99558 0.9966
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.16547 0.15629 0.1344 0.12458 0.08929 0.08295 0.03993
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
##                           PC29    PC30
## Standard deviation     0.02728 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion  1.00000 1.00000

If we look at the above data, the two first components explains the 0.6324 of the variance, and 10 principal components to explain more than 0.95 of the variance and 17 to explain more than 0.99

pca_df <- as.data.frame(pca_res$x)
ggplot(pca_df, aes(x=PC1, y=PC2, col=df$diagnosis)) + geom_point(alpha=0.5)

#### Separating the data

g_pc1 <- ggplot(pca_df, aes(x=PC1, fill=df$diagnosis)) + geom_density(alpha=0.25)  
g_pc2 <- ggplot(pca_df, aes(x=PC2, fill=df$diagnosis)) + geom_density(alpha=0.25)  
grid.arrange(g_pc1, g_pc2, ncol=2)

LDA Data version

LDA take in consideration the different classes and could get better results

lda_res <- lda(diagnosis~., df, center = TRUE, scale = TRUE) 
lda_df <- predict(lda_res, df)$x %>% as.data.frame() %>% cbind(diagnosis=df$diagnosis)
lda_res
## Call:
## lda(diagnosis ~ ., data = df, center = TRUE, scale = TRUE)
## 
## Prior probabilities of groups:
##         B         M 
## 0.6267606 0.3732394 
## 
## Group means:
##         id radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## B 26618125    12.15885     17.89615       78.16011  463.5817      0.09258958
## M 36818050    17.46283     21.60491      115.36538  978.3764      0.10289849
##   compactness_mean concavity_mean `concave points_mean` symmetry_mean
## B       0.08018705      0.0461870            0.02578965     0.1742295
## M       0.14518778      0.1607747            0.08799000     0.1929090
##   fractal_dimension_mean radius_se texture_se perimeter_se  area_se
## B             0.06287871 0.2837969   1.219797     1.998783 21.14072
## M             0.06268009 0.6090825   1.210915     4.323929 72.67241
##   smoothness_se compactness_se concavity_se `concave points_se` symmetry_se
## B   0.007195921     0.02148538   0.02606976         0.009885343  0.02056646
## M   0.006780094     0.03228117   0.04182401         0.015060472  0.02047240
##   fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## B          0.003638447     13.39082      23.49581        87.08416   559.7149
## M          0.004062406     21.13481      29.31821       141.37033  1422.2863
##   smoothness_worst compactness_worst concavity_worst `concave points_worst`
## B        0.1250578         0.1830047       0.1667047             0.07465346
## M        0.1448452         0.3748241       0.4506056             0.18223731
##   symmetry_worst fractal_dimension_worst
## B      0.2701986              0.07946750
## M      0.3234679              0.09152995
## 
## Coefficients of linear discriminants:
##                                   LD1
## id                      -2.471712e-10
## radius_mean             -1.116030e+00
## texture_mean             2.226287e-02
## perimeter_mean           1.278227e-01
## area_mean                1.344838e-03
## smoothness_mean          1.704319e+00
## compactness_mean        -2.197485e+01
## concavity_mean           7.061429e+00
## `concave points_mean`    9.772217e+00
## symmetry_mean            6.537618e-01
## fractal_dimension_mean   2.934736e+00
## radius_se                2.108651e+00
## texture_se              -3.815391e-02
## perimeter_se            -1.179492e-01
## area_se                 -3.885717e-03
## smoothness_se            7.983966e+01
## compactness_se           8.614795e-01
## concavity_se            -1.766068e+01
## `concave points_se`      5.401010e+01
## symmetry_se              8.214913e+00
## fractal_dimension_se    -3.782109e+01
## radius_worst             9.608121e-01
## texture_worst            3.587123e-02
## perimeter_worst         -1.205477e-02
## area_worst              -4.960733e-03
## smoothness_worst         2.517400e+00
## compactness_worst        4.427600e-01
## concavity_worst          1.845604e+00
## `concave points_worst`   2.354210e+00
## symmetry_worst           2.702041e+00
## fractal_dimension_worst  2.109905e+01
ggplot(lda_df, aes(x=LD1, y=0, col=diagnosis)) + geom_point(alpha=0.5)

ggplot(lda_df, aes(x=LD1, fill=diagnosis)) + geom_density(alpha=0.5)

Create a training and test set of LDA data

train_df_lda <- lda_df[df_index,]
test_df_lda <- lda_df[-df_index,]

Applying machine learning Models

fitControl <- trainControl(method="cv",
                            number = 5,
                            preProcOptions = list(thresh = 0.99), # threshold for pca preprocess
                            classProbs = TRUE,
                            summaryFunction = twoClassSummary)

Machine Learning Methodolgies

Applying KNNMethodolgy

model_knn <- train(diagnosis~.,
                   train_df,
                   method="knn",
                   metric="ROC",
                   preProcess = c('center', 'scale'),
                   tuneLength=10,
                   trControl=fitControl)
pred_knn <- predict(model_knn, test_df)
cm_knn <- confusionMatrix(pred_knn, test_df$diagnosis, positive = "M")
cm_knn
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 106   8
##          M   0  55
##                                           
##                Accuracy : 0.9527          
##                  95% CI : (0.9089, 0.9793)
##     No Information Rate : 0.6272          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8961          
##                                           
##  Mcnemar's Test P-Value : 0.01333         
##                                           
##             Sensitivity : 0.8730          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9298          
##              Prevalence : 0.3728          
##          Detection Rate : 0.3254          
##    Detection Prevalence : 0.3254          
##       Balanced Accuracy : 0.9365          
##                                           
##        'Positive' Class : M               
## 
pred_prob_knn <- predict(model_knn, test_df, type="prob")
roc_knn <- roc(test_df$diagnosis, pred_prob_knn$M)
## Setting levels: control = B, case = M
## Setting direction: controls < cases
plot(roc_knn)

#### Applying KNN ON PCA

model_knn_pca <- train(diagnosis~.,
                   train_df,
                   method="knn",
                   metric="ROC",
                   preProcess = c('center', 'scale', 'pca'),
                   tuneLength=10,
                   trControl=fitControl)
pred_knn_pca <- predict(model_knn_pca, test_df)
cm_knn_pca <- confusionMatrix(pred_knn_pca, test_df$diagnosis, positive = "M")
cm_knn_pca
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 104   6
##          M   2  57
##                                           
##                Accuracy : 0.9527          
##                  95% CI : (0.9089, 0.9793)
##     No Information Rate : 0.6272          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8975          
##                                           
##  Mcnemar's Test P-Value : 0.2888          
##                                           
##             Sensitivity : 0.9048          
##             Specificity : 0.9811          
##          Pos Pred Value : 0.9661          
##          Neg Pred Value : 0.9455          
##              Prevalence : 0.3728          
##          Detection Rate : 0.3373          
##    Detection Prevalence : 0.3491          
##       Balanced Accuracy : 0.9429          
##                                           
##        'Positive' Class : M               
## 
pred_prob_knn_pca <- predict(model_knn_pca, test_df, type="prob")
roc_knn_pca <- roc(test_df$diagnosis, pred_prob_knn_pca$M)
## Setting levels: control = B, case = M
## Setting direction: controls < cases
plot(roc_knn_pca)

Applying KNN ON LDA

model_knn_lda <- train(diagnosis~.,
                   train_df_lda,
                   method="knn",
                   metric="ROC",
                   preProcess = c('center', 'scale'),
                   tuneLength=10,
                   trControl=fitControl)
pred_knn_lda <- predict(model_knn_lda, test_df_lda)
cm_knn_lda <- confusionMatrix(pred_knn_lda, test_df_lda$diagnosis, positive = "M")
cm_knn_lda
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 106   5
##          M   0  58
##                                           
##                Accuracy : 0.9704          
##                  95% CI : (0.9323, 0.9903)
##     No Information Rate : 0.6272          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9357          
##                                           
##  Mcnemar's Test P-Value : 0.07364         
##                                           
##             Sensitivity : 0.9206          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9550          
##              Prevalence : 0.3728          
##          Detection Rate : 0.3432          
##    Detection Prevalence : 0.3432          
##       Balanced Accuracy : 0.9603          
##                                           
##        'Positive' Class : M               
## 
pred_prob_knn_lda <- predict(model_knn_lda, test_df_lda, type="prob")
roc_knn_lda <- roc(test_df_lda$diagnosis, pred_prob_knn_lda$M)
## Setting levels: control = B, case = M
## Setting direction: controls < cases
plot(roc_knn_lda)

Applying Neural Networks (NNET) Methodolgy

model_nnet <- train(diagnosis~.,
                    train_df,
                    method="nnet",
                    metric="ROC",
                    preProcess=c('center', 'scale'),
                    tuneLength=10,
                    trace=FALSE,
                    trControl=fitControl)
pred_nnet <- predict(model_nnet, test_df)
cm_nnet <- confusionMatrix(pred_nnet, test_df$diagnosis, positive = "M")
cm_nnet
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 104   4
##          M   2  59
##                                           
##                Accuracy : 0.9645          
##                  95% CI : (0.9243, 0.9869)
##     No Information Rate : 0.6272          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9236          
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.9365          
##             Specificity : 0.9811          
##          Pos Pred Value : 0.9672          
##          Neg Pred Value : 0.9630          
##              Prevalence : 0.3728          
##          Detection Rate : 0.3491          
##    Detection Prevalence : 0.3609          
##       Balanced Accuracy : 0.9588          
##                                           
##        'Positive' Class : M               
## 

Applying Neural Networks (NNET) on PCA

model_nnet_pca <- train(diagnosis~.,
                    train_df,
                    method="nnet",
                    metric="ROC",
                    preProcess=c('center', 'scale', 'pca'),
                    tuneLength=10,
                    trace=FALSE,
                    trControl=fitControl)
pred_nnet_pca <- predict(model_nnet_pca, test_df)
cm_nnet_pca <- confusionMatrix(pred_nnet_pca, test_df$diagnosis, positive = "M")
cm_nnet_pca
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 103   1
##          M   3  62
##                                           
##                Accuracy : 0.9763          
##                  95% CI : (0.9405, 0.9935)
##     No Information Rate : 0.6272          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9497          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.9841          
##             Specificity : 0.9717          
##          Pos Pred Value : 0.9538          
##          Neg Pred Value : 0.9904          
##              Prevalence : 0.3728          
##          Detection Rate : 0.3669          
##    Detection Prevalence : 0.3846          
##       Balanced Accuracy : 0.9779          
##                                           
##        'Positive' Class : M               
## 

Applying Neural Networks (NNET) on LDA

model_nnet_lda <- train(diagnosis~.,
                    train_df_lda,
                    method="nnet",
                    metric="ROC",
                    preProcess=c('center', 'scale'),
                    tuneLength=10,
                    trace=FALSE,
                    trControl=fitControl)
pred_nnet_lda <- predict(model_nnet_lda, test_df_lda)
cm_nnet_lda <- confusionMatrix(pred_nnet_lda, test_df_lda$diagnosis, positive = "M")
cm_nnet_lda
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 106   1
##          M   0  62
##                                           
##                Accuracy : 0.9941          
##                  95% CI : (0.9675, 0.9999)
##     No Information Rate : 0.6272          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9873          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9841          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9907          
##              Prevalence : 0.3728          
##          Detection Rate : 0.3669          
##    Detection Prevalence : 0.3669          
##       Balanced Accuracy : 0.9921          
##                                           
##        'Positive' Class : M               
## 

Model comparasion

Let’s compare the models and check their correlation:

model_list <- list(KNN = model_knn, PCA_KNN=model_knn_pca, LDA_KNN=model_knn_lda, 
                   NNET=model_nnet, PCA_NNET=model_nnet_pca, LDA_NNET=model_nnet_lda)
resamples <- resamples(model_list)

Correlation between models

model_cor <- modelCor(resamples)
model_cor
##                 KNN    PCA_KNN    LDA_KNN       NNET   PCA_NNET   LDA_NNET
## KNN       1.0000000 -0.5601738  0.4551622  0.5229763 -0.1074780  0.1261569
## PCA_KNN  -0.5601738  1.0000000 -0.6420451 -0.1874635 -0.5447953 -0.7245273
## LDA_KNN   0.4551622 -0.6420451  1.0000000 -0.4581097 -0.2090284  0.8804419
## NNET      0.5229763 -0.1874635 -0.4581097  1.0000000  0.3972921 -0.5409319
## PCA_NNET -0.1074780 -0.5447953 -0.2090284  0.3972921  1.0000000  0.1723862
## LDA_NNET  0.1261569 -0.7245273  0.8804419 -0.5409319  0.1723862  1.0000000

Model Plot

corrplot(model_cor)

bwplot(resamples, metric="ROC")

Some models have a great variability depending of the processed sample. The models PCA_NNET, and PCA_KNN achieve a great auc with some variability.

The ROC metric measure the auc of the roc curve of each model. This metric is independent of any threshold. Prediction classes are obtained by default with a threshold of 0.5 which could not be the best with an unbalanced dataset like this.

cm_list <- list(NNET=cm_nnet, PCA_NNET=cm_nnet_pca, LDA_NNET=cm_nnet_lda, 
                   KNN = cm_knn, PCA_KNN=cm_knn_pca, LDA_KNN=cm_knn_lda)
cm_list_results <- sapply(cm_list, function(x) x$byClass)
cm_list_results
##                           NNET  PCA_NNET  LDA_NNET       KNN   PCA_KNN
## Sensitivity          0.9365079 0.9841270 0.9841270 0.8730159 0.9047619
## Specificity          0.9811321 0.9716981 1.0000000 1.0000000 0.9811321
## Pos Pred Value       0.9672131 0.9538462 1.0000000 1.0000000 0.9661017
## Neg Pred Value       0.9629630 0.9903846 0.9906542 0.9298246 0.9454545
## Precision            0.9672131 0.9538462 1.0000000 1.0000000 0.9661017
## Recall               0.9365079 0.9841270 0.9841270 0.8730159 0.9047619
## F1                   0.9516129 0.9687500 0.9920000 0.9322034 0.9344262
## Prevalence           0.3727811 0.3727811 0.3727811 0.3727811 0.3727811
## Detection Rate       0.3491124 0.3668639 0.3668639 0.3254438 0.3372781
## Detection Prevalence 0.3609467 0.3846154 0.3668639 0.3254438 0.3491124
## Balanced Accuracy    0.9588200 0.9779125 0.9920635 0.9365079 0.9429470
##                        LDA_KNN
## Sensitivity          0.9206349
## Specificity          1.0000000
## Pos Pred Value       1.0000000
## Neg Pred Value       0.9549550
## Precision            1.0000000
## Recall               0.9206349
## F1                   0.9586777
## Prevalence           0.3727811
## Detection Rate       0.3431953
## Detection Prevalence 0.3431953
## Balanced Accuracy    0.9603175

From the above table The best results for sensitivity (detection of breast cases) is LDA_NNET(0.9841270) which also has a great F1 score of 0.9920000.

Purpose of the Analysis

The purpose of the analysis is to find the best Model (based on Machine Learning Methodologies) to predict the breast cancer on the breast cancer wisconsin dataset.

Conclusions

We have found a model based on neural network and LDA preprocessed data with good results over the test set. This model has a sensibility of 0.984 with a F1 score of 0.992

Further we can try to improve the model by

Modify models to use a different metric rather than ROC (auc) which takes in consideration the best threshold

Try different stacking models

```