knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)## corrplot 0.92 loaded
library(gridExtra)##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(pROC)## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(MASS)##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caTools)
library(caret)## Loading required package: lattice
library(caretEnsemble)## Warning: package 'caretEnsemble' was built under R version 4.3.2
##
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
##
## autoplot
Analyze the Breast Cancer DataSet, and apply two machine learning classification models to compare their results and find the model which is best to predict breast cancer.
I am considering Breast Cancer Wisconsin (Diagnostic) DataSet obtained from Kaggle. https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
# URL and CSV file names
url <- "https://raw.githubusercontent.com/jayatveluri/Data622/main/"
csv_name <- "data"
# 1. Read the CSV files into data frames
df <- read_csv(paste0(url, csv_name, ".csv"))## New names:
## • `` -> `...33`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 568 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): diagnosis
## dbl (31): id, radius_mean, texture_mean, perimeter_mean, area_mean, smoothne...
## lgl (1): ...33
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)## Rows: 568
## Columns: 33
## $ id <dbl> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ `concave points_mean` <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ `concave points_se` <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ `concave points_worst` <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
## $ ...33 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
str(df)## spc_tbl_ [568 × 33] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:568] 842302 842517 84300903 84348301 84358402 ...
## $ diagnosis : chr [1:568] "M" "M" "M" "M" ...
## $ radius_mean : num [1:568] 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num [1:568] 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num [1:568] 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num [1:568] 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num [1:568] 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num [1:568] 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num [1:568] 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave points_mean : num [1:568] 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num [1:568] 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num [1:568] 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num [1:568] 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num [1:568] 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num [1:568] 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num [1:568] 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num [1:568] 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num [1:568] 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num [1:568] 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave points_se : num [1:568] 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num [1:568] 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num [1:568] 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num [1:568] 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num [1:568] 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num [1:568] 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num [1:568] 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num [1:568] 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num [1:568] 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num [1:568] 0.712 0.242 0.45 0.687 0.4 ...
## $ concave points_worst : num [1:568] 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num [1:568] 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num [1:568] 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ ...33 : logi [1:568] NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. diagnosis = col_character(),
## .. radius_mean = col_double(),
## .. texture_mean = col_double(),
## .. perimeter_mean = col_double(),
## .. area_mean = col_double(),
## .. smoothness_mean = col_double(),
## .. compactness_mean = col_double(),
## .. concavity_mean = col_double(),
## .. `concave points_mean` = col_double(),
## .. symmetry_mean = col_double(),
## .. fractal_dimension_mean = col_double(),
## .. radius_se = col_double(),
## .. texture_se = col_double(),
## .. perimeter_se = col_double(),
## .. area_se = col_double(),
## .. smoothness_se = col_double(),
## .. compactness_se = col_double(),
## .. concavity_se = col_double(),
## .. `concave points_se` = col_double(),
## .. symmetry_se = col_double(),
## .. fractal_dimension_se = col_double(),
## .. radius_worst = col_double(),
## .. texture_worst = col_double(),
## .. perimeter_worst = col_double(),
## .. area_worst = col_double(),
## .. smoothness_worst = col_double(),
## .. compactness_worst = col_double(),
## .. concavity_worst = col_double(),
## .. `concave points_worst` = col_double(),
## .. symmetry_worst = col_double(),
## .. fractal_dimension_worst = col_double(),
## .. ...33 = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
You can also embed plots, for example:
df$diagnosis <- as.factor(df$diagnosis)
# the 33 column is not right
df[,33] <- NULLsummary(df)## id diagnosis radius_mean texture_mean
## Min. : 8670 B:356 Min. : 6.981 Min. : 9.71
## 1st Qu.: 869222 M:212 1st Qu.:11.707 1st Qu.:16.17
## Median : 906157 Median :13.375 Median :18.84
## Mean : 30425140 Mean :14.139 Mean :19.28
## 3rd Qu.: 8825022 3rd Qu.:15.797 3rd Qu.:21.79
## Max. :911320502 Max. :28.110 Max. :39.28
## perimeter_mean area_mean smoothness_mean compactness_mean
## Min. : 43.79 Min. : 143.5 Min. :0.06251 Min. :0.01938
## 1st Qu.: 75.20 1st Qu.: 420.3 1st Qu.:0.08640 1st Qu.:0.06517
## Median : 86.29 Median : 551.4 Median :0.09589 Median :0.09312
## Mean : 92.05 Mean : 655.7 Mean :0.09644 Mean :0.10445
## 3rd Qu.:104.15 3rd Qu.: 784.1 3rd Qu.:0.10533 3rd Qu.:0.13043
## Max. :188.50 Max. :2501.0 Max. :0.16340 Max. :0.34540
## concavity_mean concave points_mean symmetry_mean fractal_dimension_mean
## Min. :0.00000 Min. :0.00000 Min. :0.1060 Min. :0.04996
## 1st Qu.:0.02958 1st Qu.:0.02035 1st Qu.:0.1620 1st Qu.:0.05770
## Median :0.06155 Median :0.03360 Median :0.1792 Median :0.06155
## Mean :0.08896 Mean :0.04901 Mean :0.1812 Mean :0.06280
## 3rd Qu.:0.13100 3rd Qu.:0.07401 3rd Qu.:0.1957 3rd Qu.:0.06613
## Max. :0.42680 Max. :0.20120 Max. :0.3040 Max. :0.09744
## radius_se texture_se perimeter_se area_se
## Min. :0.1115 Min. :0.3602 Min. : 0.757 Min. : 6.802
## 1st Qu.:0.2324 1st Qu.:0.8331 1st Qu.: 1.605 1st Qu.: 17.850
## Median :0.3240 Median :1.1080 Median : 2.285 Median : 24.565
## Mean :0.4052 Mean :1.2165 Mean : 2.867 Mean : 40.374
## 3rd Qu.:0.4798 3rd Qu.:1.4743 3rd Qu.: 3.360 3rd Qu.: 45.237
## Max. :2.8730 Max. :4.8850 Max. :21.980 Max. :542.200
## smoothness_se compactness_se concavity_se concave points_se
## Min. :0.001713 Min. :0.002252 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.005166 1st Qu.:0.013133 1st Qu.:0.01510 1st Qu.:0.007663
## Median :0.006374 Median :0.020460 Median :0.02592 Median :0.010950
## Mean :0.007041 Mean :0.025515 Mean :0.03195 Mean :0.011817
## 3rd Qu.:0.008151 3rd Qu.:0.032455 3rd Qu.:0.04212 3rd Qu.:0.014730
## Max. :0.031130 Max. :0.135400 Max. :0.39600 Max. :0.052790
## symmetry_se fractal_dimension_se radius_worst texture_worst
## Min. :0.007882 Min. :0.0008948 Min. : 7.93 Min. :12.02
## 1st Qu.:0.015128 1st Qu.:0.0022445 1st Qu.:13.03 1st Qu.:21.07
## Median :0.018725 Median :0.0031955 Median :14.97 Median :25.41
## Mean :0.020531 Mean :0.0037967 Mean :16.28 Mean :25.67
## 3rd Qu.:0.023398 3rd Qu.:0.0045585 3rd Qu.:18.80 3rd Qu.:29.68
## Max. :0.078950 Max. :0.0298400 Max. :36.04 Max. :49.54
## perimeter_worst area_worst smoothness_worst compactness_worst
## Min. : 50.41 Min. : 185.2 Min. :0.07117 Min. :0.02729
## 1st Qu.: 84.15 1st Qu.: 515.7 1st Qu.:0.11660 1st Qu.:0.14758
## Median : 97.67 Median : 686.5 Median :0.13135 Median :0.21300
## Mean :107.35 Mean : 881.7 Mean :0.13244 Mean :0.25460
## 3rd Qu.:125.53 3rd Qu.:1085.0 3rd Qu.:0.14602 3rd Qu.:0.33930
## Max. :251.20 Max. :4254.0 Max. :0.22260 Max. :1.05800
## concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
## Min. :0.0000 Min. :0.00000 Min. :0.1565 Min. :0.05504
## 1st Qu.:0.1159 1st Qu.:0.06497 1st Qu.:0.2504 1st Qu.:0.07147
## Median :0.2275 Median :0.10002 Median :0.2821 Median :0.08005
## Mean :0.2727 Mean :0.11481 Mean :0.2901 Mean :0.08397
## 3rd Qu.:0.3835 3rd Qu.:0.16168 3rd Qu.:0.3180 3rd Qu.:0.09208
## Max. :1.2520 Max. :0.29100 Max. :0.6638 Max. :0.20750
There aren’t NA values.
The dataset is a bit unbalanced:
prop.table(table(df$diagnosis))##
## B M
## 0.6267606 0.3732394
corr_mat <- cor(df[,3:ncol(df)])
corrplot(corr_mat, order = "hclust", tl.cex = 1, addrect = 9)## There is a great correlation between some variablesWe are going to get a training and a testing set to use when building some models:
set.seed(1234)
df_index <- createDataPartition(df$diagnosis, p=0.7, list = FALSE)
train_df <- df[df_index, -1]
test_df <- df[-df_index, -1]Because there is so much correlation, some machine learning models can fail. Let us create a PCA and LDA version of the data
pca_res <- prcomp(df[,3:ncol(df)], center = TRUE, scale = TRUE)
plot(pca_res, type="l")summary(pca_res)## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.6430 2.3887 1.67894 1.40544 1.28662 1.0982 0.81949
## Proportion of Variance 0.4424 0.1902 0.09396 0.06584 0.05518 0.0402 0.02239
## Cumulative Proportion 0.4424 0.6326 0.72653 0.79237 0.84755 0.8878 0.91013
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.68973 0.64618 0.59266 0.54282 0.51175 0.49126 0.39418
## Proportion of Variance 0.01586 0.01392 0.01171 0.00982 0.00873 0.00804 0.00518
## Cumulative Proportion 0.92599 0.93991 0.95162 0.96144 0.97017 0.97821 0.98339
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.30696 0.28022 0.24367 0.22980 0.22256 0.17656 0.1729
## Proportion of Variance 0.00314 0.00262 0.00198 0.00176 0.00165 0.00104 0.0010
## Cumulative Proportion 0.98653 0.98915 0.99113 0.99289 0.99454 0.99558 0.9966
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.16547 0.15629 0.1344 0.12458 0.08929 0.08295 0.03993
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion 0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
## PC29 PC30
## Standard deviation 0.02728 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion 1.00000 1.00000
If we look at the above data, the two first components explains the 0.6324 of the variance, and 10 principal components to explain more than 0.95 of the variance and 17 to explain more than 0.99
pca_df <- as.data.frame(pca_res$x)
ggplot(pca_df, aes(x=PC1, y=PC2, col=df$diagnosis)) + geom_point(alpha=0.5) #### Separating the data
g_pc1 <- ggplot(pca_df, aes(x=PC1, fill=df$diagnosis)) + geom_density(alpha=0.25)
g_pc2 <- ggplot(pca_df, aes(x=PC2, fill=df$diagnosis)) + geom_density(alpha=0.25)
grid.arrange(g_pc1, g_pc2, ncol=2)LDA take in consideration the different classes and could get better results
lda_res <- lda(diagnosis~., df, center = TRUE, scale = TRUE)
lda_df <- predict(lda_res, df)$x %>% as.data.frame() %>% cbind(diagnosis=df$diagnosis)
lda_res## Call:
## lda(diagnosis ~ ., data = df, center = TRUE, scale = TRUE)
##
## Prior probabilities of groups:
## B M
## 0.6267606 0.3732394
##
## Group means:
## id radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## B 26618125 12.15885 17.89615 78.16011 463.5817 0.09258958
## M 36818050 17.46283 21.60491 115.36538 978.3764 0.10289849
## compactness_mean concavity_mean `concave points_mean` symmetry_mean
## B 0.08018705 0.0461870 0.02578965 0.1742295
## M 0.14518778 0.1607747 0.08799000 0.1929090
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## B 0.06287871 0.2837969 1.219797 1.998783 21.14072
## M 0.06268009 0.6090825 1.210915 4.323929 72.67241
## smoothness_se compactness_se concavity_se `concave points_se` symmetry_se
## B 0.007195921 0.02148538 0.02606976 0.009885343 0.02056646
## M 0.006780094 0.03228117 0.04182401 0.015060472 0.02047240
## fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## B 0.003638447 13.39082 23.49581 87.08416 559.7149
## M 0.004062406 21.13481 29.31821 141.37033 1422.2863
## smoothness_worst compactness_worst concavity_worst `concave points_worst`
## B 0.1250578 0.1830047 0.1667047 0.07465346
## M 0.1448452 0.3748241 0.4506056 0.18223731
## symmetry_worst fractal_dimension_worst
## B 0.2701986 0.07946750
## M 0.3234679 0.09152995
##
## Coefficients of linear discriminants:
## LD1
## id -2.471712e-10
## radius_mean -1.116030e+00
## texture_mean 2.226287e-02
## perimeter_mean 1.278227e-01
## area_mean 1.344838e-03
## smoothness_mean 1.704319e+00
## compactness_mean -2.197485e+01
## concavity_mean 7.061429e+00
## `concave points_mean` 9.772217e+00
## symmetry_mean 6.537618e-01
## fractal_dimension_mean 2.934736e+00
## radius_se 2.108651e+00
## texture_se -3.815391e-02
## perimeter_se -1.179492e-01
## area_se -3.885717e-03
## smoothness_se 7.983966e+01
## compactness_se 8.614795e-01
## concavity_se -1.766068e+01
## `concave points_se` 5.401010e+01
## symmetry_se 8.214913e+00
## fractal_dimension_se -3.782109e+01
## radius_worst 9.608121e-01
## texture_worst 3.587123e-02
## perimeter_worst -1.205477e-02
## area_worst -4.960733e-03
## smoothness_worst 2.517400e+00
## compactness_worst 4.427600e-01
## concavity_worst 1.845604e+00
## `concave points_worst` 2.354210e+00
## symmetry_worst 2.702041e+00
## fractal_dimension_worst 2.109905e+01
ggplot(lda_df, aes(x=LD1, y=0, col=diagnosis)) + geom_point(alpha=0.5)ggplot(lda_df, aes(x=LD1, fill=diagnosis)) + geom_density(alpha=0.5)train_df_lda <- lda_df[df_index,]
test_df_lda <- lda_df[-df_index,]fitControl <- trainControl(method="cv",
number = 5,
preProcOptions = list(thresh = 0.99), # threshold for pca preprocess
classProbs = TRUE,
summaryFunction = twoClassSummary)model_knn <- train(diagnosis~.,
train_df,
method="knn",
metric="ROC",
preProcess = c('center', 'scale'),
tuneLength=10,
trControl=fitControl)pred_knn <- predict(model_knn, test_df)
cm_knn <- confusionMatrix(pred_knn, test_df$diagnosis, positive = "M")
cm_knn## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 106 8
## M 0 55
##
## Accuracy : 0.9527
## 95% CI : (0.9089, 0.9793)
## No Information Rate : 0.6272
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8961
##
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 0.8730
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9298
## Prevalence : 0.3728
## Detection Rate : 0.3254
## Detection Prevalence : 0.3254
## Balanced Accuracy : 0.9365
##
## 'Positive' Class : M
##
pred_prob_knn <- predict(model_knn, test_df, type="prob")
roc_knn <- roc(test_df$diagnosis, pred_prob_knn$M)## Setting levels: control = B, case = M
## Setting direction: controls < cases
plot(roc_knn) #### Applying KNN ON PCA
model_knn_pca <- train(diagnosis~.,
train_df,
method="knn",
metric="ROC",
preProcess = c('center', 'scale', 'pca'),
tuneLength=10,
trControl=fitControl)pred_knn_pca <- predict(model_knn_pca, test_df)
cm_knn_pca <- confusionMatrix(pred_knn_pca, test_df$diagnosis, positive = "M")
cm_knn_pca## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 104 6
## M 2 57
##
## Accuracy : 0.9527
## 95% CI : (0.9089, 0.9793)
## No Information Rate : 0.6272
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8975
##
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.9048
## Specificity : 0.9811
## Pos Pred Value : 0.9661
## Neg Pred Value : 0.9455
## Prevalence : 0.3728
## Detection Rate : 0.3373
## Detection Prevalence : 0.3491
## Balanced Accuracy : 0.9429
##
## 'Positive' Class : M
##
pred_prob_knn_pca <- predict(model_knn_pca, test_df, type="prob")
roc_knn_pca <- roc(test_df$diagnosis, pred_prob_knn_pca$M)## Setting levels: control = B, case = M
## Setting direction: controls < cases
plot(roc_knn_pca)model_knn_lda <- train(diagnosis~.,
train_df_lda,
method="knn",
metric="ROC",
preProcess = c('center', 'scale'),
tuneLength=10,
trControl=fitControl)pred_knn_lda <- predict(model_knn_lda, test_df_lda)
cm_knn_lda <- confusionMatrix(pred_knn_lda, test_df_lda$diagnosis, positive = "M")
cm_knn_lda## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 106 5
## M 0 58
##
## Accuracy : 0.9704
## 95% CI : (0.9323, 0.9903)
## No Information Rate : 0.6272
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9357
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.9206
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9550
## Prevalence : 0.3728
## Detection Rate : 0.3432
## Detection Prevalence : 0.3432
## Balanced Accuracy : 0.9603
##
## 'Positive' Class : M
##
pred_prob_knn_lda <- predict(model_knn_lda, test_df_lda, type="prob")
roc_knn_lda <- roc(test_df_lda$diagnosis, pred_prob_knn_lda$M)## Setting levels: control = B, case = M
## Setting direction: controls < cases
plot(roc_knn_lda)model_nnet <- train(diagnosis~.,
train_df,
method="nnet",
metric="ROC",
preProcess=c('center', 'scale'),
tuneLength=10,
trace=FALSE,
trControl=fitControl)pred_nnet <- predict(model_nnet, test_df)
cm_nnet <- confusionMatrix(pred_nnet, test_df$diagnosis, positive = "M")
cm_nnet## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 104 4
## M 2 59
##
## Accuracy : 0.9645
## 95% CI : (0.9243, 0.9869)
## No Information Rate : 0.6272
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9236
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9365
## Specificity : 0.9811
## Pos Pred Value : 0.9672
## Neg Pred Value : 0.9630
## Prevalence : 0.3728
## Detection Rate : 0.3491
## Detection Prevalence : 0.3609
## Balanced Accuracy : 0.9588
##
## 'Positive' Class : M
##
model_nnet_pca <- train(diagnosis~.,
train_df,
method="nnet",
metric="ROC",
preProcess=c('center', 'scale', 'pca'),
tuneLength=10,
trace=FALSE,
trControl=fitControl)pred_nnet_pca <- predict(model_nnet_pca, test_df)
cm_nnet_pca <- confusionMatrix(pred_nnet_pca, test_df$diagnosis, positive = "M")
cm_nnet_pca## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 103 1
## M 3 62
##
## Accuracy : 0.9763
## 95% CI : (0.9405, 0.9935)
## No Information Rate : 0.6272
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9497
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9841
## Specificity : 0.9717
## Pos Pred Value : 0.9538
## Neg Pred Value : 0.9904
## Prevalence : 0.3728
## Detection Rate : 0.3669
## Detection Prevalence : 0.3846
## Balanced Accuracy : 0.9779
##
## 'Positive' Class : M
##
model_nnet_lda <- train(diagnosis~.,
train_df_lda,
method="nnet",
metric="ROC",
preProcess=c('center', 'scale'),
tuneLength=10,
trace=FALSE,
trControl=fitControl)pred_nnet_lda <- predict(model_nnet_lda, test_df_lda)
cm_nnet_lda <- confusionMatrix(pred_nnet_lda, test_df_lda$diagnosis, positive = "M")
cm_nnet_lda## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 106 1
## M 0 62
##
## Accuracy : 0.9941
## 95% CI : (0.9675, 0.9999)
## No Information Rate : 0.6272
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9873
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9841
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9907
## Prevalence : 0.3728
## Detection Rate : 0.3669
## Detection Prevalence : 0.3669
## Balanced Accuracy : 0.9921
##
## 'Positive' Class : M
##
Let’s compare the models and check their correlation:
model_list <- list(KNN = model_knn, PCA_KNN=model_knn_pca, LDA_KNN=model_knn_lda,
NNET=model_nnet, PCA_NNET=model_nnet_pca, LDA_NNET=model_nnet_lda)
resamples <- resamples(model_list)model_cor <- modelCor(resamples)
model_cor## KNN PCA_KNN LDA_KNN NNET PCA_NNET LDA_NNET
## KNN 1.0000000 -0.5601738 0.4551622 0.5229763 -0.1074780 0.1261569
## PCA_KNN -0.5601738 1.0000000 -0.6420451 -0.1874635 -0.5447953 -0.7245273
## LDA_KNN 0.4551622 -0.6420451 1.0000000 -0.4581097 -0.2090284 0.8804419
## NNET 0.5229763 -0.1874635 -0.4581097 1.0000000 0.3972921 -0.5409319
## PCA_NNET -0.1074780 -0.5447953 -0.2090284 0.3972921 1.0000000 0.1723862
## LDA_NNET 0.1261569 -0.7245273 0.8804419 -0.5409319 0.1723862 1.0000000
corrplot(model_cor)bwplot(resamples, metric="ROC")Some models have a great variability depending of the processed sample. The models PCA_NNET, and PCA_KNN achieve a great auc with some variability.
The ROC metric measure the auc of the roc curve of each model. This metric is independent of any threshold. Prediction classes are obtained by default with a threshold of 0.5 which could not be the best with an unbalanced dataset like this.
cm_list <- list(NNET=cm_nnet, PCA_NNET=cm_nnet_pca, LDA_NNET=cm_nnet_lda,
KNN = cm_knn, PCA_KNN=cm_knn_pca, LDA_KNN=cm_knn_lda)cm_list_results <- sapply(cm_list, function(x) x$byClass)
cm_list_results## NNET PCA_NNET LDA_NNET KNN PCA_KNN
## Sensitivity 0.9365079 0.9841270 0.9841270 0.8730159 0.9047619
## Specificity 0.9811321 0.9716981 1.0000000 1.0000000 0.9811321
## Pos Pred Value 0.9672131 0.9538462 1.0000000 1.0000000 0.9661017
## Neg Pred Value 0.9629630 0.9903846 0.9906542 0.9298246 0.9454545
## Precision 0.9672131 0.9538462 1.0000000 1.0000000 0.9661017
## Recall 0.9365079 0.9841270 0.9841270 0.8730159 0.9047619
## F1 0.9516129 0.9687500 0.9920000 0.9322034 0.9344262
## Prevalence 0.3727811 0.3727811 0.3727811 0.3727811 0.3727811
## Detection Rate 0.3491124 0.3668639 0.3668639 0.3254438 0.3372781
## Detection Prevalence 0.3609467 0.3846154 0.3668639 0.3254438 0.3491124
## Balanced Accuracy 0.9588200 0.9779125 0.9920635 0.9365079 0.9429470
## LDA_KNN
## Sensitivity 0.9206349
## Specificity 1.0000000
## Pos Pred Value 1.0000000
## Neg Pred Value 0.9549550
## Precision 1.0000000
## Recall 0.9206349
## F1 0.9586777
## Prevalence 0.3727811
## Detection Rate 0.3431953
## Detection Prevalence 0.3431953
## Balanced Accuracy 0.9603175
From the above table The best results for sensitivity (detection of breast cases) is LDA_NNET(0.9841270) which also has a great F1 score of 0.9920000.
The purpose of the analysis is to find the best Model (based on Machine Learning Methodologies) to predict the breast cancer on the breast cancer wisconsin dataset.
We have found a model based on neural network and LDA preprocessed data with good results over the test set. This model has a sensibility of 0.984 with a F1 score of 0.992
Further we can try to improve the model by
Modify models to use a different metric rather than ROC (auc) which takes in consideration the best threshold
Try different stacking models
```