# install.packages("caretEnsemble")
# install.packages("Amelia")
# install.packages("psych")
# install.packages("mice")
# install.packages("GGally")
# install.packages("gbm")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.3 ✔ dplyr 1.0.7
## ✔ tidyr 1.1.3 ✔ stringr 1.4.0
## ✔ readr 2.0.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(caret) # confusionMatrix function
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(caretEnsemble)
##
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
##
## autoplot
library(MASS) # lda function
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(class) # kNN
library(psych)
## Warning: package 'psych' was built under R version 4.1.2
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.1.2
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
## Warning: package 'mice' was built under R version 4.1.2
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(rpart)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Warning: package 'gbm' was built under R version 4.1.2
## Loaded gbm 2.1.8.1
library(readr)
wdbc <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
col_names = FALSE)
## Rows: 569 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X2
## dbl (31): X1, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16,...
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
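As the message suggests, the column types can be declared up front; a minimal sketch of an equivalent read (same URL, types pinned explicitly):
# X2 is the diagnosis label (character); every other column is numeric.
wdbc <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
                 col_names = FALSE,
                 col_types = cols(X2 = col_character(), .default = col_double()))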
names(wdbc) <- c('id_number', 'diagnosis', 'radius_mean',
'texture_mean', 'perimeter_mean', 'area_mean',
'smoothness_mean', 'compactness_mean',
'concavity_mean','concave_points_mean',
'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se',
'area_se', 'smoothness_se', 'compactness_se',
'concavity_se', 'concave_points_se',
'symmetry_se', 'fractal_dimension_se',
'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst',
'smoothness_worst', 'compactness_worst',
'concavity_worst', 'concave_points_worst',
'symmetry_worst', 'fractal_dimension_worst')
glimpse(wdbc)
## Rows: 569
## Columns: 32
## $ id_number <dbl> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave_points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave_points_se <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave_points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
# Remove the first column (id_number) because it is not a real predictor.
# Keep the response variable (diagnosis) and the predictors containing 'mean'.
# Check the values (B/M) of the response variable and convert them to 0/1.
wdbc <- wdbc %>%
  dplyr::select(diagnosis, contains("mean")) %>%
  mutate(bc = ifelse(diagnosis == "M", 1, ifelse(diagnosis == "B", 0, NA))) %>%
  dplyr::select(-diagnosis)
head(wdbc)
dim(wdbc)
## [1] 569 11
table(wdbc$bc)
##
## 0 1
## 357 212
# Checking missing values (the missmap function comes from the Amelia package)
library(Amelia)
missmap(wdbc)
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
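Since mice is loaded as well, md.pattern() gives a complementary tabular view of the same missingness information; a one-line check:
# Tabulate missing-data patterns; all 569 rows should show up as complete.
md.pattern(wdbc)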
# Checking bivariate relationships among variables
ggpairs(wdbc)
library(dplyr)
# Standardization: center each predictor and scale it to unit variance
scale2 <- function(x, na.rm = FALSE) (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
wdbc <- wdbc %>%
  mutate_at(vars(-bc), scale2)
# Provide a seed number for reproducibility
set.seed(34859)
nr <- nrow(wdbc)
tr.id <- sample(nr, floor(0.7*nr), replace=FALSE, prob=NULL)
tr.dat <- wdbc[tr.id,]
ts.dat <- wdbc[-tr.id,]
tr.x <- tr.dat %>% dplyr::select(-bc)
ts.x <- ts.dat %>% dplyr::select(-bc)
ts.y <- ts.dat %>% dplyr::select(bc)
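Note that the standardization above used means and SDs computed on the full data set, so a little test-set information leaks into the preprocessing. A stricter alternative (a sketch, not used in what follows) estimates the scaling on the training rows only, e.g. with caret::preProcess:
# Fit centering/scaling on tr.x alone, then apply the same parameters to both sets.
pp <- preProcess(tr.x, method = c("center", "scale"))
tr.x.scaled <- predict(pp, tr.x)
ts.x.scaled <- predict(pp, ts.x)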
# Note: glm() defaults to family = gaussian, so this call actually fits a
# linear probability model; add family = binomial for logistic regression proper.
logis <- glm(bc~., data = tr.dat)
summary(logis)
##
## Call:
## glm(formula = bc ~ ., data = tr.dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.67175 -0.17671 -0.03821 0.17852 0.80928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.38046 0.01386 27.451 < 2e-16 ***
## radius_mean 1.77547 0.51256 3.464 0.000592 ***
## texture_mean 0.11125 0.01558 7.141 4.61e-12 ***
## perimeter_mean -1.41977 0.56730 -2.503 0.012738 *
## area_mean -0.31358 0.09610 -3.263 0.001200 **
## smoothness_mean 0.02102 0.02360 0.891 0.373633
## compactness_mean 0.02493 0.06284 0.397 0.691770
## concavity_mean 0.03086 0.04474 0.690 0.490746
## concave_points_mean 0.25856 0.06300 4.104 4.96e-05 ***
## symmetry_mean 0.03700 0.01788 2.070 0.039153 *
## fractal_dimension_mean -0.01293 0.03592 -0.360 0.719084
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07597603)
##
## Null deviance: 95.068 on 397 degrees of freedom
## Residual deviance: 29.403 on 387 degrees of freedom
## AIC: 116.54
##
## Number of Fisher Scoring iterations: 2
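For comparison, logistic regression proper needs only the family argument; a minimal sketch on the same split (output not shown):
# Bernoulli response with a logit link; type = "response" then returns P(bc = 1).
logis.bin <- glm(bc ~ ., family = binomial, data = tr.dat)
logis.bin.prob <- predict(logis.bin, ts.dat, type = "response")
logis.bin.pred <- ifelse(logis.bin.prob >= 0.5, 1, 0)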
# A simple predictive model with significant predictors
logis.sig <- glm(bc~ radius_mean + texture_mean + perimeter_mean + area_mean
+ concave_points_mean + symmetry_mean, data = tr.dat)
# Predicted probability
logis.prob <- predict(logis.sig, ts.dat, type="response")
# Classification (y_hat)
logis.pred <- ifelse(logis.prob >= 0.5, 1, 0)
# The confusionMatrix function requires factors for both the predictions and the reference.
(logis.conf <- confusionMatrix(factor(logis.pred), factor(ts.dat$bc), positive = "1", mode="everything"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 114 6
## 1 2 49
##
## Accuracy : 0.9532
## 95% CI : (0.9099, 0.9796)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8907
##
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.8909
## Specificity : 0.9828
## Pos Pred Value : 0.9608
## Neg Pred Value : 0.9500
## Precision : 0.9608
## Recall : 0.8909
## F1 : 0.9245
## Prevalence : 0.3216
## Detection Rate : 0.2865
## Detection Prevalence : 0.2982
## Balanced Accuracy : 0.9368
##
## 'Positive' Class : 1
##
lda.fit <- lda(bc ~ ., data = tr.dat)
lda.pred <- predict(lda.fit, ts.dat)$class
lda.conf <- confusionMatrix(factor(lda.pred), factor(ts.dat$bc), positive ="1", mode ="everything")
lda.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 114 6
## 1 2 49
##
## Accuracy : 0.9532
## 95% CI : (0.9099, 0.9796)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8907
##
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.8909
## Specificity : 0.9828
## Pos Pred Value : 0.9608
## Neg Pred Value : 0.9500
## Precision : 0.9608
## Recall : 0.8909
## F1 : 0.9245
## Prevalence : 0.3216
## Detection Rate : 0.2865
## Detection Prevalence : 0.2982
## Balanced Accuracy : 0.9368
##
## 'Positive' Class : 1
##
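predict.lda also returns class posterior probabilities, so a cutoff other than the implicit 0.5 can be used; a short sketch:
# Posterior probability of class "1"; lowering the cutoff trades specificity for sensitivity.
lda.post <- predict(lda.fit, ts.dat)$posterior[, "1"]
lda.pred.03 <- ifelse(lda.post >= 0.3, 1, 0)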
##Quadratic Discriminant Analysis
qda.fit <- qda(bc ~ ., data = tr.dat)
qda.pred <- predict(qda.fit, ts.dat)$class
qda.conf <- confusionMatrix(factor(qda.pred), factor(ts.dat$bc), positive ="1", mode ="everything")
qda.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 114 6
## 1 2 49
##
## Accuracy : 0.9532
## 95% CI : (0.9099, 0.9796)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8907
##
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.8909
## Specificity : 0.9828
## Pos Pred Value : 0.9608
## Neg Pred Value : 0.9500
## Precision : 0.9608
## Recall : 0.8909
## F1 : 0.9245
## Prevalence : 0.3216
## Detection Rate : 0.2865
## Detection Prevalence : 0.2982
## Balanced Accuracy : 0.9368
##
## 'Positive' Class : 1
##
##kNN
# The knn function is available in the class package.
# Pass only the predictor columns: including the response bc in the data
# matrices (as in knn(tr.dat, ts.dat, ...)) leaks the label into the distance
# computation and inflates the reported accuracy.
knn.pred <- knn(tr.x, ts.x, cl = tr.dat$bc, k=7)
knn.conf <- confusionMatrix(factor(knn.pred), factor(ts.dat$bc), positive ="1", mode ="everything")
knn.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 116 2
## 1 0 53
##
## Accuracy : 0.9883
## 95% CI : (0.9584, 0.9986)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9729
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9636
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9831
## Precision : 1.0000
## Recall : 0.9636
## F1 : 0.9815
## Prevalence : 0.3216
## Detection Rate : 0.3099
## Detection Prevalence : 0.3099
## Balanced Accuracy : 0.9818
##
## 'Positive' Class : 1
##
# How can we incorporate different distance metrics in the knn function?
# class::knn is Euclidean-only; the caret package (method = "knn") adds
# resampling-based tuning, and the kknn package supports Minkowski distances.
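For instance, a sketch of tuning k by 10-fold cross-validation with caret (Euclidean distance; reuses tr.x and ts.x from above):
knn.caret <- train(x = tr.x, y = factor(tr.dat$bc), method = "knn",
                   tuneGrid = data.frame(k = seq(3, 15, by = 2)),
                   trControl = trainControl(method = "cv", number = 10))
knn.caret.pred <- predict(knn.caret, newdata = ts.x)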
##Naive Bayes
# Naive Bayes classifiers are available in the e1071, klaR, naivebayes, bnclassify, caret, and h2o packages
library(caret)
library(klaR)
## Warning: package 'klaR' was built under R version 4.1.2
library(e1071)
nb.fit = train(x = tr.x, y = factor(tr.dat$bc), method = "nb",
trControl=trainControl(method='cv', number=10))
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 19
## (Similar "Setting row names" and "Numerical 0 probability" warnings are
## repeated for other observations across the 10 cross-validation folds;
## output truncated.)
nb.fit
## Naive Bayes
##
## 398 samples
## 10 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 357, 358, 358, 359, 358, 358, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.9096060 0.8093107
## TRUE 0.9147952 0.8207147
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
## = 1.
nb.pred <- predict(nb.fit, newdata = ts.x)
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 1
## (Similar warnings for observations 25, 27, 36, 112, 146, and 171 omitted.)
table(nb.pred)
## nb.pred
## 0 1
## 115 56
nb.conf <- confusionMatrix(factor(nb.pred), factor(ts.dat$bc), positive = "1", mode ="everything")
nb.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 111 4
## 1 5 51
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.88
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9273
## Specificity : 0.9569
## Pos Pred Value : 0.9107
## Neg Pred Value : 0.9652
## Precision : 0.9107
## Recall : 0.9273
## F1 : 0.9189
## Prevalence : 0.3216
## Detection Rate : 0.2982
## Detection Prevalence : 0.3275
## Balanced Accuracy : 0.9421
##
## 'Positive' Class : 1
##
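For reference, the same classifier can be fit directly with e1071 (a sketch; this corresponds to usekernel = FALSE above, i.e. Gaussian class-conditional densities):
nb.e1071 <- naiveBayes(x = tr.x, y = factor(tr.dat$bc))
nb.e1071.pred <- predict(nb.e1071, newdata = ts.x)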
##Decision Tree
# Decision trees can be implemented using the tree, rpart, and other packages
library(rpart)
library(rpart.plot)
dc.fit <- rpart(factor(bc) ~., data=tr.dat, method="class",control=rpart.control(minsplit=30, cp=0.001))
rpart.plot(dc.fit, main ="Prediction of Breast Cancer")
dc.prune.fit <- prune(dc.fit, dc.fit$cptable[which.min(dc.fit$cptable[,"xerror"]),"CP"])
summary(dc.prune.fit)
## Call:
## rpart(formula = factor(bc) ~ ., data = tr.dat, method = "class",
## control = rpart.control(minsplit = 30, cp = 0.001))
## n= 398
##
## CP nsplit rel error xerror xstd
## 1 0.77070064 0 1.0000000 1.0000000 0.06210365
## 2 0.03184713 1 0.2292994 0.2547771 0.03820589
##
## Variable importance
## concave_points_mean concavity_mean perimeter_mean radius_mean
## 22 18 16 15
## area_mean compactness_mean
## 15 14
##
## Node number 1: 398 observations, complexity param=0.7707006
## predicted class=0 expected loss=0.3944724 P(node) =1
## class counts: 241 157
## probabilities: 0.606 0.394
## left son=2 (239 obs) right son=3 (159 obs)
## Primary splits:
## concave_points_mean < 0.0726971 to the left, improve=125.0950, (0 missing)
## area_mean < 0.1175312 to the left, improve=115.7720, (0 missing)
## perimeter_mean < 0.3259794 to the left, improve=114.7733, (0 missing)
## radius_mean < 0.3256789 to the left, improve=113.2904, (0 missing)
## concavity_mean < 0.00621783 to the left, improve=108.0154, (0 missing)
## Surrogate splits:
## concavity_mean < -0.08020987 to the left, agree=0.917, adj=0.792, (0 split)
## perimeter_mean < 0.2374983 to the left, agree=0.882, adj=0.704, (0 split)
## radius_mean < 0.3299353 to the left, agree=0.872, adj=0.679, (0 split)
## area_mean < 0.2044842 to the left, agree=0.867, adj=0.667, (0 split)
## compactness_mean < -0.04621959 to the left, agree=0.849, adj=0.623, (0 split)
##
## Node number 2: 239 observations
## predicted class=0 expected loss=0.07112971 P(node) =0.6005025
## class counts: 222 17
## probabilities: 0.929 0.071
##
## Node number 3: 159 observations
## predicted class=1 expected loss=0.1194969 P(node) =0.3994975
## class counts: 19 140
## probabilities: 0.119 0.881
rpart.plot(dc.prune.fit, main ="Prediction of Breast Cancer with Pruning")
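A common variant of the pruning rule used above is the 1-SE rule: keep the simplest tree whose cross-validated error is within one standard error of the minimum. A sketch:
cp.tab <- dc.fit$cptable
xerr.min <- which.min(cp.tab[, "xerror"])
thresh <- cp.tab[xerr.min, "xerror"] + cp.tab[xerr.min, "xstd"]
cp.1se <- cp.tab[which(cp.tab[, "xerror"] <= thresh)[1], "CP"]  # first (simplest) CP under the threshold
dc.prune.1se <- prune(dc.fit, cp = cp.1se)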
dc.pred <- predict(dc.fit, newdata = ts.x, type = "class")
dc.conf <- confusionMatrix(factor(dc.pred), factor(ts.dat$bc), positive ="1", mode="everything")
dc.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 111 7
## 1 5 48
##
## Accuracy : 0.9298
## 95% CI : (0.8806, 0.9632)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : 2.034e-15
##
## Kappa : 0.8376
##
## Mcnemar's Test P-Value : 0.7728
##
## Sensitivity : 0.8727
## Specificity : 0.9569
## Pos Pred Value : 0.9057
## Neg Pred Value : 0.9407
## Precision : 0.9057
## Recall : 0.8727
## F1 : 0.8889
## Prevalence : 0.3216
## Detection Rate : 0.2807
## Detection Prevalence : 0.3099
## Balanced Accuracy : 0.9148
##
## 'Positive' Class : 1
##
dc.prune.pred <- predict(dc.prune.fit, newdata = ts.x, type = "class")
dc.prune.conf <- confusionMatrix(factor(dc.prune.pred), factor(ts.dat$bc), positive ="1", mode="everything")
dc.prune.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 107 3
## 1 9 52
##
## Accuracy : 0.9298
## 95% CI : (0.8806, 0.9632)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : 2.034e-15
##
## Kappa : 0.8437
##
## Mcnemar's Test P-Value : 0.1489
##
## Sensitivity : 0.9455
## Specificity : 0.9224
## Pos Pred Value : 0.8525
## Neg Pred Value : 0.9727
## Precision : 0.8525
## Recall : 0.9455
## F1 : 0.8966
## Prevalence : 0.3216
## Detection Rate : 0.3041
## Detection Prevalence : 0.3567
## Balanced Accuracy : 0.9339
##
## 'Positive' Class : 1
##
##Random Forest
In random forests, the number of trees (ntree) and the number of predictors tried at each split (mtry) are important arguments. We chose ntree = 500 and mtry = ⌈√p⌉.
library(randomForest)
rf.fit= randomForest(factor(bc)~., data = tr.dat, ntree = 500, mtry = ceiling(sqrt(ncol(tr.dat))))
rf.fit
##
## Call:
## randomForest(formula = factor(bc) ~ ., data = tr.dat, ntree = 500, mtry = ceiling(sqrt(ncol(tr.dat))))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 6.28%
## Confusion matrix:
## 0 1 class.error
## 0 230 11 0.04564315
## 1 14 143 0.08917197
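Before predicting, it is worth checking which predictors drive the forest via the mean decrease in Gini impurity:
# Variable importance table and the corresponding dotchart
importance(rf.fit)
varImpPlot(rf.fit, main = "Random Forest Variable Importance")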
rf.pred = predict(rf.fit, ts.x, type ="response")
rf.conf <- confusionMatrix(factor(rf.pred), factor(ts.dat$bc), positive ="1", mode="everything")
rf.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 111 3
## 1 5 52
##
## Accuracy : 0.9532
## 95% CI : (0.9099, 0.9796)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8938
##
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.9455
## Specificity : 0.9569
## Pos Pred Value : 0.9123
## Neg Pred Value : 0.9737
## Precision : 0.9123
## Recall : 0.9455
## F1 : 0.9286
## Prevalence : 0.3216
## Detection Rate : 0.3041
## Detection Prevalence : 0.3333
## Balanced Accuracy : 0.9512
##
## 'Positive' Class : 1
##
##Boosting
library(gbm)
set.seed(123)
boost.fit <- gbm(bc ~ ., data = tr.dat, distribution = "bernoulli", n.trees = 500, cv.folds = 5,
                 shrinkage = 0.1, interaction.depth = 2, n.cores = NULL)
par(mar = c(5,8,1,1))
summary(boost.fit, cBars = 10, method = relative.influence, las = 2)
# To find the best hyperparameters, you can use a grid search
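Since cv.folds was set above, gbm can also estimate the optimal number of boosting iterations, which typically predicts better than always using all 500 trees; a sketch:
best.iter <- gbm.perf(boost.fit, method = "cv")  # CV-optimal iteration count
boost.pred.prob.cv <- predict(boost.fit, ts.x, n.trees = best.iter, type = "response")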
boost.pred.prob = predict(boost.fit, ts.x, n.trees = boost.fit$n.trees, type="response")
boost.pred <- factor(ifelse(boost.pred.prob>0.5, 1,0))
boost.conf <- confusionMatrix(factor(boost.pred), factor(ts.dat$bc), positive ="1", mode ="everything")
boost.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 111 4
## 1 5 51
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.88
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9273
## Specificity : 0.9569
## Pos Pred Value : 0.9107
## Neg Pred Value : 0.9652
## Precision : 0.9107
## Recall : 0.9273
## F1 : 0.9189
## Prevalence : 0.3216
## Detection Rate : 0.2982
## Detection Prevalence : 0.3275
## Balanced Accuracy : 0.9421
##
## 'Positive' Class : 1
##
##Support Vector Machine
An important hyperparameter choice is the kernel.
# SVM classifiers are available in the e1071, kernlab, and caret packages
library(e1071)
# Linear kernel
svm.fit = svm(factor(bc) ~ ., data = tr.dat, kernel = "linear", cost = 10, scale = FALSE, type ="C-classification")
summary(svm.fit)
##
## Call:
## svm(formula = factor(bc) ~ ., data = tr.dat, kernel = "linear", cost = 10,
## type = "C-classification", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
##
## Number of Support Vectors: 59
##
## ( 29 30 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svm.pred <- predict(svm.fit, newdata = ts.x)
svm.conf <- confusionMatrix(factor(svm.pred), factor(ts.dat$bc), positive ="1", mode = "everything")
svm.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 112 5
## 1 4 50
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8788
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9091
## Specificity : 0.9655
## Pos Pred Value : 0.9259
## Neg Pred Value : 0.9573
## Precision : 0.9259
## Recall : 0.9091
## F1 : 0.9174
## Prevalence : 0.3216
## Detection Rate : 0.2924
## Detection Prevalence : 0.3158
## Balanced Accuracy : 0.9373
##
## 'Positive' Class : 1
##
# Polynomial kernel
svm.fit1 = svm(factor(bc) ~ ., data = tr.dat, kernel = "polynomial", cost = 10, scale = FALSE, type ="C-classification")
summary(svm.fit1)
##
## Call:
## svm(formula = factor(bc) ~ ., data = tr.dat, kernel = "polynomial",
## cost = 10, type = "C-classification", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 10
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 88
##
## ( 42 46 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svm.pred1 <- predict(svm.fit1, newdata = ts.x)
svm.conf1 <- confusionMatrix(factor(svm.pred1), factor(ts.dat$bc), positive ="1", mode = "everything")
svm.conf1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 114 11
## 1 2 44
##
## Accuracy : 0.924
## 95% CI : (0.8735, 0.9589)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : 1.2e-14
##
## Kappa : 0.818
##
## Mcnemar's Test P-Value : 0.0265
##
## Sensitivity : 0.8000
## Specificity : 0.9828
## Pos Pred Value : 0.9565
## Neg Pred Value : 0.9120
## Precision : 0.9565
## Recall : 0.8000
## F1 : 0.8713
## Prevalence : 0.3216
## Detection Rate : 0.2573
## Detection Prevalence : 0.2690
## Balanced Accuracy : 0.8914
##
## 'Positive' Class : 1
##
# Radial (RBF) kernel, another nonlinear option
# To see the available kernel types, run ?svm in the console
svm.fit2 = svm(factor(bc) ~ ., data = tr.dat, kernel = "radial", cost = 10, scale = FALSE)
summary(svm.fit2)
svm.pred2 <- predict(svm.fit2, newdata = ts.x)
svm.conf2 <- confusionMatrix(factor(svm.pred2), factor(ts.dat$bc), positive ="1", mode = "everything")
svm.conf2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 112 5
## 1 4 50
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8788
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9091
## Specificity : 0.9655
## Pos Pred Value : 0.9259
## Neg Pred Value : 0.9573
## Precision : 0.9259
## Recall : 0.9091
## F1 : 0.9174
## Prevalence : 0.3216
## Detection Rate : 0.2924
## Detection Prevalence : 0.3158
## Balanced Accuracy : 0.9373
##
## 'Positive' Class : 1
##
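The cost values above (and gamma for the radial kernel) were fixed by hand; e1071::tune can search them by cross-validation instead. A sketch over an assumed grid:
svm.tuned <- tune(svm, factor(bc) ~ ., data = tr.dat, kernel = "radial",
                  ranges = list(cost = c(0.1, 1, 10, 100),
                                gamma = c(0.01, 0.1, 1)))
summary(svm.tuned)
svm.best <- svm.tuned$best.model  # model refit at the best (cost, gamma)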
##Artificial Neural Network
#install.packages("neuralnet")
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
set.seed(123)
## hidden = c(5, 3): hidden layer 1 has 5 nodes and hidden layer 2 has 3 nodes
nn.fit <- neuralnet(factor(bc) ~ ., data = tr.dat, hidden = c(5, 3), linear.output = FALSE, err.fct = "ce", likelihood = TRUE)
# For regression, linear.output = TRUE
plot(nn.fit, rep ="best")
nn.prob <- compute(nn.fit, ts.x)
nn.pred <- ifelse(nn.prob$net.result[,2] >= 0.5, 1,0)
nn.conf <- confusionMatrix(factor(nn.pred), factor(ts.dat$bc), positive ="1", mode="everything")
nn.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 107 5
## 1 9 50
##
## Accuracy : 0.9181
## 95% CI : (0.8664, 0.9545)
## No Information Rate : 0.6784
## P-Value [Acc > NIR] : 6.53e-14
##
## Kappa : 0.8159
##
## Mcnemar's Test P-Value : 0.4227
##
## Sensitivity : 0.9091
## Specificity : 0.9224
## Pos Pred Value : 0.8475
## Neg Pred Value : 0.9554
## Precision : 0.8475
## Recall : 0.9091
## F1 : 0.8772
## Prevalence : 0.3216
## Detection Rate : 0.2924
## Detection Prevalence : 0.3450
## Balanced Accuracy : 0.9158
##
## 'Positive' Class : 1
##
##Summary of classification algorithms
logis.result <- c(logis.conf$overall[1], logis.conf$byClass[c(1:4,7)])
lda.result <- c(lda.conf$overall[1], lda.conf$byClass[c(1:4,7)])
qda.result <- c(qda.conf$overall[1], qda.conf$byClass[c(1:4,7)])
knn.result <- c(knn.conf$overall[1], knn.conf$byClass[c(1:4,7)])
nb.result <- c(nb.conf$overall[1], nb.conf$byClass[c(1:4,7)])
dc.result <- c(dc.conf$overall[1], dc.conf$byClass[c(1:4,7)])
dc.prune.result <- c(dc.prune.conf$overall[1], dc.prune.conf$byClass[c(1:4,7)])
rf.result <- c(rf.conf$overall[1], rf.conf$byClass[c(1:4,7)])
boost.result <- c(boost.conf$overall[1], boost.conf$byClass[c(1:4,7)])
svm.result <- c(svm.conf$overall[1], svm.conf$byClass[c(1:4,7)])
svm1.result <- c(svm.conf1$overall[1], svm.conf1$byClass[c(1:4,7)])
svm2.result <- c(svm.conf2$overall[1], svm.conf2$byClass[c(1:4,7)])
nn.result <- c(nn.conf$overall[1], nn.conf$byClass[c(1:4,7)])
result <- round(rbind(logis.result, lda.result, qda.result, knn.result, nb.result,
                      dc.result, dc.prune.result, rf.result, boost.result,
                      svm.result, svm1.result, svm2.result, nn.result), 3)
rownames(result) <- c("Logistic", "LDA","QDA", "KNN", "NB", "Tree" ,"Tree-Prune", "RF", "GBM", "SVM-Linear", "SVM-Polynomial", "SVM-Radial", "ANN")
library(knitr)
kable(result, caption="Performance of Classification Techniques")
| | Accuracy | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | F1 |
|---|---|---|---|---|---|---|
| Logistic | 0.953 | 0.891 | 0.983 | 0.961 | 0.950 | 0.925 |
| LDA | 0.953 | 0.891 | 0.983 | 0.961 | 0.950 | 0.925 |
| QDA | 0.953 | 0.891 | 0.983 | 0.961 | 0.950 | 0.925 |
| KNN | 0.988 | 0.964 | 1.000 | 1.000 | 0.983 | 0.981 |
| NB | 0.947 | 0.927 | 0.957 | 0.911 | 0.965 | 0.919 |
| Tree | 0.930 | 0.873 | 0.957 | 0.906 | 0.941 | 0.889 |
| Tree-Prune | 0.930 | 0.945 | 0.922 | 0.852 | 0.973 | 0.897 |
| RF | 0.953 | 0.945 | 0.957 | 0.912 | 0.974 | 0.929 |
| GBM | 0.947 | 0.927 | 0.957 | 0.911 | 0.965 | 0.919 |
| SVM-Linear | 0.947 | 0.909 | 0.966 | 0.926 | 0.957 | 0.917 |
| SVM-Polynomial | 0.924 | 0.800 | 0.983 | 0.957 | 0.912 | 0.871 |
| SVM-Radial | 0.947 | 0.909 | 0.966 | 0.926 | 0.957 | 0.917 |
| ANN | 0.918 | 0.909 | 0.922 | 0.847 | 0.955 | 0.877 |
The results show that KNN performs best in this particular train/test split. Bear in mind that a single random split is a noisy basis for ranking classifiers (repeated cross-validation would give steadier estimates), and that kNN's score is inflated whenever the response column slips into the distance computation, as noted above.