# install.packages("caretEnsemble")
# install.packages("Amelia")
# install.packages("psych")
# install.packages("mice")
# install.packages("GGally")
# install.packages("gbm")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.3     ✔ dplyr   1.0.7
## ✔ tidyr   1.1.3     ✔ stringr 1.4.0
## ✔ readr   2.0.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(caret)  # confusionMatrix function
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(caretEnsemble)
## 
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
library(MASS)  # lda function
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(class) # kNN
library(psych)
## Warning: package 'psych' was built under R version 4.1.2
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.1.2
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
## Warning: package 'mice' was built under R version 4.1.2
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(rpart)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(gbm)
## Warning: package 'gbm' was built under R version 4.1.2
## Loaded gbm 2.1.8.1
library(readr)
wdbc <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data", 
    col_names = FALSE)
## Rows: 569 Columns: 32── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): X2
## dbl (31): X1, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16,...
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(wdbc) <- c('id_number', 'diagnosis', 'radius_mean', 
         'texture_mean', 'perimeter_mean', 'area_mean', 
         'smoothness_mean', 'compactness_mean', 
         'concavity_mean','concave_points_mean', 
         'symmetry_mean', 'fractal_dimension_mean',
         'radius_se', 'texture_se', 'perimeter_se', 
         'area_se', 'smoothness_se', 'compactness_se', 
         'concavity_se', 'concave_points_se', 
         'symmetry_se', 'fractal_dimension_se', 
         'radius_worst', 'texture_worst', 
         'perimeter_worst', 'area_worst', 
         'smoothness_worst', 'compactness_worst', 
         'concavity_worst', 'concave_points_worst', 
         'symmetry_worst', 'fractal_dimension_worst')
glimpse(wdbc)
## Rows: 569
## Columns: 32
## $ id_number               <dbl> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave_points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave_points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave_points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
# Removing the first column (id_number) because it is not a real predictor.
# Keep the response variable ( diagnosis) and the predictors containing 'mean'
# Checking the values (B/M) of the response variable and convert to 0/1

wdbc <- wdbc %>%
  dplyr::select(diagnosis, contains("mean"))%>%
  mutate(bc = ifelse(diagnosis=="M", 1, ifelse(diagnosis=="B",0, NA))) %>%
  dplyr::select(-diagnosis)
head(wdbc)
dim(wdbc)
## [1] 569  11
table(wdbc$bc)
## 
##   0   1 
## 357 212
# Checking missing values
library(mice)
missmap(wdbc)
## Warning: Unknown or uninitialised column: `arguments`.

## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

# Checking bivariate relationships among variables
ggpairs(wdbc)

library(dplyr)
# Standardization
scale2 <- function(x, na.rm = FALSE) (x - mean(x, na.rm = na.rm)) / sd(x, na.rm)
wdbc <- wdbc %>%
  mutate_at(vars(-bc), scale2)
# Provide a seed number for reproducibility
set.seed(34859)

nr <- nrow(wdbc)
tr.id <- sample(nr, floor(0.7*nr), replace=FALSE, prob=NULL)

tr.dat <- wdbc[tr.id,]
ts.dat <- wdbc[-tr.id,]

tr.x <- tr.dat %>% dplyr::select(-bc)

ts.x <- ts.dat %>% dplyr::select(-bc)
ts.y <- ts.dat %>% dplyr::select(bc)  

Logistic regression

logis <- glm(bc~., data = tr.dat)

summary(logis)
## 
## Call:
## glm(formula = bc ~ ., data = tr.dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.67175  -0.17671  -0.03821   0.17852   0.80928  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.38046    0.01386  27.451  < 2e-16 ***
## radius_mean             1.77547    0.51256   3.464 0.000592 ***
## texture_mean            0.11125    0.01558   7.141 4.61e-12 ***
## perimeter_mean         -1.41977    0.56730  -2.503 0.012738 *  
## area_mean              -0.31358    0.09610  -3.263 0.001200 ** 
## smoothness_mean         0.02102    0.02360   0.891 0.373633    
## compactness_mean        0.02493    0.06284   0.397 0.691770    
## concavity_mean          0.03086    0.04474   0.690 0.490746    
## concave_points_mean     0.25856    0.06300   4.104 4.96e-05 ***
## symmetry_mean           0.03700    0.01788   2.070 0.039153 *  
## fractal_dimension_mean -0.01293    0.03592  -0.360 0.719084    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07597603)
## 
##     Null deviance: 95.068  on 397  degrees of freedom
## Residual deviance: 29.403  on 387  degrees of freedom
## AIC: 116.54
## 
## Number of Fisher Scoring iterations: 2
# A simple predictive model with significant predictors

logis.sig <- glm(bc~ radius_mean + texture_mean + perimeter_mean + area_mean
                 + concave_points_mean + symmetry_mean, data = tr.dat)


# Predicted probability
logis.prob <- predict(logis.sig, ts.dat, type="response")

# Classification (y_hat)
logis.pred <- ifelse(logis.prob >= 0.5, 1, 0)



# confusionMatrix function takes only factor values for yhat and y.

(logis.conf <- confusionMatrix(factor(logis.pred), factor(ts.dat$bc), positive = "1", mode="everything"))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 114   6
##          1   2  49
##                                           
##                Accuracy : 0.9532          
##                  95% CI : (0.9099, 0.9796)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8907          
##                                           
##  Mcnemar's Test P-Value : 0.2888          
##                                           
##             Sensitivity : 0.8909          
##             Specificity : 0.9828          
##          Pos Pred Value : 0.9608          
##          Neg Pred Value : 0.9500          
##               Precision : 0.9608          
##                  Recall : 0.8909          
##                      F1 : 0.9245          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2865          
##    Detection Prevalence : 0.2982          
##       Balanced Accuracy : 0.9368          
##                                           
##        'Positive' Class : 1               
## 

Linear discriminant analysis

lda.fit <- lda(bc ~ ., data = tr.dat)
lda.pred <- predict(lda.fit, ts.dat)$class

lda.conf <- confusionMatrix(factor(lda.pred), factor(ts.dat$bc), positive ="1", mode ="everything")
lda.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 114   6
##          1   2  49
##                                           
##                Accuracy : 0.9532          
##                  95% CI : (0.9099, 0.9796)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8907          
##                                           
##  Mcnemar's Test P-Value : 0.2888          
##                                           
##             Sensitivity : 0.8909          
##             Specificity : 0.9828          
##          Pos Pred Value : 0.9608          
##          Neg Pred Value : 0.9500          
##               Precision : 0.9608          
##                  Recall : 0.8909          
##                      F1 : 0.9245          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2865          
##    Detection Prevalence : 0.2982          
##       Balanced Accuracy : 0.9368          
##                                           
##        'Positive' Class : 1               
## 

##Quadratic discriminant analysis

qda.fit <- qda(bc ~ ., data = tr.dat)
qda.pred <- predict(qda.fit, ts.dat)$class

qda.conf <- confusionMatrix(factor(qda.pred), factor(ts.dat$bc), positive ="1", mode ="everything")
qda.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 114   6
##          1   2  49
##                                           
##                Accuracy : 0.9532          
##                  95% CI : (0.9099, 0.9796)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8907          
##                                           
##  Mcnemar's Test P-Value : 0.2888          
##                                           
##             Sensitivity : 0.8909          
##             Specificity : 0.9828          
##          Pos Pred Value : 0.9608          
##          Neg Pred Value : 0.9500          
##               Precision : 0.9608          
##                  Recall : 0.8909          
##                      F1 : 0.9245          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2865          
##    Detection Prevalence : 0.2982          
##       Balanced Accuracy : 0.9368          
##                                           
##        'Positive' Class : 1               
## 

##kNN

# The knn function is available in the class package

knn.pred <- knn(tr.dat, ts.dat, cl = tr.dat$bc, k=7)

knn.conf <- confusionMatrix(factor(knn.pred), factor(ts.dat$bc), positive ="1", mode ="everything")
knn.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 116   2
##          1   0  53
##                                           
##                Accuracy : 0.9883          
##                  95% CI : (0.9584, 0.9986)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9729          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.9636          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9831          
##               Precision : 1.0000          
##                  Recall : 0.9636          
##                      F1 : 0.9815          
##              Prevalence : 0.3216          
##          Detection Rate : 0.3099          
##    Detection Prevalence : 0.3099          
##       Balanced Accuracy : 0.9818          
##                                           
##        'Positive' Class : 1               
## 
# How can we incorporate different distance metrics in the knn function?
# We can use the knn function the caret package

Naive Bayes

 # Naive Bayes classifier can be implemented by e1071, klaR, naivebayes, bnclssify, caret, and h2o packages

library(caret)
library(klaR)
## Warning: package 'klaR' was built under R version 4.1.2
library(e1071)

 nb.fit = train(x = tr.x, y = factor(tr.dat$bc), method = "nb",
                trControl=trainControl(method='cv', number=10))
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 19
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 38
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 41
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 19
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 38
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 41
## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 2
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 19
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 21
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 19
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 21
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 9
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 25
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 30
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 9
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 25
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 35
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 2
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 6
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 24
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 38
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 6
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 24
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 38
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 3
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 11
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 17
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 11
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 17
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 30
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 30
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 2
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 6
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 15
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 40
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 2
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 6
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 15
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 24
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 40
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 6
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 11
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 32
## Warning: Setting row names on a tibble is deprecated.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 11
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 32
## Warning: Setting row names on a tibble is deprecated.
 nb.fit
## Naive Bayes 
## 
## 398 samples
##  10 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 357, 358, 358, 359, 358, 358, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.9096060  0.8093107
##    TRUE      0.9147952  0.8207147
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
##  = 1.
 nb.pred <- predict(nb.fit, newdata = ts.x)
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 1
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 25
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 36
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 112
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 146
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 171
 table(nb.pred)
## nb.pred
##   0   1 
## 115  56
 nb.conf <- confusionMatrix(factor(nb.pred), factor(ts.dat$bc), positive = "1", mode ="everything")
 nb.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 111   4
##          1   5  51
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.88            
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9273          
##             Specificity : 0.9569          
##          Pos Pred Value : 0.9107          
##          Neg Pred Value : 0.9652          
##               Precision : 0.9107          
##                  Recall : 0.9273          
##                      F1 : 0.9189          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2982          
##    Detection Prevalence : 0.3275          
##       Balanced Accuracy : 0.9421          
##                                           
##        'Positive' Class : 1               
## 

##Decision Tree

# Decision tree can be implemented using tree, rpart, ... packages
library(rpart)
library(rpart.plot)

dc.fit <- rpart(factor(bc) ~., data=tr.dat, method="class",control=rpart.control(minsplit=30, cp=0.001))

rpart.plot(dc.fit, main ="Prediction of Breast Cancer")

dc.prune.fit <- prune(dc.fit, dc.fit$cptable[which.min(dc.fit$cptable[,"xerror"]),"CP"])
summary(dc.prune.fit)
## Call:
## rpart(formula = factor(bc) ~ ., data = tr.dat, method = "class", 
##     control = rpart.control(minsplit = 30, cp = 0.001))
##   n= 398 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.77070064      0 1.0000000 1.0000000 0.06210365
## 2 0.03184713      1 0.2292994 0.2547771 0.03820589
## 
## Variable importance
## concave_points_mean      concavity_mean      perimeter_mean         radius_mean 
##                  22                  18                  16                  15 
##           area_mean    compactness_mean 
##                  15                  14 
## 
## Node number 1: 398 observations,    complexity param=0.7707006
##   predicted class=0  expected loss=0.3944724  P(node) =1
##     class counts:   241   157
##    probabilities: 0.606 0.394 
##   left son=2 (239 obs) right son=3 (159 obs)
##   Primary splits:
##       concave_points_mean < 0.0726971   to the left,  improve=125.0950, (0 missing)
##       area_mean           < 0.1175312   to the left,  improve=115.7720, (0 missing)
##       perimeter_mean      < 0.3259794   to the left,  improve=114.7733, (0 missing)
##       radius_mean         < 0.3256789   to the left,  improve=113.2904, (0 missing)
##       concavity_mean      < 0.00621783  to the left,  improve=108.0154, (0 missing)
##   Surrogate splits:
##       concavity_mean   < -0.08020987 to the left,  agree=0.917, adj=0.792, (0 split)
##       perimeter_mean   < 0.2374983   to the left,  agree=0.882, adj=0.704, (0 split)
##       radius_mean      < 0.3299353   to the left,  agree=0.872, adj=0.679, (0 split)
##       area_mean        < 0.2044842   to the left,  agree=0.867, adj=0.667, (0 split)
##       compactness_mean < -0.04621959 to the left,  agree=0.849, adj=0.623, (0 split)
## 
## Node number 2: 239 observations
##   predicted class=0  expected loss=0.07112971  P(node) =0.6005025
##     class counts:   222    17
##    probabilities: 0.929 0.071 
## 
## Node number 3: 159 observations
##   predicted class=1  expected loss=0.1194969  P(node) =0.3994975
##     class counts:    19   140
##    probabilities: 0.119 0.881
rpart.plot(dc.prune.fit, main ="Prediction of Breast Cancer with Pruning")

dc.pred <- predict(dc.fit, newdata = ts.x, type = "class")
dc.conf <- confusionMatrix(factor(dc.pred), factor(ts.dat$bc), positive ="1", mode="everything")
dc.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 111   7
##          1   5  48
##                                           
##                Accuracy : 0.9298          
##                  95% CI : (0.8806, 0.9632)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : 2.034e-15       
##                                           
##                   Kappa : 0.8376          
##                                           
##  Mcnemar's Test P-Value : 0.7728          
##                                           
##             Sensitivity : 0.8727          
##             Specificity : 0.9569          
##          Pos Pred Value : 0.9057          
##          Neg Pred Value : 0.9407          
##               Precision : 0.9057          
##                  Recall : 0.8727          
##                      F1 : 0.8889          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2807          
##    Detection Prevalence : 0.3099          
##       Balanced Accuracy : 0.9148          
##                                           
##        'Positive' Class : 1               
## 
dc.prune.pred <- predict(dc.prune.fit, newdata = ts.x, type = "class")
dc.prune.conf <- confusionMatrix(factor(dc.prune.pred), factor(ts.dat$bc), positive ="1", mode="everything")
dc.prune.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 107   3
##          1   9  52
##                                           
##                Accuracy : 0.9298          
##                  95% CI : (0.8806, 0.9632)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : 2.034e-15       
##                                           
##                   Kappa : 0.8437          
##                                           
##  Mcnemar's Test P-Value : 0.1489          
##                                           
##             Sensitivity : 0.9455          
##             Specificity : 0.9224          
##          Pos Pred Value : 0.8525          
##          Neg Pred Value : 0.9727          
##               Precision : 0.8525          
##                  Recall : 0.9455          
##                      F1 : 0.8966          
##              Prevalence : 0.3216          
##          Detection Rate : 0.3041          
##    Detection Prevalence : 0.3567          
##       Balanced Accuracy : 0.9339          
##                                           
##        'Positive' Class : 1               
## 

##Random Forest In random forest the number of trees and the number of predictors used are important arguments. We chose ntree=500 and mtry = p‾√.

library(randomForest)

rf.fit= randomForest(factor(bc)~., data = tr.dat, ntree = 500, mtry = ceiling(sqrt(ncol(tr.dat))))
rf.fit
## 
## Call:
##  randomForest(formula = factor(bc) ~ ., data = tr.dat, ntree = 500,      mtry = ceiling(sqrt(ncol(tr.dat)))) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 6.28%
## Confusion matrix:
##     0   1 class.error
## 0 230  11  0.04564315
## 1  14 143  0.08917197
rf.pred = predict(rf.fit, ts.x, type ="response")
rf.conf <- confusionMatrix(factor(rf.pred), factor(ts.dat$bc), positive ="1", mode="everything")
rf.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 111   3
##          1   5  52
##                                           
##                Accuracy : 0.9532          
##                  95% CI : (0.9099, 0.9796)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8938          
##                                           
##  Mcnemar's Test P-Value : 0.7237          
##                                           
##             Sensitivity : 0.9455          
##             Specificity : 0.9569          
##          Pos Pred Value : 0.9123          
##          Neg Pred Value : 0.9737          
##               Precision : 0.9123          
##                  Recall : 0.9455          
##                      F1 : 0.9286          
##              Prevalence : 0.3216          
##          Detection Rate : 0.3041          
##    Detection Prevalence : 0.3333          
##       Balanced Accuracy : 0.9512          
##                                           
##        'Positive' Class : 1               
## 

##Boosting

library(gbm)
set.seed(123)
boost.fit= gbm(bc ~ ., data = tr.dat, distribution ="bernoulli", n.tree = 500, cv.folds=5,
               shrinkage = 0.1, interaction.depth = 2, n.cores  = NULL)

par(mar = c(5,8,1,1))
summary(boost.fit, cBars = 10, method = relative.influence, las = 2)
# To find the best hyperparameters, you can use a grid search

boost.pred.prob = predict(boost.fit, ts.x, n.trees  = boost.fit$n.trees, type="response")
boost.pred <- factor(ifelse(boost.pred.prob>0.5, 1,0))
boost.conf <- confusionMatrix(factor(boost.pred), factor(ts.dat$bc), positive ="1", mode ="everything")
boost.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 111   4
##          1   5  51
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.88            
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9273          
##             Specificity : 0.9569          
##          Pos Pred Value : 0.9107          
##          Neg Pred Value : 0.9652          
##               Precision : 0.9107          
##                  Recall : 0.9273          
##                      F1 : 0.9189          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2982          
##    Detection Prevalence : 0.3275          
##       Balanced Accuracy : 0.9421          
##                                           
##        'Positive' Class : 1               
## 

SVM

The important hyperparameter is to select a proper kernel option.

# Naive Bayes classifier can be implemented by e1071, klaR, naivebayes, bnclssify, caret, and h2o packages

library(e1071)

# Linear kernel

svm.fit = svm(factor(bc) ~ ., data = tr.dat, kernel = "linear", cost = 10, scale = FALSE, type ="C-classification")
summary(svm.fit)
## 
## Call:
## svm(formula = factor(bc) ~ ., data = tr.dat, kernel = "linear", cost = 10, 
##     type = "C-classification", scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
## 
## Number of Support Vectors:  59
## 
##  ( 29 30 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
svm.pred <- predict(svm.fit, newdata = ts.x)
svm.conf <- confusionMatrix(factor(svm.pred), factor(ts.dat$bc), positive ="1", mode = "everything")
svm.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 112   5
##          1   4  50
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8788          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9091          
##             Specificity : 0.9655          
##          Pos Pred Value : 0.9259          
##          Neg Pred Value : 0.9573          
##               Precision : 0.9259          
##                  Recall : 0.9091          
##                      F1 : 0.9174          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2924          
##    Detection Prevalence : 0.3158          
##       Balanced Accuracy : 0.9373          
##                                           
##        'Positive' Class : 1               
## 
# Polynomial kernel

svm.fit1 = svm(factor(bc) ~ ., data = tr.dat, kernel = "polynomial", cost = 10, scale = FALSE, type ="C-classification")
summary(svm.fit1)
## 
## Call:
## svm(formula = factor(bc) ~ ., data = tr.dat, kernel = "polynomial", 
##     cost = 10, type = "C-classification", scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  10 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  88
## 
##  ( 42 46 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
svm.pred1 <- predict(svm.fit1, newdata = ts.x)
svm.conf1 <- confusionMatrix(factor(svm.pred1), factor(ts.dat$bc), positive ="1", mode = "everything")
svm.conf1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 114  11
##          1   2  44
##                                           
##                Accuracy : 0.924           
##                  95% CI : (0.8735, 0.9589)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : 1.2e-14         
##                                           
##                   Kappa : 0.818           
##                                           
##  Mcnemar's Test P-Value : 0.0265          
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.9828          
##          Pos Pred Value : 0.9565          
##          Neg Pred Value : 0.9120          
##               Precision : 0.9565          
##                  Recall : 0.8000          
##                      F1 : 0.8713          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2573          
##    Detection Prevalence : 0.2690          
##       Balanced Accuracy : 0.8914          
##                                           
##        'Positive' Class : 1               
## 
# Nonlinear kernel
# To identify the types of kernel, type  ??svm in the Console
svm.fit2 = svm(factor(bc) ~ ., data = tr.dat, kernel = "radial", cost = 10, scale = FALSE)
summary(svm.fit1)
## 
## Call:
## svm(formula = factor(bc) ~ ., data = tr.dat, kernel = "polynomial", 
##     cost = 10, type = "C-classification", scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  10 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  88
## 
##  ( 42 46 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
svm.pred2 <- predict(svm.fit2, newdata = ts.x)
svm.conf2 <- confusionMatrix(factor(svm.pred2), factor(ts.dat$bc), positive ="1", mode = "everything")
svm.conf2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 112   5
##          1   4  50
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8788          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9091          
##             Specificity : 0.9655          
##          Pos Pred Value : 0.9259          
##          Neg Pred Value : 0.9573          
##               Precision : 0.9259          
##                  Recall : 0.9091          
##                      F1 : 0.9174          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2924          
##    Detection Prevalence : 0.3158          
##       Balanced Accuracy : 0.9373          
##                                           
##        'Positive' Class : 1               
## 

##Artificial Neural Network

#install.packages("neuralnet")
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
set.seed(123)

## hidden =c(5,3): layer 1 has 5 nodes and layer 2 has 3 nodes
n <- names(tr.dat)
nn.fit <- neuralnet(factor(bc) ~., data = tr.dat, hidden=c(5,3), linear.output = FALSE, err.fc = "ce", likelihood = TRUE)

# For regression, linear.output = TRUE

plot(nn.fit, rep ="best")

nn.prob <- compute(nn.fit, ts.x)

nn.pred <- ifelse(nn.prob$net.result[,2] >= 0.5, 1,0)
nn.conf <- confusionMatrix(factor(nn.pred), factor(ts.dat$bc), positive ="1", mode="everything")
nn.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 107   5
##          1   9  50
##                                           
##                Accuracy : 0.9181          
##                  95% CI : (0.8664, 0.9545)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : 6.53e-14        
##                                           
##                   Kappa : 0.8159          
##                                           
##  Mcnemar's Test P-Value : 0.4227          
##                                           
##             Sensitivity : 0.9091          
##             Specificity : 0.9224          
##          Pos Pred Value : 0.8475          
##          Neg Pred Value : 0.9554          
##               Precision : 0.8475          
##                  Recall : 0.9091          
##                      F1 : 0.8772          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2924          
##    Detection Prevalence : 0.3450          
##       Balanced Accuracy : 0.9158          
##                                           
##        'Positive' Class : 1               
## 

##Summary of classification algorithims

logis.result <- c(logis.conf$overall[1], logis.conf$byClass[c(1:4,7)])
lda.result <- c(lda.conf$overall[1], lda.conf$byClass[c(1:4,7)])
qda.result <- c(qda.conf$overall[1], qda.conf$byClass[c(1:4,7)])
knn.result <- c(knn.conf$overall[1], knn.conf$byClass[c(1:4,7)])
nb.result <- c(nb.conf$overall[1], nb.conf$byClass[c(1:4,7)])
dc.result <- c(dc.conf$overall[1], dc.conf$byClass[c(1:4,7)])
dc.prune.result <- c(dc.prune.conf$overall[1], dc.prune.conf$byClass[c(1:4,7)])
rf.result <- c(rf.conf$overall[1], rf.conf$byClass[c(1:4,7)])
boost.result <- c(boost.conf$overall[1], boost.conf$byClass[c(1:4,7)])
svm.result <- c(svm.conf$overall[1], svm.conf$byClass[c(1:4,7)])
svm1.result <- c(svm.conf1$overall[1], svm.conf1$byClass[c(1:4,7)])
svm2.result <- c(svm.conf2$overall[1], svm.conf2$byClass[c(1:4,7)])
nn.result <- c(nn.conf$overall[1], nn.conf$byClass[c(1:4,7)])

result <- round(rbind(logis.result, lda.result, qda.result, knn.result, nb.result, dc.result, dc.prune.result, rf.result, boost.result, svm.result, svm1.result, svm2.result, nn.result),3)
rownames(result) <- c("Logistic", "LDA","QDA", "KNN", "NB", "Tree" ,"Tree-Prune", "RF", "GBM", "SVM-Linear", "SVM-Polynomial", "SVM-Radial", "ANN")

library(knitr)
kable(result, caption="Performance of Classification Techniques")
Performance of Classification Techniques
Accuracy Sensitivity Specificity Pos Pred Value Neg Pred Value F1
Logistic 0.953 0.891 0.983 0.961 0.950 0.925
LDA 0.953 0.891 0.983 0.961 0.950 0.925
QDA 0.953 0.891 0.983 0.961 0.950 0.925
KNN 0.988 0.964 1.000 1.000 0.983 0.981
NB 0.947 0.927 0.957 0.911 0.965 0.919
Tree 0.930 0.873 0.957 0.906 0.941 0.889
Tree-Prune 0.930 0.945 0.922 0.852 0.973 0.897
RF 0.953 0.945 0.957 0.912 0.974 0.929
GBM 0.947 0.927 0.957 0.911 0.965 0.919
SVM-Linear 0.947 0.909 0.966 0.926 0.957 0.917
SVM-Polynomial 0.924 0.800 0.983 0.957 0.912 0.871
SVM-Radial 0.947 0.909 0.966 0.926 0.957 0.917
ANN 0.918 0.909 0.922 0.847 0.955 0.877

The results show that KNN performs the best in the current setting.