Load Packages

library(mlbench)
## Warning: package 'mlbench' was built under R version 4.1.3
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.1.3
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.3
library(DMwR2)
## Warning: package 'DMwR2' was built under R version 4.1.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

Load data

# Load the BreastCancer data frame that ships with the mlbench package
data("BreastCancer", package = "mlbench")

Descriptive Stats

# Dimensions of the dataset: number of rows, then number of columns
c(nrow(BreastCancer), ncol(BreastCancer))
## [1] 699  11

Peek at dataset

# Show the first 20 observations
head(BreastCancer, 20L)
##         Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1  1000025            5         1          1             1            2
## 2  1002945            5         4          4             5            7
## 3  1015425            3         1          1             1            2
## 4  1016277            6         8          8             1            3
## 5  1017023            4         1          1             3            2
## 6  1017122            8        10         10             8            7
## 7  1018099            1         1          1             1            2
## 8  1018561            2         1          2             1            2
## 9  1033078            2         1          1             1            2
## 10 1033078            4         2          1             1            2
## 11 1035283            1         1          1             1            1
## 12 1036172            2         1          1             1            2
## 13 1041801            5         3          3             3            2
## 14 1043999            1         1          1             1            2
## 15 1044572            8         7          5            10            7
## 16 1047630            7         4          6             4            6
## 17 1048672            4         1          1             1            2
## 18 1049815            4         1          1             1            2
## 19 1050670           10         7          7             6            4
## 20 1050718            6         1          1             1            2
##    Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
## 1            1           3               1       1    benign
## 2           10           3               2       1    benign
## 3            2           3               1       1    benign
## 4            4           3               7       1    benign
## 5            1           3               1       1    benign
## 6           10           9               7       1 malignant
## 7           10           3               1       1    benign
## 8            1           3               1       1    benign
## 9            1           1               1       5    benign
## 10           1           2               1       1    benign
## 11           1           3               1       1    benign
## 12           1           2               1       1    benign
## 13           3           4               4       1 malignant
## 14           3           3               1       1    benign
## 15           9           5               5       4 malignant
## 16           1           4               3       1 malignant
## 17           1           2               1       1    benign
## 18           1           3               1       1    benign
## 19          10           4               1       2 malignant
## 20           1           3               1       1    benign

Since the Id column is of type character and the other columns are factors, we will remove the Id column and convert the nine predictor columns to numeric values.

# Drop the redundant Id variable (first column)
dataset <- BreastCancer[, -1]

# Convert the nine input columns to numeric. Going through character first
# yields the value shown by each factor level rather than its internal
# integer code.
dataset[1:9] <- lapply(
  dataset[1:9],
  function(column) as.numeric(as.character(column))
)

Summary

# Per-column summary: quartiles and mean for the numeric inputs, level counts
# for Class, plus an NA count for any column that has missing values
summary(dataset)
##   Cl.thickness      Cell.size        Cell.shape     Marg.adhesion   
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.000  
##  Median : 4.000   Median : 1.000   Median : 1.000   Median : 1.000  
##  Mean   : 4.418   Mean   : 3.134   Mean   : 3.207   Mean   : 2.807  
##  3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                                                                     
##   Epith.c.size     Bare.nuclei      Bl.cromatin     Normal.nucleoli 
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000  
##  Median : 2.000   Median : 1.000   Median : 3.000   Median : 1.000  
##  Mean   : 3.216   Mean   : 3.545   Mean   : 3.438   Mean   : 2.867  
##  3rd Qu.: 4.000   3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                   NA's   :16                                        
##     Mitoses             Class    
##  Min.   : 1.000   benign   :458  
##  1st Qu.: 1.000   malignant:241  
##  Median : 1.000                  
##  Mean   : 1.589                  
##  3rd Qu.: 1.000                  
##  Max.   :10.000                  
## 

Unimodal Data Visualizations

# Histograms for each attribute, laid out on a 3 x 3 grid
par(mfrow = c(3, 3))
# Select the inputs by name (everything except the Class label) instead of
# relying on hard-coded column positions
for (col_name in setdiff(names(dataset), "Class")) {
  # hist() defaults xlab to the deparsed argument expression, which would
  # label every panel with the indexing code; give the axis a real label
  hist(dataset[[col_name]], main = col_name, xlab = "Value")
}

# Kernel density estimate for each attribute, laid out on a 3 x 3 grid
par(mfrow = c(3, 3))
# density() does not accept missing values by default, so restrict every
# column to the rows that have no NA in any variable
complete_cases <- complete.cases(dataset)
for (col_name in names(dataset)[1:9]) {
  col_density <- density(dataset[complete_cases, col_name])
  plot(col_density, main = col_name)
}

# Box-and-whisker plot for each attribute, laid out on a 3 x 3 grid
par(mfrow = c(3, 3))
for (col_name in names(dataset)[1:9]) {
  boxplot(dataset[[col_name]], main = col_name)
}

Evaluate Algorithms: Baseline

Let’s try a smattering of linear and nonlinear algorithms:

We will use 10-fold cross-validation with 3 repeats, which is a good standard test-harness configuration. This is a binary classification problem. The baseline accuracy, obtained by always predicting the majority class (benign: 458 of 699 cases), is about 65%.

# Resampling scheme: 10-fold cross-validation, repeated 3 times.
# NOTE(review): this object shares its name with caret's trainControl()
# function. Calls still resolve to the function, but the name is kept only
# because the train() calls below pass it as `trControl = trainControl`.
trainControl <- caret::trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)

# Performance measure passed to train() through its `metric` argument
metric <- "Accuracy"

Let’s create our models: we will estimate the accuracy of a suite of machine learning algorithms.

# Every model is fitted with the same seed, resampling scheme and metric;
# rows containing missing values are dropped through na.action = na.omit.

# LG (method = "glm")
set.seed(123)
fit.glm <- train(
  Class ~ .,
  data = dataset,
  method = "glm",
  metric = metric,
  trControl = trainControl,
  na.action = na.omit
)

# LDA (method = "lda")
set.seed(123)
fit.lda <- train(
  Class ~ .,
  data = dataset,
  method = "lda",
  metric = metric,
  trControl = trainControl,
  na.action = na.omit
)

# GLMNET (method = "glmnet")
set.seed(123)
fit.glmnet <- train(
  Class ~ .,
  data = dataset,
  method = "glmnet",
  metric = metric,
  trControl = trainControl,
  na.action = na.omit
)

# Gather the resampling results of the three fits under short labels
results <- resamples(list(LG = fit.glm, LDA = fit.lda, GLMNET = fit.glmnet))

# Distribution of each metric across the resamples, per model
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: LG, LDA, GLMNET 
## Number of resamples: 30 
## 
## Accuracy 
##             Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## LG     0.8970588 0.9560422 0.9705882 0.9653009 0.9852941    1    0
## LDA    0.9130435 0.9558824 0.9565217 0.9594896 0.9709079    1    0
## GLMNET 0.9253731 0.9560422 0.9705882 0.9682350 0.9852941    1    0
## 
## Kappa 
##             Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## LG     0.7767355 0.9036917 0.9356061 0.9234741 0.9674952    1    0
## LDA    0.7964602 0.9015871 0.9032258 0.9096392 0.9348442    1    0
## GLMNET 0.8291688 0.9026707 0.9356061 0.9299386 0.9676803    1    0
# Plot the resampling results gathered in `results` for a visual comparison
# of the models
dotplot(results)