Load Packages
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.1.3
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.1.3
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.3
library(DMwR2)
## Warning: package 'DMwR2' was built under R version 4.1.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
Load data
# Load the BreastCancer data frame into the workspace
data(BreastCancer)
Descriptive Stats
# Dimensions of dataset (rows, columns)
dim(BreastCancer)
## [1] 699 11
Peek at dataset
# Display the first 20 rows to inspect the column names and raw values
head(BreastCancer, n=20)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## 7 1018099 1 1 1 1 2
## 8 1018561 2 1 2 1 2
## 9 1033078 2 1 1 1 2
## 10 1033078 4 2 1 1 2
## 11 1035283 1 1 1 1 1
## 12 1036172 2 1 1 1 2
## 13 1041801 5 3 3 3 2
## 14 1043999 1 1 1 1 2
## 15 1044572 8 7 5 10 7
## 16 1047630 7 4 6 4 6
## 17 1048672 4 1 1 1 2
## 18 1049815 4 1 1 1 2
## 19 1050670 10 7 7 6 4
## 20 1050718 6 1 1 1 2
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 benign
## 2 10 3 2 1 benign
## 3 2 3 1 1 benign
## 4 4 3 7 1 benign
## 5 1 3 1 1 benign
## 6 10 9 7 1 malignant
## 7 10 3 1 1 benign
## 8 1 3 1 1 benign
## 9 1 1 1 5 benign
## 10 1 2 1 1 benign
## 11 1 3 1 1 benign
## 12 1 2 1 1 benign
## 13 3 4 4 1 malignant
## 14 3 3 1 1 benign
## 15 9 5 5 4 malignant
## 16 1 4 3 1 malignant
## 17 1 2 1 1 benign
## 18 1 3 1 1 benign
## 19 10 4 1 2 malignant
## 20 1 3 1 1 benign
Since the Id column holds character identifiers and the other columns are
factors, we will remove the Id column and convert the nine predictor
columns to numeric values (the Class column stays a factor)
# Remove the redundant variable Id, selecting it by name rather than by
# position so the step does not depend on column order.
dataset <- BreastCancer[, names(BreastCancer) != "Id"]
# Convert every input column (everything except the Class outcome) to numeric.
# Going through as.character() recovers the printed level labels; calling
# as.numeric() directly on a factor would return its internal integer codes.
predictor_cols <- setdiff(names(dataset), "Class")
dataset[predictor_cols] <- lapply(
  dataset[predictor_cols],
  function(column) as.numeric(as.character(column))
)
Summary
# Per-column summary statistics; also reports how many NA values a column has
summary(dataset)
## Cl.thickness Cell.size Cell.shape Marg.adhesion
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.000
## Mean : 4.418 Mean : 3.134 Mean : 3.207 Mean : 2.807
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
##
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.000
## Mean : 3.216 Mean : 3.545 Mean : 3.438 Mean : 2.867
## 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## NA's :16
## Mitoses Class
## Min. : 1.000 benign :458
## 1st Qu.: 1.000 malignant:241
## Median : 1.000
## Mean : 1.589
## 3rd Qu.: 1.000
## Max. :10.000
##
Univariate Data Visualizations
# Histograms for each numeric attribute, laid out on a 3x3 grid
par(mfrow = c(3, 3))
# Pick the numeric columns by type instead of hard-coding their positions.
numeric_cols <- names(dataset)[vapply(dataset, is.numeric, logical(1))]
for (col_name in numeric_cols) {
  # Set xlab explicitly: the default label would be the deparsed subsetting
  # expression rather than something meaningful to the reader.
  hist(dataset[[col_name]], main = col_name, xlab = "Value")
}

# Density plot for each of the nine input attributes, on a 3x3 grid.
par(mfrow = c(3, 3))
# Flag rows without any missing value; only those rows are passed to
# density(), for every attribute.
complete_cases <- complete.cases(dataset)
invisible(lapply(names(dataset)[1:9], function(col_name) {
  plot(density(dataset[complete_cases, col_name]), main = col_name)
}))

# Box-and-whisker plot for each of the nine input attributes, on a 3x3 grid.
par(mfrow = c(3, 3))
for (col_name in names(dataset)[1:9]) {
  boxplot(dataset[[col_name]], main = col_name)
}

Evaluate Algorithms: Baseline
Let’s try a smattering of linear and nonlinear algorithms:
We will use 10-fold cross validation with 3 repeats. This is a good
standard test harness configuration. It is a binary classification
problem. Always predicting the majority class (benign, 458 of 699
cases) gives a baseline accuracy of about 65%.
# Metric used to compare the models
metric <- "Accuracy"
# Resampling scheme: 10-fold cross-validation with 3 repeats.
# NOTE: the variable shares its name with caret's trainControl() function;
# the caret:: prefix makes explicit that the package function is being called.
trainControl <- caret::trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
Let’s create our models: we will estimate the accuracy of a suite of
machine learning algorithms.
# Fit one caret model of the given `method` on `dataset`. The seed is reset
# before every fit so that each model starts from the same random-number
# state. Rows containing missing values are dropped via na.omit.
fit_baseline <- function(method) {
  set.seed(123)
  train(
    Class ~ .,
    data = dataset,
    method = method,
    metric = metric,
    trControl = trainControl,
    na.action = na.omit
  )
}

# LG
fit.glm <- fit_baseline("glm")
# LDA
fit.lda <- fit_baseline("lda")
# GLMNET
fit.glmnet <- fit_baseline("glmnet")

# Collect the resampling results of all three models and summarise them
results <- resamples(list(LG = fit.glm, LDA = fit.lda, GLMNET = fit.glmnet))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: LG, LDA, GLMNET
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LG 0.8970588 0.9560422 0.9705882 0.9653009 0.9852941 1 0
## LDA 0.9130435 0.9558824 0.9565217 0.9594896 0.9709079 1 0
## GLMNET 0.9253731 0.9560422 0.9705882 0.9682350 0.9852941 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LG 0.7767355 0.9036917 0.9356061 0.9234741 0.9674952 1 0
## LDA 0.7964602 0.9015871 0.9032258 0.9096392 0.9348442 1 0
## GLMNET 0.8291688 0.9026707 0.9356061 0.9299386 0.9676803 1 0
# Plot the collected resampling results of the three models for comparison
dotplot(results)
