Load Packages

library(mlbench)

## Warning: package 'mlbench' was built under R version 4.1.3

library(caret)

## Warning: package 'caret' was built under R version 4.1.3

## Loading required package: ggplot2

## Loading required package: lattice

library(e1071)

## Warning: package 'e1071' was built under R version 4.1.3

library(glmnet)

## Warning: package 'glmnet' was built under R version 4.1.3

## Loading required package: Matrix

## Loaded glmnet 4.1-7

library(MASS)

## Warning: package 'MASS' was built under R version 4.1.3

library(DMwR2)

## Warning: package 'DMwR2' was built under R version 4.1.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

Load data

# Load the BreastCancer data set into the workspace
data(BreastCancer)

Descriptive Stats

# Dimensions of the dataset (number of rows, number of columns)
dim(BreastCancer)

## [1] 699  11

Peek at dataset

# Show the first 20 rows to get a feel for the raw values
head(BreastCancer, n=20)

##         Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1  1000025            5         1          1             1            2
## 2  1002945            5         4          4             5            7
## 3  1015425            3         1          1             1            2
## 4  1016277            6         8          8             1            3
## 5  1017023            4         1          1             3            2
## 6  1017122            8        10         10             8            7
## 7  1018099            1         1          1             1            2
## 8  1018561            2         1          2             1            2
## 9  1033078            2         1          1             1            2
## 10 1033078            4         2          1             1            2
## 11 1035283            1         1          1             1            1
## 12 1036172            2         1          1             1            2
## 13 1041801            5         3          3             3            2
## 14 1043999            1         1          1             1            2
## 15 1044572            8         7          5            10            7
## 16 1047630            7         4          6             4            6
## 17 1048672            4         1          1             1            2
## 18 1049815            4         1          1             1            2
## 19 1050670           10         7          7             6            4
## 20 1050718            6         1          1             1            2
##    Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
## 1            1           3               1       1    benign
## 2           10           3               2       1    benign
## 3            2           3               1       1    benign
## 4            4           3               7       1    benign
## 5            1           3               1       1    benign
## 6           10           9               7       1 malignant
## 7           10           3               1       1    benign
## 8            1           3               1       1    benign
## 9            1           1               1       5    benign
## 10           1           2               1       1    benign
## 11           1           3               1       1    benign
## 12           1           2               1       1    benign
## 13           3           4               4       1 malignant
## 14           3           3               1       1    benign
## 15           9           5               5       4 malignant
## 16           1           4               3       1 malignant
## 17           1           2               1       1    benign
## 18           1           3               1       1    benign
## 19          10           4               1       2 malignant
## 20           1           3               1       1    benign

Since the Id column is a character identifier and the other columns are factors, we will remove the Id column and convert the nine predictor columns to numeric values (Class stays a factor).

# Drop the first column (Id): it identifies the record and is not a predictor.
dataset <- BreastCancer[, -1]

# Convert every predictor (all columns except the Class label) from factor to
# numeric. Going through as.character() recovers the printed level values;
# as.numeric() applied directly to a factor would return the internal integer
# codes instead. Selecting the columns by name rather than by the fixed range
# 1:9 keeps the conversion correct if the column layout ever changes.
predictor_cols <- setdiff(names(dataset), "Class")
dataset[predictor_cols] <- lapply(
  dataset[predictor_cols],
  function(column) as.numeric(as.character(column))
)

Summary

# Summarise each column: distribution statistics for the numeric attributes,
# level counts for Class, and the number of missing values where present
summary(dataset)

##   Cl.thickness      Cell.size        Cell.shape     Marg.adhesion   
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.000  
##  Median : 4.000   Median : 1.000   Median : 1.000   Median : 1.000  
##  Mean   : 4.418   Mean   : 3.134   Mean   : 3.207   Mean   : 2.807  
##  3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                                                                     
##   Epith.c.size     Bare.nuclei      Bl.cromatin     Normal.nucleoli 
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000  
##  Median : 2.000   Median : 1.000   Median : 3.000   Median : 1.000  
##  Mean   : 3.216   Mean   : 3.545   Mean   : 3.438   Mean   : 2.867  
##  3rd Qu.: 4.000   3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                   NA's   :16                                        
##     Mitoses             Class    
##  Min.   : 1.000   benign   :458  
##  1st Qu.: 1.000   malignant:241  
##  Median : 1.000                  
##  Mean   : 1.589                  
##  3rd Qu.: 1.000                  
##  Max.   :10.000                  
##

Univariate Data Visualizations

# Histograms for each attribute
# Lay the nine plots out on a 3 x 3 grid, remembering the previous graphics
# settings so they can be restored once the plots are drawn.
old_par <- par(mfrow = c(3, 3))
for (i in seq_len(9)) {
  attribute <- names(dataset)[i]
  # Label the x-axis with the attribute name; the default label would be the
  # deparsed expression "dataset[, i]", which is identical on every panel.
  hist(dataset[, i], main = attribute, xlab = attribute)
}
# Restore the layout so later base-graphics plots are not forced onto the grid.
par(old_par)

# Kernel density estimate for each attribute, drawn on a 3 x 3 grid.
par(mfrow = c(3, 3))
# density() rejects missing values by default, so every attribute is
# restricted to the rows that have no NA in any column.
complete_cases <- complete.cases(dataset)
complete_rows <- dataset[complete_cases, ]
for (col_idx in seq_len(9)) {
  col_density <- density(complete_rows[, col_idx])
  plot(col_density, main = names(dataset)[col_idx])
}

# Box-and-whisker plot for each attribute, drawn on a 3 x 3 grid.
par(mfrow = c(3, 3))
for (col_idx in seq_len(9)) {
  boxplot(dataset[, col_idx], main = names(dataset)[col_idx])
}

Evaluate Algorithms: Baseline

Let’s try a smattering of linear and nonlinear algorithms:

We will use 10-fold cross validation with 3 repeats. This is a good standard test harness configuration. It is a binary classification problem. Our baseline accuracy, obtained by always predicting the majority class (benign, 458 of 699 observations), was found to be about 65%.

# Resampling scheme shared by all models: 10-fold cross-validation,
# repeated 3 times.
# NOTE: the variable deliberately keeps the name `trainControl` because later
# code refers to it. It shares that name with the function being called; R
# still resolves the call because it looks for a function when a name is
# used in call position.
trainControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
# Performance measure used to compare the models.
metric <- "Accuracy"

Let’s create our models: we will estimate the accuracy of a suite of machine learning algorithms.

# Each model is fitted with the same formula, data, metric and resampling
# scheme; rows with missing values are dropped via na.omit. The seed is reset
# before every fit, which is intended to give all models the same resampling
# splits so their results can be compared.

# Logistic regression (LG)
set.seed(123)
fit.glm <- train(
  Class ~ .,
  data = dataset,
  method = "glm",
  metric = metric,
  trControl = trainControl,
  na.action = na.omit
)

# Linear discriminant analysis (LDA)
set.seed(123)
fit.lda <- train(
  Class ~ .,
  data = dataset,
  method = "lda",
  metric = metric,
  trControl = trainControl,
  na.action = na.omit
)

# Regularised regression (GLMNET)
set.seed(123)
fit.glmnet <- train(
  Class ~ .,
  data = dataset,
  method = "glmnet",
  metric = metric,
  trControl = trainControl,
  na.action = na.omit
)

# Gather the per-resample results of the three fits and summarise them.
results <- resamples(list(LG = fit.glm, LDA = fit.lda, GLMNET = fit.glmnet))

summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: LG, LDA, GLMNET 
## Number of resamples: 30 
## 
## Accuracy 
##             Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## LG     0.8970588 0.9560422 0.9705882 0.9653009 0.9852941    1    0
## LDA    0.9130435 0.9558824 0.9565217 0.9594896 0.9709079    1    0
## GLMNET 0.9253731 0.9560422 0.9705882 0.9682350 0.9852941    1    0
## 
## Kappa 
##             Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## LG     0.7767355 0.9036917 0.9356061 0.9234741 0.9674952    1    0
## LDA    0.7964602 0.9015871 0.9032258 0.9096392 0.9348442    1    0
## GLMNET 0.8291688 0.9026707 0.9356061 0.9299386 0.9676803    1    0

# Plot the resampling results to compare the models visually
dotplot(results)

DE&M I - Test Harness - Lec 9 - Part 2

Charles Pierre

2025-03-26

Load Packages

Load data

Descriptive Stats

Peek at dataset

Since the Id column is a character identifier and the other columns are factors, we will remove the Id column and convert the nine predictor columns to numeric values (Class stays a factor).

Summary

Univariate Data Visualizations

Evaluate Algorithms: Baseline

Let’s try a smattering of linear and nonlinear algorithms:

We will use 10-fold cross validation with 3 repeats. This is a good standard test harness configuration. It is a binary classification problem. Our baseline accuracy, obtained by always predicting the majority class (benign, 458 of 699 observations), was found to be about 65%.

Let’s create our models: we will estimate the accuracy of a suite of machine learning algorithms.