Package importing

pacman::p_load(dplyr, broom, caTools, ggplot2, gridExtra, caret)
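
If pacman itself is not installed yet, a one-time bootstrap such as the following avoids an error on a fresh machine (my addition, not part of the original setup):

# Install pacman once if it is missing; p_load then installs/loads the rest
if (!requireNamespace('pacman', quietly = TRUE)) install.packages('pacman')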

Dataset importing

dataset = read.csv('breast_cancer.csv')
dataset = dataset[, -1]                            # drop the first column (sample ID)
dataset = data.frame(lapply(dataset, as.factor))   # treat every 1-10 score as a factor
dataset$Class = factor(dataset$Class,              # recode the class labels: 2 -> 0, 4 -> 1
                       levels = c(2, 4), labels = c(0, 1))
glimpse(dataset)
## Rows: 683
## Columns: 10
## $ Clump.Thickness             <fct> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, ...
## $ Uniformity.of.Cell.Size     <fct> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3,...
## $ Uniformity.of.Cell.Shape    <fct> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3,...
## $ Marginal.Adhesion           <fct> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, ...
## $ Single.Epithelial.Cell.Size <fct> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, ...
## $ Bare.Nuclei                 <fct> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, ...
## $ Bland.Chromatin             <fct> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, ...
## $ Normal.Nucleoli             <fct> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, ...
## $ Mitoses                     <fct> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, ...
## $ Class                       <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ...
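
Before splitting, it is worth a quick sanity check on the class balance and on missing values; something along these lines (added here for illustration) would do:

# Class counts/proportions (0 = original label 2, 1 = original label 4) and NA check
table(dataset$Class)
prop.table(table(dataset$Class))
sum(is.na(dataset))   # expect 0 if the file is already cleaned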

Dataset partitioning

set.seed(123)
split = sample.split(dataset$Class, SplitRatio = 0.8)   # stratified 80/20 split on Class
training.set = subset(dataset, split == TRUE)
test.set = subset(dataset, split == FALSE)
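
sample.split stratifies on the response, which can be confirmed by comparing class proportions across the two partitions (an optional check, not in the original script):

# Class proportions should be nearly identical in both partitions
prop.table(table(training.set$Class))
prop.table(table(test.set$Class))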

Feature scaling

# Note: every column was converted to a factor above, so no predictor is numeric
# and nothing is actually rescaled here; this step only matters if the 1-10
# scores are kept numeric.
filter = sapply(dataset, is.numeric)
training.set[, filter] = scale(training.set[, filter])
test.set[, filter] = scale(test.set[, filter])

Model fitting

# Logistic regression
mod.lr = glm(Class ~ .,
             data = training.set,
             family = binomial)
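
Since broom is already loaded, a tidy coefficient table gives a quick look at the fitted logistic model; this is a convenience added here, not part of the original workflow:

# Tidy coefficient summary of the logistic model, smallest p-values first
tidy(mod.lr) %>% arrange(p.value) %>% head(10)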

# K-nearest neighbors - choosing k by test-set accuracy
library('class')
library('scales')
accuracy.set = data.frame(k = seq(1, 20, 1), accuracy = rep(0, 20))
for (i in 1:20) {
    y.pred = knn(train = training.set[, -10], 
                 test = test.set[, -10],
                 cl = training.set[, 10],
                 k = i)
    accuracy.set[i, 2] = confusionMatrix(y.pred, 
                                         test.set[, 10])$overall[1]
}
ggplot(data = accuracy.set, aes(x = k, y = accuracy)) +
    geom_point() +
    geom_line(linetype = 'dashed') +
    scale_x_continuous(breaks = pretty_breaks(nrow(accuracy.set))) +
    scale_y_continuous(breaks = pretty_breaks()) +
    labs(title = 'Accuracy vs K-value',
         subtitle = 'Choosing the best k',
         x = 'K-value',
         y = 'Accuracy')
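
The best-performing k can also be read off programmatically instead of from the plot:

# k with the highest test-set accuracy (ties resolved to the smallest k)
accuracy.set[which.max(accuracy.set$accuracy), ]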

# Support vector machine
library('e1071')
mod.svm = svm(Class ~ .,
              data = training.set,
              type = 'C-classification',
              kernel = 'linear')
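
The SVM's cost parameter is left at its default above; if tuning were wanted, e1071's tune() can grid-search it with 10-fold cross-validation. A sketch (the cost grid is an arbitrary choice of mine):

# Optional: grid-search the cost parameter for the linear SVM
set.seed(123)
svm.tune = tune(svm, Class ~ ., data = training.set,
                type = 'C-classification', kernel = 'linear',
                ranges = list(cost = 2^(-2:4)))
summary(svm.tune)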

# Kernel support vector machine
library('e1071')
mod.ksvm = svm(Class ~ .,
               data = training.set,
               type = 'C-classification',
               kernel = 'radial')

# Naive Bayes
library('e1071')
mod.nb = naiveBayes(x = training.set[, -10],
                    y = training.set[, 10])

# Decision tree - CART
library('rpart')
mod.dt.cart = rpart(Class ~ .,
                    data = training.set,
                    method = 'class',
                    control = rpart.control(xval = 10))
plotcp(mod.dt.cart)

# mod.dt.cart.p = prune(mod.dt.cart, cp = 0.013)
# plotcp(mod.dt.cart.p)
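
The cp in the commented-out pruning step looks like it was read off the plotcp() curve; it could equally be taken from the complexity table, e.g. the cp minimising cross-validated error (a sketch only; the unpruned tree is what gets evaluated later):

# cp with the lowest cross-validated error in the complexity table
best.cp = mod.dt.cart$cptable[which.min(mod.dt.cart$cptable[, 'xerror']), 'CP']
# mod.dt.cart.p = prune(mod.dt.cart, cp = best.cp)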

# Decision tree - C5.0
library('C50')
mod.dt.c50 = C5.0(Class ~ .,
                  data = training.set)

# Random forest
library('randomForest')
mod.rf = randomForest(Class ~ .,
                      data = training.set,
                      ntree = 500)
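
The fitted forest also provides a variable-importance ranking, which is often worth a quick look (added for illustration):

# Mean decrease in Gini impurity per predictor
importance(mod.rf)
# varImpPlot(mod.rf)   # same information as a plot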

Prediction & tuning

# Logistic regression
y.pred.prob = predict(mod.lr, type = 'response', 
                      newdata = test.set[, -10])
y.pred = factor(ifelse(y.pred.prob > 0.5, 1, 0), levels = c(0, 1))
cm = confusionMatrix(y.pred, test.set[, 10])
ac = round(as.numeric(cm$overall['Accuracy']) * 100, 2)
f1 = round(as.numeric(cm$byClass['F1']) * 100, 2)
Accuracy = c(ac)
F1 = c(f1)

# eval = data.frame(Accuracy = ac, F1 = f1)
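
The accuracy/F1 bookkeeping above is repeated verbatim for every remaining model; it could be collapsed into a small helper such as this sketch (the per-model chunks below are kept as originally written):

# Optional helper: accuracy and F1 (in %) from a confusion matrix
score = function(pred, truth) {
    cm = confusionMatrix(pred, truth)
    c(Accuracy = round(as.numeric(cm$overall['Accuracy']) * 100, 2),
      F1       = round(as.numeric(cm$byClass['F1']) * 100, 2))
}
# e.g. score(y.pred, test.set[, 10])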

# K-nearest neighbors (k = 6, chosen from the accuracy curve above)
y.pred = knn(train = training.set[, -10],
             test = test.set[, -10],
             cl = training.set[, 10],
             k = 6)
cm = confusionMatrix(y.pred, test.set[, 10])
ac = round(as.numeric(cm$overall['Accuracy']) * 100, 2)
f1 = round(as.numeric(cm$byClass['F1']) * 100, 2)
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)

# Support vector machine
y.pred = predict(mod.svm, newdata = test.set[, -10])
cm = confusionMatrix(y.pred, test.set[, 10])
ac = round(as.numeric(cm$overall['Accuracy']) * 100, 2)
f1 = round(as.numeric(cm$byClass['F1']) * 100, 2)
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)

# Kernel support vector machine
y.pred = predict(mod.ksvm, newdata = test.set[, -10])
cm = confusionMatrix(y.pred, test.set[, 10])
ac = round(as.numeric(cm$overall['Accuracy']) * 100, 2)
f1 = round(as.numeric(cm$byClass['F1']) * 100, 2)
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)

# Naive Bayes
y.pred = predict(mod.nb, newdata = test.set[, -10])
cm = confusionMatrix(y.pred, test.set[, 10])
ac = round(as.numeric(cm$overall['Accuracy']) * 100, 2)
f1 = round(as.numeric(cm$byClass['F1']) * 100, 2)
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)

# Decision tree - CART
y.pred = predict(mod.dt.cart, newdata = test.set[, -10], type = 'class')
cm = confusionMatrix(y.pred, test.set[, 10])
ac = round(as.numeric(cm$overall['Accuracy']) * 100, 2)
f1 = round(as.numeric(cm$byClass['F1']) * 100, 2)
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)

# Decision tree - C5.0
y.pred = predict(mod.dt.c50, newdata = test.set[, -10], type = 'class')
cm = confusionMatrix(y.pred, test.set[, 10])
ac = round(as.numeric(cm$overall['Accuracy']) * 100, 2)
f1 = round(as.numeric(cm$byClass['F1']) * 100, 2)
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)

# Random forest
y.pred = predict(mod.rf, newdata = test.set[, -10], type = 'class')
cm = confusionMatrix(y.pred, test.set[, 10])
ac = round(as.numeric(cm$overall['Accuracy']) * 100, 2)
f1 = round(as.numeric(cm$byClass['F1']) * 100, 2)
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)

Model evaluating

eval = cbind(Accuracy, F1)
rownames(eval) = c('LR',
                   'KNN',
                   'SVM',
                   'KSVM',
                   'NB',
                   'DT-CART',
                   'DT-C50',
                   'RF')
eval = as.data.frame(eval)
eval[order(-eval$Accuracy, -eval$F1), , drop = FALSE]
##         Accuracy    F1
## NB         97.81 98.29
## RF         97.81 98.29
## SVM        96.35 97.21
## KSVM       96.35 97.18
## KNN        95.62 96.63
## DT-C50     94.16 95.51
## LR         93.43 95.08
## DT-CART    93.43 94.92
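
These figures come from a single 80/20 split, so small gaps (and the NB/RF tie) should not be over-interpreted. With caret already loaded, a cross-validated estimate could be obtained along the lines below; the model and fold count are illustrative choices, not part of the original analysis:

# Example: 10-fold CV accuracy for the random forest on the full dataset
set.seed(123)
cv.rf = train(Class ~ ., data = dataset,
              method = 'rf',
              trControl = trainControl(method = 'cv', number = 10))
cv.rf$results
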
eval %>% 
    arrange(desc(Accuracy), desc(F1)) %>% 
    # label order below matches the sorted row order shown in the table above
    mutate(ML.name = factor(c('NB',
                              'RF',
                              'SVM',
                              'KSVM',
                              'KNN',
                              'DT-C50',
                              'LR',
                              'DT-CART'),
                            levels = c('NB',
                                       'RF',
                                       'SVM',
                                       'KSVM',
                                       'KNN',
                                       'DT-C50',
                                       'LR',
                                       'DT-CART'))) %>% 
    ggplot() +
    geom_col(aes(x = ML.name, y = Accuracy)) +
    coord_cartesian(ylim = c(90, 100)) +
    labs(title = 'Classification ML vs Accuracy',
         subtitle = 'Best: Naive Bayes (tied with Random Forest)',
         x = '')