Package importing
pacman::p_load(dplyr, broom, caTools, ggplot2, gridExtra, caret)
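p_load() installs any packages that are missing before loading them, so the only hard prerequisite is pacman itself; if it may be absent, a one-line guard covers it:
if (!requireNamespace('pacman', quietly = TRUE)) install.packages('pacman')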
Dataset importing
dataset = read.csv('breast_cancer.csv')
dataset = dataset[, -1]                             # drop the sample-ID column
dataset = data.frame(lapply(dataset, as.factor))    # treat the 1-10 scores as categorical
dataset$Class = factor(dataset$Class,               # recode: 2 (benign) -> 0, 4 (malignant) -> 1
                       levels = c(2, 4), labels = c(0, 1))
glimpse(dataset)
## Rows: 683
## Columns: 10
## $ Clump.Thickness <fct> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, ...
## $ Uniformity.of.Cell.Size <fct> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3,...
## $ Uniformity.of.Cell.Shape <fct> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3,...
## $ Marginal.Adhesion <fct> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, ...
## $ Single.Epithelial.Cell.Size <fct> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, ...
## $ Bare.Nuclei <fct> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, ...
## $ Bland.Chromatin <fct> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, ...
## $ Normal.Nucleoli <fct> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, ...
## $ Mitoses <fct> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, ...
## $ Class <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ...
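Before partitioning, a quick look at the class balance (in this version of the Wisconsin dataset, roughly 65% benign vs 35% malignant) confirms the outcome is imbalanced but not severely so:
table(dataset$Class)   # 0 = benign, 1 = malignant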
Dataset partitioning
set.seed(123)
split = sample.split(dataset$Class, SplitRatio = 0.8)   # stratified on the Class label
training.set = subset(dataset, split == TRUE)
test.set = subset(dataset, split == FALSE)
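sample.split() stratifies on the label it is given, so both partitions should preserve that class ratio; a quick verification:
prop.table(table(training.set$Class))   # ~0.65 / 0.35
prop.table(table(test.set$Class))       # same proportions as the training set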
Feature scaling
# All columns were converted to factors above, so is.numeric() selects
# nothing here and both scale() calls are no-ops; the block is kept so the
# same pipeline also works on data with genuinely numeric features.
filter = sapply(dataset, is.numeric)
training.set[, filter] = scale(training.set[, filter])
test.set[, filter] = scale(test.set[, filter])
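As the comment above notes, this step is currently a no-op. For reference, this is what it would do if the nine 1-10 scores had been kept numeric; a minimal sketch on a hypothetical numeric copy of the data:
dataset.num = read.csv('breast_cancer.csv')[, -1]   # hypothetical numeric copy
dataset.num$Class = factor(dataset.num$Class, levels = c(2, 4), labels = c(0, 1))
num.filter = sapply(dataset.num, is.numeric)        # TRUE for the nine 1-10 scores
dataset.num[, num.filter] = scale(dataset.num[, num.filter])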
Model fitting
# Logistic regression
mod.lr = glm(Class ~ .,
             data = training.set,
             family = binomial)
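broom is loaded at the top but otherwise unused; it fits naturally here to inspect the fitted coefficients as a tidy data frame:
tidy(mod.lr)   # one row per coefficient: estimate, std.error, statistic, p.value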
# K-nearest neighbors: scan k = 1..20 and record the test-set accuracy for each
# (selecting k on the test set is convenient here, but slightly optimistic)
library('class')
library('scales')
accuracy.set = data.frame(k = seq(1, 20, 1), accuracy = rep(0, 20))
for (i in 1:20) {
  y.pred = knn(train = training.set[, -10],
               test = test.set[, -10],
               cl = training.set[, 10],
               k = i)
  accuracy.set[i, 2] = confusionMatrix(y.pred,
                                       test.set[, 10])$overall[1]
}
ggplot(data = accuracy.set, aes(x = k, y = accuracy)) +
  geom_point() +
  geom_line(linetype = 'dashed') +
  scale_x_continuous(breaks = pretty_breaks(nrow(accuracy.set))) +
  scale_y_continuous(breaks = pretty_breaks()) +
  labs(title = 'Accuracy vs K-value',
       subtitle = 'Choosing the best k for KNN',
       x = 'K-value',
       y = 'Accuracy')
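Rather than reading the best k off the plot, it can be extracted programmatically (ties resolve to the smallest k, since which.max() returns the first maximum):
best.k = accuracy.set$k[which.max(accuracy.set$accuracy)]
best.k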

# Support vector machine
library('e1071')
mod.svm = svm(Class ~ .,
              data = training.set,
              type = 'C-classification',
              kernel = 'linear')
# Kernel support vector machine
mod.ksvm = svm(Class ~ .,
               data = training.set,
               type = 'C-classification',
               kernel = 'radial')
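The radial kernel is fitted with default cost and gamma; e1071 also provides a cross-validated grid search that could replace the defaults. A sketch with an illustrative grid (these grid values are assumptions, not part of the original analysis):
# 10-fold CV grid search over cost and gamma (illustrative values)
tune.ksvm = tune(svm, Class ~ ., data = training.set,
                 ranges = list(cost = 10^(-1:2), gamma = 10^(-2:0)))
summary(tune.ksvm)   # tune.ksvm$best.model could then replace mod.ksvm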
# Naive Bayes
mod.nb = naiveBayes(x = training.set[, -10],
                    y = training.set[, 10])
# Decision tree - CART
library('rpart')
mod.dt.cart = rpart(Class ~ .,
                    data = training.set,
                    method = 'class',
                    control = rpart.control(xval = 10))   # 10-fold CV for the cp table
plotcp(mod.dt.cart)

# mod.dt.cart.p = prune(mod.dt.cart, cp = 0.013)
# plotcp(mod.dt.cart.p)
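The commented-out cp = 0.013 comes from eyeballing the plot; the cross-validation table stored in the fitted model gives the value directly:
cp.table = mod.dt.cart$cptable
cp.opt = cp.table[which.min(cp.table[, 'xerror']), 'CP']   # cp minimizing CV error
mod.dt.cart.p = prune(mod.dt.cart, cp = cp.opt)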
# Decision tree - C5.0
library('C50')
mod.dt.c50 = C5.0(Class ~ .,
                  data = training.set)
# Random forest
library('randomForest')
mod.rf = randomForest(Class ~ .,
                      data = training.set,
                      ntree = 500)
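Although not used in the comparison below, the fitted forest also records per-feature importance, a cheap sanity check on what drives its predictions:
importance(mod.rf)   # mean decrease in Gini per predictor
varImpPlot(mod.rf)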
Prediction & Tuning
# Logistic regression
y.pred.prob = predict(mod.lr, type = 'response',
                      newdata = test.set[, -10])
y.pred = factor(ifelse(y.pred.prob > 0.5, 1, 0), levels = c(0, 1))
cm = confusionMatrix(y.pred, test.set[, 10])
ac = format(round(cm$overall[1]*100, 2), nsmall = 2) %>% as.numeric()
f1 = format(round(cm$byClass['F1']*100, 2), nsmall = 2) %>% as.numeric()
Accuracy = c(ac)
F1 = c(f1)
# K-nearest neighbors (k = 6, the best value from the tuning curve above)
y.pred = knn(train = training.set[, -10],
             test = test.set[, -10],
             cl = training.set[, 10],
             k = 6)
cm = confusionMatrix(y.pred, test.set[, 10])
ac = format(round(cm$overall[1]*100, 2), nsmall = 2) %>% as.numeric()
f1 = format(round(cm$byClass['F1']*100, 2), nsmall = 2) %>% as.numeric()
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)
# Support vector machine
y.pred = predict(mod.svm, newdata = test.set[, -10])
cm = confusionMatrix(y.pred, test.set[, 10])
ac = format(round(cm$overall[1]*100, 2), nsmall = 2) %>% as.numeric()
f1 = format(round(cm$byClass['F1']*100, 2), nsmall = 2) %>% as.numeric()
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)
# Kernel support vector machine
y.pred = predict(mod.ksvm, newdata = test.set[, -10])
cm = confusionMatrix(y.pred, test.set[, 10])
ac = format(round(cm$overall[1]*100, 2), nsmall = 2) %>% as.numeric()
f1 = format(round(cm$byClass['F1']*100, 2), nsmall = 2) %>% as.numeric()
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)
# Naive Bayes
y.pred = predict(mod.nb, newdata = test.set[, -10])
cm = confusionMatrix(y.pred, test.set[, 10])
ac = format(round(cm$overall[1]*100, 2), nsmall = 2) %>% as.numeric()
f1 = format(round(cm$byClass['F1']*100, 2), nsmall = 2) %>% as.numeric()
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)
# Decision tree - CART
y.pred = predict(mod.dt.cart, newdata = test.set[, -10], type = 'class')
cm = confusionMatrix(y.pred, test.set[, 10])
ac = format(round(cm$overall[1]*100, 2), nsmall = 2) %>% as.numeric()
f1 = format(round(cm$byClass['F1']*100, 2), nsmall = 2) %>% as.numeric()
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)
# Decision tree - C5.0
y.pred = predict(mod.dt.c50, newdata = test.set[, -10], type = 'class')
cm = confusionMatrix(y.pred, test.set[, 10])
ac = format(round(cm$overall[1]*100, 2), nsmall = 2) %>% as.numeric()
f1 = format(round(cm$byClass['F1']*100, 2), nsmall = 2) %>% as.numeric()
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)
# Random forest
y.pred = predict(mod.rf, newdata = test.set[, -10], type = 'class')
cm = confusionMatrix(y.pred, test.set[, 10])
ac = format(round(cm$overall[1]*100, 2), nsmall = 2) %>% as.numeric()
f1 = format(round(cm$byClass['F1']*100, 2), nsmall = 2) %>% as.numeric()
Accuracy = append(Accuracy, ac)
F1 = append(F1, f1)
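The eight prediction blocks above repeat the same three evaluation lines; a small helper would collapse them. A sketch (the function name evaluate is ours, not from the original code):
# Hypothetical helper wrapping the repeated confusionMatrix pattern
evaluate = function(y.pred, y.true) {
  cm = confusionMatrix(y.pred, y.true)
  c(Accuracy = as.numeric(round(cm$overall['Accuracy'] * 100, 2)),
    F1 = as.numeric(round(cm$byClass['F1'] * 100, 2)))
}
# e.g.: evaluate(predict(mod.svm, newdata = test.set[, -10]), test.set[, 10])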
Model evaluating
eval = cbind(Accuracy, F1)
rownames(eval) = c('LR', 'KNN', 'SVM', 'KSVM', 'NB',
                   'DT-CART', 'DT-C50', 'RF')
eval = as.data.frame(eval)
eval[order(-eval$Accuracy, -eval$F1), , drop = FALSE]
## Accuracy F1
## NB 97.81 98.29
## RF 97.81 98.29
## SVM 96.35 97.21
## KSVM 96.35 97.18
## KNN 95.62 96.63
## DT-C50 94.16 95.51
## LR 93.43 95.08
## DT-CART 93.43 94.92
eval %>%
  tibble::rownames_to_column('ML.name') %>%
  arrange(desc(Accuracy), desc(F1)) %>%
  mutate(ML.name = factor(ML.name, levels = ML.name)) %>%
  ggplot() +
  geom_col(aes(x = ML.name, y = Accuracy)) +
  coord_cartesian(ylim = c(90, 100)) +
  labs(title = 'Classification ML vs Accuracy',
       subtitle = 'Best: Naive Bayes (tied with Random Forest)',
       x = '')
