pacman::p_load(dplyr, broom, caTools, ggplot2, gridExtra, caret)
# Read the breast cancer data from CSV and print a compact overview of its
# columns (name, type and first few values).
dataset <- read.csv(file = "breast_cancer.csv")
dplyr::glimpse(dataset)
## Rows: 683
## Columns: 11
## $ Sample.code.number <int> 1000025, 1002945, 1015425, 1016277, 101...
## $ Clump.Thickness <int> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, ...
## $ Uniformity.of.Cell.Size <int> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3,...
## $ Uniformity.of.Cell.Shape <int> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3,...
## $ Marginal.Adhesion <int> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, ...
## $ Single.Epithelial.Cell.Size <int> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, ...
## $ Bare.Nuclei <int> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, ...
## $ Bland.Chromatin <int> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, ...
## $ Normal.Nucleoli <int> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, ...
## $ Mitoses <int> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, ...
## $ Class <int> 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, ...
# Count the missing values in each column, reported as a named vector.
vapply(dataset, function(column) sum(is.na(column)), numeric(1))
## Sample.code.number Clump.Thickness
## 0 0
## Uniformity.of.Cell.Size Uniformity.of.Cell.Shape
## 0 0
## Marginal.Adhesion Single.Epithelial.Cell.Size
## 0 0
## Bare.Nuclei Bland.Chromatin
## 0 0
## Normal.Nucleoli Mitoses
## 0 0
## Class
## 0
# Drop the first column (Sample.code.number), which is a sample identifier
# rather than a feature.
dataset <- dataset[-1]
# Convert every remaining column to a factor, in place.
# NOTE(review): the predictors appear to be 1-10 integer scores; encoding them
# as unordered factors gives the model one dummy variable per level. Confirm
# this is intended rather than keeping them numeric.
dataset[] <- lapply(dataset, as.factor)
# Relabel the outcome so that Class codes 2 and 4 become levels 0 and 1.
dataset$Class <- factor(dataset$Class, levels = c(2, 4), labels = c(0, 1))
# Hold-out split: 80 % of the rows for training, 20 % for testing.
# The seed makes the random assignment reproducible.
set.seed(123)
# sample.split() returns a logical mask (TRUE = training row) drawn so that the
# class proportions of dataset$Class are preserved in both parts.
split <- sample.split(dataset$Class, SplitRatio = 0.8)
# Index with the mask directly instead of comparing against T / F: those
# shorthands are ordinary variables that can be reassigned, and direct indexing
# also avoids subset() looking up `split` inside the data frame first.
training.set <- dataset[split, , drop = FALSE]
test.set <- dataset[!split, , drop = FALSE]
# Fit a logistic regression (binomial GLM) of Class on all other columns,
# using the training rows only.
mod <- glm(
  formula = Class ~ .,
  family = binomial,
  data = training.set
)
# Score the held-out rows with the fitted model; type = "response" yields
# predicted probabilities instead of values on the link scale.
y.pred.prob <- predict(mod, newdata = test.set, type = "response")
# Predict class 1 when the probability exceeds 0.5, otherwise class 0, and
# store the result as a factor with levels 0 and 1.
y.pred <- factor(ifelse(y.pred.prob > 0.5, 1, 0), levels = c(0, 1))
# Compare predictions with the true labels. The outcome column is referenced by
# name rather than by position, so this does not depend on column order.
cm <- confusionMatrix(data = y.pred, reference = test.set$Class)
# Extract accuracy by name from the overall statistics and report it as a
# percentage with two decimals.
ac <- format(round(cm$overall[["Accuracy"]] * 100, 2), nsmall = 2)
paste0("Accuracy: ", ac, " %")
## [1] "Accuracy: 93.43 %"
# Estimate accuracy with 10-fold cross-validation on the full dataset.
# The seed makes the fold assignment reproducible.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Same model as above (binomial GLM of Class on all other columns), fitted
# through caret so that it is resampled according to train.control.
mod.cv <- train(
  Class ~ .,
  data = dataset,
  method = "glm",
  family = binomial(),
  trControl = train.control
)
# Report the cross-validated accuracy as a percentage with two decimals.
ac <- format(round(100 * mod.cv$results$Accuracy, 2), nsmall = 2)
paste0("Accuracy: ", ac, " %")
## [1] "Accuracy: 93.27 %"