Logistic Regression

Importing the libraries

# dplyr/broom: data handling and tidy model output; caTools: train/test split;
# ggplot2/gridExtra: plotting; caret: confusion matrix and cross-validation
pacman::p_load(dplyr, broom, caTools, ggplot2, gridExtra, caret)

Importing the dataset

dataset = read.csv('breast_cancer.csv')
glimpse(dataset)
## Rows: 683
## Columns: 11
## $ Sample.code.number          <int> 1000025, 1002945, 1015425, 1016277, 101...
## $ Clump.Thickness             <int> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, ...
## $ Uniformity.of.Cell.Size     <int> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3,...
## $ Uniformity.of.Cell.Shape    <int> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3,...
## $ Marginal.Adhesion           <int> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, ...
## $ Single.Epithelial.Cell.Size <int> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, ...
## $ Bare.Nuclei                 <int> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, ...
## $ Bland.Chromatin             <int> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, ...
## $ Normal.Nucleoli             <int> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, ...
## $ Mitoses                     <int> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, ...
## $ Class                       <int> 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, ...
colSums(is.na(dataset))
##          Sample.code.number             Clump.Thickness 
##                           0                           0 
##     Uniformity.of.Cell.Size    Uniformity.of.Cell.Shape 
##                           0                           0 
##           Marginal.Adhesion Single.Epithelial.Cell.Size 
##                           0                           0 
##                 Bare.Nuclei             Bland.Chromatin 
##                           0                           0 
##             Normal.Nucleoli                     Mitoses 
##                           0                           0 
##                       Class 
##                           0
dataset = dataset[, -1]                           # drop the sample ID column
dataset = data.frame(lapply(dataset, as.factor))  # treat the 1-10 scores as categorical
dataset$Class = factor(dataset$Class,             # recode the outcome: 2 (benign) -> 0, 4 (malignant) -> 1
                       levels = c(2, 4), labels = c(0, 1))
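
As a quick, optional sanity check before modelling, the class balance can be inspected; the snippet below uses base R only (0 = benign, 1 = malignant after the recoding above).

prop.table(table(dataset$Class))   # share of benign (0) vs. malignant (1) cases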

Splitting the dataset into the training set and test set

set.seed(123)
split = sample.split(dataset$Class, SplitRatio = 0.8)  # stratified 80/20 split on Class
training.set = subset(dataset, split == TRUE)
test.set = subset(dataset, split == FALSE)
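
sample.split stratifies on Class, so both sets keep roughly the same benign/malignant ratio. For reference, an equivalent stratified split could also be drawn with caret's createDataPartition; the sketch below is illustrative only and its objects are not reused later.

set.seed(123)
idx = createDataPartition(dataset$Class, p = 0.8, list = FALSE)
training.set.alt = dataset[idx, ]
test.set.alt = dataset[-idx, ]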

Training the logistic regression model on the training set

mod = glm(Class ~ .,
          data = training.set,
          family = binomial)
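
Since broom is already loaded, the fitted coefficients can be inspected as a tidy data frame; this step is optional and does not affect the predictions below.

tidy(mod)   # estimates, standard errors, z statistics and p-values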

Predicting the test set results

y.pred.prob = predict(mod, type = 'response', newdata = test.set)   # predicted probabilities
y.pred = factor(ifelse(y.pred.prob > 0.5, 1, 0), levels = c(0, 1))  # classify at the 0.5 cut-off
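
The 0.5 cut-off on the predicted probability corresponds to a boundary at zero on the log-odds scale, because plogis(0) = 0.5. The snippet below is purely illustrative and should reproduce y.pred exactly.

y.pred.logit = predict(mod, type = 'link', newdata = test.set)   # linear predictor (log-odds)
identical(y.pred, factor(ifelse(y.pred.logit > 0, 1, 0), levels = c(0, 1)))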

Making the confusion matrix

cm = confusionMatrix(y.pred, test.set$Class)
ac = format(round(cm$overall['Accuracy'] * 100, 2), nsmall = 2)
paste0('Accuracy: ', ac, ' %')
## [1] "Accuracy: 93.43 %"

Computing the accuracy with k-fold cross-validation

set.seed(123)
train.control = trainControl(method = 'cv', number = 10)   # 10-fold cross-validation
mod.cv = train(Class ~ .,
               data = dataset,
               trControl = train.control,
               method = 'glm',
               family = binomial())
ac = format(round(mod.cv$results$Accuracy*100, 2), nsmall = 2)
paste0('Accuracy: ', ac, ' %')
## [1] "Accuracy: 93.27 %"