1 Library Preparation

2 Import Data

df <- read.csv("breast_cancer.csv")
df %>% head() %>% reactable::reactable()
str(df)
#> 'data.frame':    569 obs. of  32 variables:
#>  $ X                      : int  0 1 2 3 4 5 6 7 8 9 ...
#>  $ mean.radius            : num  18 20.6 19.7 11.4 20.3 ...
#>  $ mean.texture           : num  10.4 17.8 21.2 20.4 14.3 ...
#>  $ mean.perimeter         : num  122.8 132.9 130 77.6 135.1 ...
#>  $ mean.area              : num  1001 1326 1203 386 1297 ...
#>  $ mean.smoothness        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
#>  $ mean.compactness       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
#>  $ mean.concavity         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
#>  $ mean.concave.points    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
#>  $ mean.symmetry          : num  0.242 0.181 0.207 0.26 0.181 ...
#>  $ mean.fractal.dimension : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
#>  $ radius.error           : num  1.095 0.543 0.746 0.496 0.757 ...
#>  $ texture.error          : num  0.905 0.734 0.787 1.156 0.781 ...
#>  $ perimeter.error        : num  8.59 3.4 4.58 3.44 5.44 ...
#>  $ area.error             : num  153.4 74.1 94 27.2 94.4 ...
#>  $ smoothness.error       : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
#>  $ compactness.error      : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
#>  $ concavity.error        : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
#>  $ concave.points.error   : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
#>  $ symmetry.error         : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
#>  $ fractal.dimension.error: num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
#>  $ worst.radius           : num  25.4 25 23.6 14.9 22.5 ...
#>  $ worst.texture          : num  17.3 23.4 25.5 26.5 16.7 ...
#>  $ worst.perimeter        : num  184.6 158.8 152.5 98.9 152.2 ...
#>  $ worst.area             : num  2019 1956 1709 568 1575 ...
#>  $ worst.smoothness       : num  0.162 0.124 0.144 0.21 0.137 ...
#>  $ worst.compactness      : num  0.666 0.187 0.424 0.866 0.205 ...
#>  $ worst.concavity        : num  0.712 0.242 0.45 0.687 0.4 ...
#>  $ worst.concave.points   : num  0.265 0.186 0.243 0.258 0.163 ...
#>  $ worst.symmetry         : num  0.46 0.275 0.361 0.664 0.236 ...
#>  $ worst.fractal.dimension: num  0.1189 0.089 0.0876 0.173 0.0768 ...
#>  $ target                 : int  0 0 0 0 0 0 0 0 0 0 ...

3 Data Pre-processing

3.1 Check for Missing Values

colSums(is.na(df))
#>                       X             mean.radius            mean.texture 
#>                       0                       0                       0 
#>          mean.perimeter               mean.area         mean.smoothness 
#>                       0                       0                       0 
#>        mean.compactness          mean.concavity     mean.concave.points 
#>                       0                       0                       0 
#>           mean.symmetry  mean.fractal.dimension            radius.error 
#>                       0                       0                       0 
#>           texture.error         perimeter.error              area.error 
#>                       0                       0                       0 
#>        smoothness.error       compactness.error         concavity.error 
#>                       0                       0                       0 
#>    concave.points.error          symmetry.error fractal.dimension.error 
#>                       0                       0                       0 
#>            worst.radius           worst.texture         worst.perimeter 
#>                       0                       0                       0 
#>              worst.area        worst.smoothness       worst.compactness 
#>                       0                       0                       0 
#>         worst.concavity    worst.concave.points          worst.symmetry 
#>                       0                       0                       0 
#> worst.fractal.dimension                  target 
#>                       0                       0

3.2 Drop Column X (Index)

df <- df[-c(1)]

3.3 State Columns into Appropriate Class

df <- df %>% 
  mutate(target = as.factor(target))

4 Objective

We are going to make a machine learning classification of breast cancer using logistic regression

prop.table(table(df$target))
#> 
#>         0         1 
#> 0.3725835 0.6274165

5 Cross-Validation

RNGkind(sample.kind = "Rounding")
set.seed(123)

spec <- c(train = .6, test = .2, validate = .2)

g <- sample(cut(
  seq(nrow(df)), 
  nrow(df)*cumsum(c(0,spec)),
  labels = names(spec)
))

res <- split(df, g)
df_train <- res$train
df_valid <- res$validate
df_test <- res$test
nrow(df_train)
#> [1] 341
nrow(df_valid)
#> [1] 114
nrow(df_test)
#> [1] 114
prop.table(table(df_train$target))
#> 
#>         0         1 
#> 0.3577713 0.6422287
# stepwise: automatic model selection method
options(warn=-1)
log_model_all <- glm(target ~ ., family="binomial", data= df_train)
log_model_nothing <- glm(target ~ 1, family="binomial", data= df_train)

log_model1 <- step(log_model_nothing, 
                 list(lower=formula(log_model_nothing),
                      upper=formula(log_model_all)),
                 direction="both", trace = F, test= "F")

formula(log_model1)
#> target ~ worst.texture + area.error + worst.concave.points + 
#>     compactness.error + worst.concavity + worst.symmetry
log_model1 <- glm(target ~ worst.perimeter + worst.smoothness + worst.texture + 
                  radius.error + worst.symmetry + compactness.error + mean.concavity + 
                  texture.error, family = "binomial", data = df_train)
summary(log_model1)
#> 
#> Call:
#> glm(formula = target ~ worst.perimeter + worst.smoothness + worst.texture + 
#>     radius.error + worst.symmetry + compactness.error + mean.concavity + 
#>     texture.error, family = "binomial", data = df_train)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -3.2144  -0.0001   0.0032   0.0393   1.4812  
#> 
#> Coefficients:
#>                   Estimate Std. Error z value  Pr(>|z|)    
#> (Intercept)        48.0938    11.4322   4.207 0.0000259 ***
#> worst.perimeter    -0.1811     0.0510  -3.551  0.000384 ***
#> worst.smoothness  -67.6507    29.8999  -2.263  0.023662 *  
#> worst.texture      -0.4868     0.1502  -3.242  0.001187 ** 
#> radius.error      -19.2880     6.9131  -2.790  0.005270 ** 
#> worst.symmetry    -12.1824     7.4964  -1.625  0.104141    
#> compactness.error 202.9279    67.6873   2.998  0.002717 ** 
#> mean.concavity    -54.8910    17.2930  -3.174  0.001503 ** 
#> texture.error       2.7754     1.7886   1.552  0.120734    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 444.749  on 340  degrees of freedom
#> Residual deviance:  43.542  on 332  degrees of freedom
#> AIC: 61.542
#> 
#> Number of Fisher Scoring iterations: 10

6 Model 2

ggcorr(data = df, label = TRUE, label_size = 2, hjust = 1, layout.exp = 3)

Model 2 is going to focus on the elimination of unwanted variables. We can immediately verify the presence of multicollinearity between some of the variables. The mean.radius column has a correlation of 1 and 1 with mean.perimeter and mean.area columns, respectively. This is probably because the three columns essentially contain the same information, which is the physical size of the observation (the cell). Therefore we should only pick one of the three columns when we go into further analysis. And we will also remove other variable that has a multicollinearity effect on it.

log_model2 <- glm(formula = target ~ mean.radius + mean.texture + mean.smoothness + mean.compactness + mean.symmetry
                  + mean.fractal.dimension + radius.error + symmetry.error + fractal.dimension.error + texture.error
                  + smoothness.error + compactness.error, family = "binomial", data = df_train)
summary(log_model2)
#> 
#> Call:
#> glm(formula = target ~ mean.radius + mean.texture + mean.smoothness + 
#>     mean.compactness + mean.symmetry + mean.fractal.dimension + 
#>     radius.error + symmetry.error + fractal.dimension.error + 
#>     texture.error + smoothness.error + compactness.error, family = "binomial", 
#>     data = df_train)
#> 
#> Deviance Residuals: 
#>      Min        1Q    Median        3Q       Max  
#> -2.78234  -0.02555   0.02811   0.13853   2.13286  
#> 
#> Coefficients:
#>                          Estimate Std. Error z value   Pr(>|z|)    
#> (Intercept)              38.11559   12.14361   3.139    0.00170 ** 
#> mean.radius              -0.86508    0.31531  -2.744    0.00608 ** 
#> mean.texture             -0.45065    0.09695  -4.648 0.00000334 ***
#> mean.smoothness         -65.56257   48.64105  -1.348    0.17770    
#> mean.compactness        -29.65384   28.02503  -1.058    0.29000    
#> mean.symmetry           -37.14301   17.97881  -2.066    0.03883 *  
#> mean.fractal.dimension  -47.14917  142.01722  -0.332    0.73989    
#> radius.error             -8.36383    3.13033  -2.672    0.00754 ** 
#> symmetry.error           54.15842   50.11881   1.081    0.27987    
#> fractal.dimension.error 768.13325  435.55428   1.764    0.07780 .  
#> texture.error             1.21317    0.84282   1.439    0.15003    
#> smoothness.error         89.35625  173.04422   0.516    0.60559    
#> compactness.error         1.92703   60.46089   0.032    0.97457    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 444.75  on 340  degrees of freedom
#> Residual deviance: 100.55  on 328  degrees of freedom
#> AIC: 126.55
#> 
#> Number of Fisher Scoring iterations: 8

From this we can see that our model is not perfect, there are some variables that strongly affect our model such as compactness.error, mean.fractal.dimension and smoothness.error. We are going to make the better model by removing those variables.

log_model2 <- update(log_model2, .~. - compactness.error - mean.fractal.dimension - smoothness.error - symmetry.error)
summary(log_model2)
#> 
#> Call:
#> glm(formula = target ~ mean.radius + mean.texture + mean.smoothness + 
#>     mean.compactness + mean.symmetry + radius.error + fractal.dimension.error + 
#>     texture.error, family = "binomial", data = df_train)
#> 
#> Deviance Residuals: 
#>      Min        1Q    Median        3Q       Max  
#> -2.82645  -0.02731   0.03235   0.15002   2.12382  
#> 
#> Coefficients:
#>                          Estimate Std. Error z value    Pr(>|z|)    
#> (Intercept)              35.15359    7.06699   4.974 0.000000655 ***
#> mean.radius              -0.86587    0.23356  -3.707    0.000209 ***
#> mean.texture             -0.44711    0.09491  -4.711 0.000002464 ***
#> mean.smoothness         -74.87604   37.54524  -1.994    0.046121 *  
#> mean.compactness        -35.54625   15.84839  -2.243    0.024904 *  
#> mean.symmetry           -24.07145   14.14496  -1.702    0.088799 .  
#> radius.error             -7.15370    2.87996  -2.484    0.012993 *  
#> fractal.dimension.error 804.09207  308.57385   2.606    0.009165 ** 
#> texture.error             1.41326    0.80223   1.762    0.078125 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 444.75  on 340  degrees of freedom
#> Residual deviance: 102.34  on 332  degrees of freedom
#> AIC: 120.34
#> 
#> Number of Fisher Scoring iterations: 8

6.1 Half-Normal Plots

This is used to compare the relative magnitude of effects.

6.1.1 Model 1

library(hnp)
set.seed(123)
hnp(log_model1, halfnormal = F, paint.out = T, pch = 16, sim = 500, print.on = T)
#> Binomial model

6.1.2 Model 2

hnp(log_model2, halfnormal = F, paint.out = T, pch = 16, sim = 500, print.on = T)
#> Binomial model

6.2 Multicolinearity

6.2.1 Model 1

library(car)
vif(log_model1)
#>   worst.perimeter  worst.smoothness     worst.texture      radius.error 
#>          2.439267          2.341580          6.435599          5.129347 
#>    worst.symmetry compactness.error    mean.concavity     texture.error 
#>          1.577600          7.172741          7.102297          6.195302

6.2.2 Model 2

vif(log_model2)
#>             mean.radius            mean.texture         mean.smoothness 
#>                2.240360                2.539338                3.330287 
#>        mean.compactness           mean.symmetry            radius.error 
#>                5.240095                1.697849                1.743381 
#> fractal.dimension.error           texture.error 
#>                3.752997                2.443215

7 Model Prediction

7.1 Model 1

Using log_model1 to make a prediction from validation data

pred_model1 <- predict(object = log_model1,
                               newdata = df_valid,
                               type = "response")
head(pred_model1,5)
#>                4                5                8               11 
#> 0.00007792152187 0.00000001417548 0.00078758956849 0.00173197592798 
#>               16 
#> 0.00000122888243
pred.label_model1 <- ifelse(test = pred_model1 > 0.5, "1", "0")
head(pred.label_model1,5)
#>   4   5   8  11  16 
#> "0" "0" "0" "0" "0"
library(ggplot2)
ggplot(df_test, aes(x = pred_model1)) +
  geom_density(lwd = 0.5) +
  theme_minimal()

From this we can see that our model is very confident that the target is cancerous or not cancerous by density plot above.

library(caret)
confusionMatrix(data = as.factor(pred.label_model1),
                reference = as.factor(df_valid$target),
                positive = "1")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  0  1
#>          0 44  0
#>          1  1 69
#>                                              
#>                Accuracy : 0.9912             
#>                  95% CI : (0.9521, 0.9998)   
#>     No Information Rate : 0.6053             
#>     P-Value [Acc > NIR] : <0.0000000000000002
#>                                              
#>                   Kappa : 0.9816             
#>                                              
#>  Mcnemar's Test P-Value : 1                  
#>                                              
#>             Sensitivity : 1.0000             
#>             Specificity : 0.9778             
#>          Pos Pred Value : 0.9857             
#>          Neg Pred Value : 1.0000             
#>              Prevalence : 0.6053             
#>          Detection Rate : 0.6053             
#>    Detection Prevalence : 0.6140             
#>       Balanced Accuracy : 0.9889             
#>                                              
#>        'Positive' Class : 1                  
#> 

7.2 Model 2

pred_model2 <- predict(object = log_model2,
                               newdata = df_valid,
                               type = "response")
pred.label_model2 <- ifelse(test = pred_model2 > 0.5, "1", "0")
confusionMatrix(data = as.factor(pred.label_model2),
                reference = as.factor(df_valid$target),
                positive = "1")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  0  1
#>          0 41  3
#>          1  4 66
#>                                                
#>                Accuracy : 0.9386               
#>                  95% CI : (0.8776, 0.975)      
#>     No Information Rate : 0.6053               
#>     P-Value [Acc > NIR] : 0.0000000000000003171
#>                                                
#>                   Kappa : 0.871                
#>                                                
#>  Mcnemar's Test P-Value : 1                    
#>                                                
#>             Sensitivity : 0.9565               
#>             Specificity : 0.9111               
#>          Pos Pred Value : 0.9429               
#>          Neg Pred Value : 0.9318               
#>              Prevalence : 0.6053               
#>          Detection Rate : 0.5789               
#>    Detection Prevalence : 0.6140               
#>       Balanced Accuracy : 0.9338               
#>                                                
#>        'Positive' Class : 1                    
#> 
ggplot(df_test, aes(x = pred_model2)) +
  geom_density(lwd = 0.5) +
  theme_minimal()

8 Model 1 Cut-Off

performa <- function(cutoff, prob, ref, postarget, negtarget) 
{
  predict <- factor(ifelse(prob >= cutoff, postarget, negtarget))
  conf <- caret::confusionMatrix(predict , ref, positive = postarget)
  acc <- conf$overall[1]
  rec <- conf$byClass[1]
  prec <- conf$byClass[3]
  spec <- conf$byClass[2]
  mat <- t(as.matrix(c(rec , acc , prec, spec))) 
  colnames(mat) <- c("recall", "accuracy", "precicion", "specificity")
  return(mat)
}

co <- seq(0.01,0.80,length=100)
result <- matrix(0,100,4)

for(i in 1:100){
  result[i,] = performa(cutoff = co[i], 
                     prob = pred_model1, 
                     ref = as.factor(df_valid$target), 
                     postarget = "1", 
                     negtarget = "0")
}

data_frame("Recall" = result[,1],
           "Accuracy" = result[,2],
           "Precision" = result[,3],
           "Specificity" = result[,4],
           "Cutoff" = co) %>% 
  gather(key = "performa", value = "value", 1:4) %>% 
  ggplot(aes(x = Cutoff, y = value, col = performa)) +
  geom_line(lwd = 1.5) +
  scale_color_manual(values = c("darkred","darkgreen","orange", "blue")) +
  scale_y_continuous(breaks = seq(0,1,0.1), limits = c(0,1)) +
  scale_x_continuous(breaks = seq(0,1,0.1)) +
  labs(title = "Tradeoff model perfomance") +
  theme_minimal() +
  theme(legend.position = "top",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())

Cut-off that are stable is in 0.45, from this we can use 0.45 for our data validation

9 Data Test Prediction

pred_model_test <- predict(object = log_model1,
                               newdata = df_test,
                               type = "response")
pred.label_model_test <- ifelse(test = pred_model_test > 0.45, "1", "0")
confusionMatrix(data = as.factor(pred.label_model_test),
                reference = as.factor(df_test$target),
                positive = "1")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  0  1
#>          0 43  0
#>          1  2 69
#>                                              
#>                Accuracy : 0.9825             
#>                  95% CI : (0.9381, 0.9979)   
#>     No Information Rate : 0.6053             
#>     P-Value [Acc > NIR] : <0.0000000000000002
#>                                              
#>                   Kappa : 0.963              
#>                                              
#>  Mcnemar's Test P-Value : 0.4795             
#>                                              
#>             Sensitivity : 1.0000             
#>             Specificity : 0.9556             
#>          Pos Pred Value : 0.9718             
#>          Neg Pred Value : 1.0000             
#>              Prevalence : 0.6053             
#>          Detection Rate : 0.6053             
#>    Detection Prevalence : 0.6228             
#>       Balanced Accuracy : 0.9778             
#>                                              
#>        'Positive' Class : 1                  
#> 

— End —