# dplyr is needed for the pipe (%>%) and mutate() used below
library(dplyr)

df <- read.csv("breast_cancer.csv")
df %>% head() %>% reactable::reactable()
str(df)
#> 'data.frame': 569 obs. of 32 variables:
#> $ X : int 0 1 2 3 4 5 6 7 8 9 ...
#> $ mean.radius : num 18 20.6 19.7 11.4 20.3 ...
#> $ mean.texture : num 10.4 17.8 21.2 20.4 14.3 ...
#> $ mean.perimeter : num 122.8 132.9 130 77.6 135.1 ...
#> $ mean.area : num 1001 1326 1203 386 1297 ...
#> $ mean.smoothness : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
#> $ mean.compactness : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
#> $ mean.concavity : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
#> $ mean.concave.points : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
#> $ mean.symmetry : num 0.242 0.181 0.207 0.26 0.181 ...
#> $ mean.fractal.dimension : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
#> $ radius.error : num 1.095 0.543 0.746 0.496 0.757 ...
#> $ texture.error : num 0.905 0.734 0.787 1.156 0.781 ...
#> $ perimeter.error : num 8.59 3.4 4.58 3.44 5.44 ...
#> $ area.error : num 153.4 74.1 94 27.2 94.4 ...
#> $ smoothness.error : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
#> $ compactness.error : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
#> $ concavity.error : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
#> $ concave.points.error : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
#> $ symmetry.error : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
#> $ fractal.dimension.error: num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
#> $ worst.radius : num 25.4 25 23.6 14.9 22.5 ...
#> $ worst.texture : num 17.3 23.4 25.5 26.5 16.7 ...
#> $ worst.perimeter : num 184.6 158.8 152.5 98.9 152.2 ...
#> $ worst.area : num 2019 1956 1709 568 1575 ...
#> $ worst.smoothness : num 0.162 0.124 0.144 0.21 0.137 ...
#> $ worst.compactness : num 0.666 0.187 0.424 0.866 0.205 ...
#> $ worst.concavity : num 0.712 0.242 0.45 0.687 0.4 ...
#> $ worst.concave.points : num 0.265 0.186 0.243 0.258 0.163 ...
#> $ worst.symmetry : num 0.46 0.275 0.361 0.664 0.236 ...
#> $ worst.fractal.dimension: num 0.1189 0.089 0.0876 0.173 0.0768 ...
#> $ target : int 0 0 0 0 0 0 0 0 0 0 ...
colSums(is.na(df))
#> X mean.radius mean.texture
#> 0 0 0
#> mean.perimeter mean.area mean.smoothness
#> 0 0 0
#> mean.compactness mean.concavity mean.concave.points
#> 0 0 0
#> mean.symmetry mean.fractal.dimension radius.error
#> 0 0 0
#> texture.error perimeter.error area.error
#> 0 0 0
#> smoothness.error compactness.error concavity.error
#> 0 0 0
#> concave.points.error symmetry.error fractal.dimension.error
#> 0 0 0
#> worst.radius worst.texture worst.perimeter
#> 0 0 0
#> worst.area worst.smoothness worst.compactness
#> 0 0 0
#> worst.concavity worst.concave.points worst.symmetry
#> 0 0 0
#> worst.fractal.dimension target
#> 0 0
# drop the row-index column X and encode the target as a factor
df <- df[-c(1)]
df <- df %>%
  mutate(target = as.factor(target))
We are going to build a classification model for breast cancer using logistic regression. First, we check the class balance of the target variable:
prop.table(table(df$target))
#>
#> 0 1
#> 0.3725835 0.6274165
RNGkind(sample.kind = "Rounding")  # reproduce pre-R-3.6.0 sampling behavior
set.seed(123)

# random 60/20/20 split into train, test, and validation sets
spec <- c(train = .6, test = .2, validate = .2)
g <- sample(cut(
  seq(nrow(df)),
  nrow(df)*cumsum(c(0,spec)),
  labels = names(spec)
))
res <- split(df, g)
df_train <- res$train
df_valid <- res$validate
df_test <- res$test
nrow(df_train)
#> [1] 341
nrow(df_valid)
#> [1] 114
nrow(df_test)
#> [1] 114
prop.table(table(df_train$target))
#>
#> 0 1
#> 0.3577713 0.6422287
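Since the split is random rather than stratified, it is worth a quick sanity check (a small sketch, not part of the original pipeline) that the class balance is similar across all three splits:
# class proportions per split; exact values depend on the seed
sapply(res, function(d) prop.table(table(d$target)))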
# Stepwise selection: search between the intercept-only model and the
# full model, adding/dropping one term at a time by AIC
options(warn=-1)  # silence "fitted probabilities numerically 0 or 1" warnings
log_model_all <- glm(target ~ ., family="binomial", data= df_train)
log_model_nothing <- glm(target ~ 1, family="binomial", data= df_train)
log_model1 <- step(log_model_nothing,
                   list(lower=formula(log_model_nothing),
                        upper=formula(log_model_all)),
                   direction="both", trace = F, test = "Chisq")
formula(log_model1)
#> target ~ worst.texture + area.error + worst.concave.points +
#> compactness.error + worst.concavity + worst.symmetry
Rather than keeping the stepwise formula verbatim, we refit log_model1 with a manually adjusted set of predictors:
log_model1 <- glm(target ~ worst.perimeter + worst.smoothness + worst.texture +
    radius.error + worst.symmetry + compactness.error + mean.concavity +
    texture.error, family = "binomial", data = df_train)
summary(log_model1)
#>
#> Call:
#> glm(formula = target ~ worst.perimeter + worst.smoothness + worst.texture +
#> radius.error + worst.symmetry + compactness.error + mean.concavity +
#> texture.error, family = "binomial", data = df_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -3.2144 -0.0001 0.0032 0.0393 1.4812
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 48.0938 11.4322 4.207 0.0000259 ***
#> worst.perimeter -0.1811 0.0510 -3.551 0.000384 ***
#> worst.smoothness -67.6507 29.8999 -2.263 0.023662 *
#> worst.texture -0.4868 0.1502 -3.242 0.001187 **
#> radius.error -19.2880 6.9131 -2.790 0.005270 **
#> worst.symmetry -12.1824 7.4964 -1.625 0.104141
#> compactness.error 202.9279 67.6873 2.998 0.002717 **
#> mean.concavity -54.8910 17.2930 -3.174 0.001503 **
#> texture.error 2.7754 1.7886 1.552 0.120734
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 444.749 on 340 degrees of freedom
#> Residual deviance: 43.542 on 332 degrees of freedom
#> AIC: 61.542
#>
#> Number of Fisher Scoring iterations: 10
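Since logistic regression models the log-odds, exponentiating the coefficients turns them into odds ratios, which are often easier to read (a standard way to interpret glm output, not shown in the original):
# odds ratios: multiplicative change in the odds of target = 1
# per one-unit increase in each predictor
round(exp(coef(log_model1)), 4)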
library(GGally)
ggcorr(data = df, label = TRUE, label_size = 2, hjust = 1, layout.exp = 3)
Model 2 will focus on eliminating unwanted variables. The correlation heatmap immediately reveals multicollinearity between some of the variables: mean.radius has a correlation of (nearly) 1 with both mean.perimeter and mean.area. This is unsurprising, since all three columns essentially encode the same information, the physical size of the cell. We should therefore keep only one of the three in further analysis, and we will likewise drop other variables affected by multicollinearity.
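We can confirm this numerically with a quick check using base R's cor():
# pairwise correlations among the three "size" columns
round(cor(df[, c("mean.radius", "mean.perimeter", "mean.area")]), 3)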
log_model2 <- glm(formula = target ~ mean.radius + mean.texture + mean.smoothness + mean.compactness + mean.symmetry
+ mean.fractal.dimension + radius.error + symmetry.error + fractal.dimension.error + texture.error
+ smoothness.error + compactness.error, family = "binomial", data = df_train)
summary(log_model2)
#>
#> Call:
#> glm(formula = target ~ mean.radius + mean.texture + mean.smoothness +
#> mean.compactness + mean.symmetry + mean.fractal.dimension +
#> radius.error + symmetry.error + fractal.dimension.error +
#> texture.error + smoothness.error + compactness.error, family = "binomial",
#> data = df_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.78234 -0.02555 0.02811 0.13853 2.13286
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 38.11559 12.14361 3.139 0.00170 **
#> mean.radius -0.86508 0.31531 -2.744 0.00608 **
#> mean.texture -0.45065 0.09695 -4.648 0.00000334 ***
#> mean.smoothness -65.56257 48.64105 -1.348 0.17770
#> mean.compactness -29.65384 28.02503 -1.058 0.29000
#> mean.symmetry -37.14301 17.97881 -2.066 0.03883 *
#> mean.fractal.dimension -47.14917 142.01722 -0.332 0.73989
#> radius.error -8.36383 3.13033 -2.672 0.00754 **
#> symmetry.error 54.15842 50.11881 1.081 0.27987
#> fractal.dimension.error 768.13325 435.55428 1.764 0.07780 .
#> texture.error 1.21317 0.84282 1.439 0.15003
#> smoothness.error 89.35625 173.04422 0.516 0.60559
#> compactness.error 1.92703 60.46089 0.032 0.97457
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 444.75 on 340 degrees of freedom
#> Residual deviance: 100.55 on 328 degrees of freedom
#> AIC: 126.55
#>
#> Number of Fisher Scoring iterations: 8
From the summary we can see that the model is not yet satisfactory: several variables contribute little, with large p-values, such as compactness.error, mean.fractal.dimension, smoothness.error, and symmetry.error. We will build a better model by removing them.
log_model2 <- update(log_model2, .~. - compactness.error - mean.fractal.dimension - smoothness.error - symmetry.error)
summary(log_model2)
#>
#> Call:
#> glm(formula = target ~ mean.radius + mean.texture + mean.smoothness +
#> mean.compactness + mean.symmetry + radius.error + fractal.dimension.error +
#> texture.error, family = "binomial", data = df_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.82645 -0.02731 0.03235 0.15002 2.12382
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 35.15359 7.06699 4.974 0.000000655 ***
#> mean.radius -0.86587 0.23356 -3.707 0.000209 ***
#> mean.texture -0.44711 0.09491 -4.711 0.000002464 ***
#> mean.smoothness -74.87604 37.54524 -1.994 0.046121 *
#> mean.compactness -35.54625 15.84839 -2.243 0.024904 *
#> mean.symmetry -24.07145 14.14496 -1.702 0.088799 .
#> radius.error -7.15370 2.87996 -2.484 0.012993 *
#> fractal.dimension.error 804.09207 308.57385 2.606 0.009165 **
#> texture.error 1.41326 0.80223 1.762 0.078125 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 444.75 on 340 degrees of freedom
#> Residual deviance: 102.34 on 332 degrees of freedom
#> AIC: 120.34
#>
#> Number of Fisher Scoring iterations: 8
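Before the residual diagnostics, a quick side-by-side of the two candidates (a standard comparison using base R's AIC(); lower is better):
# compare the two fitted models by AIC
AIC(log_model1, log_model2)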
To assess how well each model fits the data, we draw normal plots of the residuals with simulated envelopes using the hnp package; points falling outside the envelope suggest model misfit.
library(hnp)
set.seed(123)
hnp(log_model1, halfnormal = F, paint.out = T, pch = 16, sim = 500, print.on = T)
#> Binomial model
hnp(log_model2, halfnormal = F, paint.out = T, pch = 16, sim = 500, print.on = T)
#> Binomial model
library(car)
vif(log_model1)
#> worst.perimeter worst.smoothness worst.texture radius.error
#> 2.439267 2.341580 6.435599 5.129347
#> worst.symmetry compactness.error mean.concavity texture.error
#> 1.577600 7.172741 7.102297 6.195302
vif(log_model2)
#> mean.radius mean.texture mean.smoothness
#> 2.240360 2.539338 3.330287
#> mean.compactness mean.symmetry radius.error
#> 5.240095 1.697849 1.743381
#> fractal.dimension.error texture.error
#> 3.752997 2.443215
All VIF values are below the conventional threshold of 10, so multicollinearity appears to be under control in both models. We now use log_model1 to make predictions on the validation data.
pred_model1 <- predict(object = log_model1,
newdata = df_valid,
type = "response")
head(pred_model1,5)
#> 4 5 8 11
#> 0.00007792152187 0.00000001417548 0.00078758956849 0.00173197592798
#> 16
#> 0.00000122888243
pred.label_model1 <- ifelse(test = pred_model1 > 0.5, "1", "0")
head(pred.label_model1,5)
#> 4 5 8 11 16
#> "0" "0" "0" "0" "0"
library(ggplot2)
# plot the distribution of predicted probabilities; pred_model1 was computed
# on df_valid, so we wrap it in its own data frame rather than pairing it
# with df_test
ggplot(data.frame(pred = pred_model1), aes(x = pred)) +
  geom_density(lwd = 0.5) +
  theme_minimal()
The density plot above shows the predicted probabilities piling up near 0 and 1: the model is highly confident about whether each observation is cancerous or not.
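We can quantify this confidence (an illustrative check; the 0.05/0.95 bounds are arbitrary thresholds chosen here for illustration):
# share of validation predictions with probability below 0.05 or above 0.95
mean(pred_model1 < 0.05 | pred_model1 > 0.95)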
library(caret)
confusionMatrix(data = as.factor(pred.label_model1),
reference = as.factor(df_valid$target),
positive = "1")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 44 0
#> 1 1 69
#>
#> Accuracy : 0.9912
#> 95% CI : (0.9521, 0.9998)
#> No Information Rate : 0.6053
#> P-Value [Acc > NIR] : <0.0000000000000002
#>
#> Kappa : 0.9816
#>
#> Mcnemar's Test P-Value : 1
#>
#> Sensitivity : 1.0000
#> Specificity : 0.9778
#> Pos Pred Value : 0.9857
#> Neg Pred Value : 1.0000
#> Prevalence : 0.6053
#> Detection Rate : 0.6053
#> Detection Prevalence : 0.6140
#> Balanced Accuracy : 0.9889
#>
#> 'Positive' Class : 1
#>
pred_model2 <- predict(object = log_model2,
newdata = df_valid,
type = "response")
pred.label_model2 <- ifelse(test = pred_model2 > 0.5, "1", "0")
confusionMatrix(data = as.factor(pred.label_model2),
reference = as.factor(df_valid$target),
positive = "1")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 41 3
#> 1 4 66
#>
#> Accuracy : 0.9386
#> 95% CI : (0.8776, 0.975)
#> No Information Rate : 0.6053
#> P-Value [Acc > NIR] : 0.0000000000000003171
#>
#> Kappa : 0.871
#>
#> Mcnemar's Test P-Value : 1
#>
#> Sensitivity : 0.9565
#> Specificity : 0.9111
#> Pos Pred Value : 0.9429
#> Neg Pred Value : 0.9318
#> Prevalence : 0.6053
#> Detection Rate : 0.5789
#> Detection Prevalence : 0.6140
#> Balanced Accuracy : 0.9338
#>
#> 'Positive' Class : 1
#>
Model 2 performs noticeably worse on the validation set (accuracy 0.9386 vs 0.9912 for model 1), and its predicted probabilities are less sharply separated:
ggplot(data.frame(pred = pred_model2), aes(x = pred)) +
  geom_density(lwd = 0.5) +
  theme_minimal()
To pick a classification cutoff for log_model1, we define a helper that computes recall, accuracy, precision, and specificity at a given threshold:
performa <- function(cutoff, prob, ref, postarget, negtarget)
{
  # classify at the cutoff, then extract metrics from caret's confusion matrix
  predict <- factor(ifelse(prob >= cutoff, postarget, negtarget))
  conf <- caret::confusionMatrix(predict, ref, positive = postarget)
  acc <- conf$overall[1]
  rec <- conf$byClass[1]   # sensitivity / recall
  prec <- conf$byClass[3]  # positive predictive value / precision
  spec <- conf$byClass[2]  # specificity
  mat <- t(as.matrix(c(rec, acc, prec, spec)))
  colnames(mat) <- c("recall", "accuracy", "precision", "specificity")
  return(mat)
}
# evaluate the metrics over a grid of 100 candidate cutoffs
co <- seq(0.01,0.80,length=100)
result <- matrix(0,100,4)
for(i in 1:100){
  result[i,] = performa(cutoff = co[i],
                        prob = pred_model1,
                        ref = as.factor(df_valid$target),
                        postarget = "1",
                        negtarget = "0")
}
data_frame("Recall" = result[,1],
"Accuracy" = result[,2],
"Precision" = result[,3],
"Specificity" = result[,4],
"Cutoff" = co) %>%
gather(key = "performa", value = "value", 1:4) %>%
ggplot(aes(x = Cutoff, y = value, col = performa)) +
geom_line(lwd = 1.5) +
scale_color_manual(values = c("darkred","darkgreen","orange", "blue")) +
scale_y_continuous(breaks = seq(0,1,0.1), limits = c(0,1)) +
scale_x_continuous(breaks = seq(0,1,0.1)) +
labs(title = "Tradeoff model perfomance") +
theme_minimal() +
theme(legend.position = "top",
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank())
The metrics are stable around a cutoff of 0.45, so we use 0.45 as the classification threshold on the test set.
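One way to back this choice up numerically (an illustrative heuristic, not the author's method) is to pick the cutoff on the grid where recall and specificity are most balanced:
# cutoff minimizing the gap between recall (col 1) and specificity (col 4)
co[which.min(abs(result[, 1] - result[, 4]))]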
pred_model_test <- predict(object = log_model1,
newdata = df_test,
type = "response")
pred.label_model_test <- ifelse(test = pred_model_test > 0.45, "1", "0")
confusionMatrix(data = as.factor(pred.label_model_test),
reference = as.factor(df_test$target),
positive = "1")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 43 0
#> 1 2 69
#>
#> Accuracy : 0.9825
#> 95% CI : (0.9381, 0.9979)
#> No Information Rate : 0.6053
#> P-Value [Acc > NIR] : <0.0000000000000002
#>
#> Kappa : 0.963
#>
#> Mcnemar's Test P-Value : 0.4795
#>
#> Sensitivity : 1.0000
#> Specificity : 0.9556
#> Pos Pred Value : 0.9718
#> Neg Pred Value : 1.0000
#> Prevalence : 0.6053
#> Detection Rate : 0.6053
#> Detection Prevalence : 0.6228
#> Balanced Accuracy : 0.9778
#>
#> 'Positive' Class : 1
#>