Load and Split Data
Cancer <- read.csv("C:/Users/lukew/Downloads/Cancer_Data.csv")
Cancer$diagnosis <- factor(Cancer$diagnosis)
cancer <- Cancer[, 2:12]
names(cancer) <- gsub(" ", "_", names(cancer))
set.seed(123)
train_index <- createDataPartition(cancer$diagnosis, p = 0.6, list = FALSE)
train_data <- cancer[train_index, ]
temp_data <- cancer[-train_index, ]
val_index <- createDataPartition(temp_data$diagnosis, p = 0.5, list = FALSE)
val_data <- temp_data[val_index, ]
test_data <- temp_data[-val_index, ]
Exploratory Visualization
ggplot(cancer, aes(x = diagnosis, y = after_stat(count)/nrow(cancer), fill = diagnosis)) +
geom_bar() +
xlab('Diagnosis') +
ylab('Percent of Participants') +
theme_classic()

ggbetweenstats Boxplots for All Predictors
dynamic_ggbetweenstats <- function(data, x, y_var) {
ggbetweenstats(
data = data,
x = {{ x }},
y = !!sym(y_var),
type = "parametric",
messages = FALSE,
results.subtitle = TRUE,
title = paste("Distribution of", y_var, "by Diagnosis"),
xlab = "Diagnosis",
ylab = y_var,
mean.ci = TRUE
)
}
predictor_vars <- names(cancer)[names(cancer) != "diagnosis"]
lapply(predictor_vars, function(var) print(dynamic_ggbetweenstats(cancer, x = diagnosis, y_var = var)))










## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

##
## [[10]]

Initial Model on Training Data
glm_train <- glm(diagnosis ~ ., data = train_data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm_train)
##
## Call:
## glm(formula = diagnosis ~ ., family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -19.55400 19.41254 -1.007 0.3138
## radius_mean -5.98061 6.34918 -0.942 0.3462
## texture_mean 0.43264 0.09693 4.463 8.07e-06 ***
## perimeter_mean 0.67384 0.85582 0.787 0.4311
## area_mean 0.03366 0.02416 1.394 0.1635
## smoothness_mean 88.11443 51.69149 1.705 0.0883 .
## compactness_mean -35.35619 27.75297 -1.274 0.2027
## concavity_mean 40.42102 18.08332 2.235 0.0254 *
## concave.points_mean 15.52588 42.02118 0.369 0.7118
## symmetry_mean 4.69433 16.64788 0.282 0.7780
## fractal_dimension_mean 43.60160 123.75809 0.352 0.7246
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 453.189 on 342 degrees of freedom
## Residual deviance: 81.851 on 332 degrees of freedom
## AIC: 103.85
##
## Number of Fisher Scoring iterations: 9
confint(glm_train)
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 2.5 % 97.5 %
## (Intercept) -60.21094720 16.87844859
## radius_mean -19.35544410 5.56057133
## texture_mean 0.26161476 0.64733334
## perimeter_mean -0.91562498 2.45420077
## area_mean -0.01525731 0.08134466
## smoothness_mean -11.05561868 193.60041812
## compactness_mean -92.65397864 18.23820199
## concavity_mean 6.64741182 78.35162994
## concave.points_mean -66.46845268 99.82932277
## symmetry_mean -27.88784296 38.32262580
## fractal_dimension_mean -199.65300237 293.17078709
Reduced Model on Validation Set
glm_valid_refined <- glm(diagnosis ~ texture_mean + concavity_mean,
data = val_data, family = binomial)
summary(glm_valid_refined)
##
## Call:
## glm(formula = diagnosis ~ texture_mean + concavity_mean, family = binomial,
## data = val_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.79432 1.62437 -4.183 2.88e-05 ***
## texture_mean 0.19993 0.07436 2.688 0.00718 **
## concavity_mean 23.57304 4.53201 5.201 1.98e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 149.125 on 112 degrees of freedom
## Residual deviance: 90.238 on 110 degrees of freedom
## AIC: 96.238
##
## Number of Fisher Scoring iterations: 5
confint(glm_valid_refined)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -10.26788737 -3.830256
## texture_mean 0.05855395 0.354101
## concavity_mean 15.50635993 33.433742
val_data$predicted_prob_refined <- predict(glm_valid_refined, newdata = val_data, type = "response")
val_data$predicted_class_refined <- factor(ifelse(val_data$predicted_prob_refined > 0.5, "M", "B"),
levels = levels(val_data$diagnosis))
confusionMatrix(val_data$predicted_class_refined, val_data$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 63 10
## M 8 32
##
## Accuracy : 0.8407
## 95% CI : (0.76, 0.9028)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : 6.001e-07
##
## Kappa : 0.6556
##
## Mcnemar's Test P-Value : 0.8137
##
## Sensitivity : 0.8873
## Specificity : 0.7619
## Pos Pred Value : 0.8630
## Neg Pred Value : 0.8000
## Prevalence : 0.6283
## Detection Rate : 0.5575
## Detection Prevalence : 0.6460
## Balanced Accuracy : 0.8246
##
## 'Positive' Class : B
##
Model Comparison
base_formula <- diagnosis ~ texture_mean + concavity_mean
additional_vars <- c("area_mean", "smoothness_mean", "radius_mean", "compactness_mean", "symmetry_mean")
model_list <- list(Model_1 = glm(base_formula, data = val_data, family = binomial))
results_df <- data.frame()
for (i in seq_along(additional_vars)) {
var <- additional_vars[i]
formula <- as.formula(paste("diagnosis ~ texture_mean + concavity_mean +", var))
model_list[[paste0("Model_", i + 1)]] <- glm(formula, data = val_data, family = binomial)
}
for (i in seq_along(model_list)) {
model_name <- names(model_list)[i]
model <- model_list[[i]]
val_data[[paste0("predicted_", model_name)]] <- predict(model, newdata = val_data, type = "response")
predicted_class <- factor(ifelse(val_data[[paste0("predicted_", model_name)]] > 0.5, "M", "B"),
levels = levels(val_data$diagnosis))
cm <- confusionMatrix(predicted_class, val_data$diagnosis)
acc <- cm$overall["Accuracy"]
auc_val <- auc(val_data$diagnosis, val_data[[paste0("predicted_", model_name)]])
results_df <- rbind(results_df, data.frame(Model = model_name, AIC = AIC(model),
Deviance = model$deviance, Accuracy = round(acc, 4), AUC = round(auc_val, 4)))
}
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
results_df
## Model AIC Deviance Accuracy AUC
## Accuracy Model_1 96.23843 90.23843 0.8407 0.9209
## Accuracy1 Model_2 31.59856 23.59856 0.9558 0.9920
## Accuracy2 Model_3 96.80707 88.80707 0.8496 0.9188
## Accuracy3 Model_4 31.02575 23.02575 0.9646 0.9933
## Accuracy4 Model_5 98.09865 90.09865 0.8319 0.9165
## Accuracy5 Model_6 96.61776 88.61776 0.8496 0.9249
Model Comparison Plot
results_df$Model <- as.character(1:nrow(results_df))
results_long <- pivot_longer(results_df, cols = c(Accuracy, AUC, AIC), names_to = "Metric", values_to = "Value")
ggplot(results_long, aes(x = Model, y = Value, fill = Metric)) +
geom_col(position = "dodge") +
facet_wrap(~ Metric, scales = "free_y") +
labs(title = "Model Comparison: Accuracy, AUC, and AIC",
x = "Model", y = "Metric Value") +
scale_fill_brewer(palette = "Set2") +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Final Model on Test Set
final_model <- glm(diagnosis ~ texture_mean + concavity_mean + radius_mean, data = val_data, family = binomial)
# Predictions
test_data$predicted_prob <- predict(final_model, newdata = test_data, type = "response")
test_data$predicted_class <- factor(ifelse(test_data$predicted_prob > 0.5, "M", "B"), levels = levels(test_data$diagnosis))
confusionMatrix(test_data$predicted_class, test_data$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 66 6
## M 5 36
##
## Accuracy : 0.9027
## 95% CI : (0.8325, 0.9504)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : 3.429e-11
##
## Kappa : 0.7906
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9296
## Specificity : 0.8571
## Pos Pred Value : 0.9167
## Neg Pred Value : 0.8780
## Prevalence : 0.6283
## Detection Rate : 0.5841
## Detection Prevalence : 0.6372
## Balanced Accuracy : 0.8934
##
## 'Positive' Class : B
##
roc_final <- roc(test_data$diagnosis, test_data$predicted_prob, levels = c("B", "M"))
## Setting direction: controls < cases
plot.roc(roc_final, main = "Final Model: ROC Curve (Test Set)")

auc(roc_final)
## Area under the curve: 0.9567
Effect Visualization of Texture Mean
texture_seq <- seq(min(test_data$texture_mean), max(test_data$texture_mean), length.out = 100)
new_data <- data.frame(
texture_mean = texture_seq,
concavity_mean = mean(test_data$concavity_mean),
radius_mean = mean(test_data$radius_mean)
)
new_data$predicted_prob <- predict(final_model, newdata = new_data, type = "response")
ggplot(new_data, aes(x = texture_mean, y = predicted_prob)) +
geom_line(color = "#0073C2FF", size = 1.2) +
labs(
title = "Effect of Texture Mean on Probability of Malignancy",
x = "Texture Mean",
y = "Predicted Probability"
) +
annotate("text", x = max(new_data$texture_mean) - 1, y = 0.05, hjust = 1, vjust = 0, size = 4.5,
label = paste(
"logit(p) = -35.05 + 0.404*texture_mean\n",
" + 29.67*concavity_mean\n",
" + 1.62*radius_mean"
)) +
theme_minimal(base_size = 14)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

R Markdown