Load and Split Data

Cancer <- read.csv("C:/Users/lukew/Downloads/Cancer_Data.csv")
Cancer$diagnosis <- factor(Cancer$diagnosis)
cancer <- Cancer[, 2:12]
names(cancer) <- gsub(" ", "_", names(cancer))

set.seed(123)
train_index <- createDataPartition(cancer$diagnosis, p = 0.6, list = FALSE)
train_data <- cancer[train_index, ]
temp_data <- cancer[-train_index, ]
val_index <- createDataPartition(temp_data$diagnosis, p = 0.5, list = FALSE)
val_data <- temp_data[val_index, ]
test_data <- temp_data[-val_index, ]

Exploratory Visualization

ggplot(cancer, aes(x = diagnosis, y = after_stat(count)/nrow(cancer), fill = diagnosis)) +
  geom_bar() +
  xlab('Diagnosis') +
  ylab('Percent of Participants') +
  theme_classic()

ggbetweenstats Boxplots for All Predictors

dynamic_ggbetweenstats <- function(data, x, y_var) {
  ggbetweenstats(
    data = data,
    x = {{ x }},
    y = !!sym(y_var),
    type = "parametric",
    messages = FALSE,
    results.subtitle = TRUE,
    title = paste("Distribution of", y_var, "by Diagnosis"),
    xlab = "Diagnosis",
    ylab = y_var,
    mean.ci = TRUE
  )
}
predictor_vars <- names(cancer)[names(cancer) != "diagnosis"]
lapply(predictor_vars, function(var) print(dynamic_ggbetweenstats(cancer, x = diagnosis, y_var = var)))

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Initial Model on Training Data

glm_train <- glm(diagnosis ~ ., data = train_data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm_train)
## 
## Call:
## glm(formula = diagnosis ~ ., family = binomial, data = train_data)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -19.55400   19.41254  -1.007   0.3138    
## radius_mean             -5.98061    6.34918  -0.942   0.3462    
## texture_mean             0.43264    0.09693   4.463 8.07e-06 ***
## perimeter_mean           0.67384    0.85582   0.787   0.4311    
## area_mean                0.03366    0.02416   1.394   0.1635    
## smoothness_mean         88.11443   51.69149   1.705   0.0883 .  
## compactness_mean       -35.35619   27.75297  -1.274   0.2027    
## concavity_mean          40.42102   18.08332   2.235   0.0254 *  
## concave.points_mean     15.52588   42.02118   0.369   0.7118    
## symmetry_mean            4.69433   16.64788   0.282   0.7780    
## fractal_dimension_mean  43.60160  123.75809   0.352   0.7246    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 453.189  on 342  degrees of freedom
## Residual deviance:  81.851  on 332  degrees of freedom
## AIC: 103.85
## 
## Number of Fisher Scoring iterations: 9
confint(glm_train)
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                                2.5 %       97.5 %
## (Intercept)             -60.21094720  16.87844859
## radius_mean             -19.35544410   5.56057133
## texture_mean              0.26161476   0.64733334
## perimeter_mean           -0.91562498   2.45420077
## area_mean                -0.01525731   0.08134466
## smoothness_mean         -11.05561868 193.60041812
## compactness_mean        -92.65397864  18.23820199
## concavity_mean            6.64741182  78.35162994
## concave.points_mean     -66.46845268  99.82932277
## symmetry_mean           -27.88784296  38.32262580
## fractal_dimension_mean -199.65300237 293.17078709

Reduced Model on Validation Set

glm_valid_refined <- glm(diagnosis ~ texture_mean + concavity_mean, 
                         data = val_data, family = binomial)
summary(glm_valid_refined)
## 
## Call:
## glm(formula = diagnosis ~ texture_mean + concavity_mean, family = binomial, 
##     data = val_data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -6.79432    1.62437  -4.183 2.88e-05 ***
## texture_mean    0.19993    0.07436   2.688  0.00718 ** 
## concavity_mean 23.57304    4.53201   5.201 1.98e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 149.125  on 112  degrees of freedom
## Residual deviance:  90.238  on 110  degrees of freedom
## AIC: 96.238
## 
## Number of Fisher Scoring iterations: 5
confint(glm_valid_refined)
## Waiting for profiling to be done...
##                       2.5 %    97.5 %
## (Intercept)    -10.26788737 -3.830256
## texture_mean     0.05855395  0.354101
## concavity_mean  15.50635993 33.433742
val_data$predicted_prob_refined <- predict(glm_valid_refined, newdata = val_data, type = "response")
val_data$predicted_class_refined <- factor(ifelse(val_data$predicted_prob_refined > 0.5, "M", "B"),
                                           levels = levels(val_data$diagnosis))
confusionMatrix(val_data$predicted_class_refined, val_data$diagnosis)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 63 10
##          M  8 32
##                                         
##                Accuracy : 0.8407        
##                  95% CI : (0.76, 0.9028)
##     No Information Rate : 0.6283        
##     P-Value [Acc > NIR] : 6.001e-07     
##                                         
##                   Kappa : 0.6556        
##                                         
##  Mcnemar's Test P-Value : 0.8137        
##                                         
##             Sensitivity : 0.8873        
##             Specificity : 0.7619        
##          Pos Pred Value : 0.8630        
##          Neg Pred Value : 0.8000        
##              Prevalence : 0.6283        
##          Detection Rate : 0.5575        
##    Detection Prevalence : 0.6460        
##       Balanced Accuracy : 0.8246        
##                                         
##        'Positive' Class : B             
## 

Model Comparison

base_formula <- diagnosis ~ texture_mean + concavity_mean
additional_vars <- c("area_mean", "smoothness_mean", "radius_mean", "compactness_mean", "symmetry_mean")
model_list <- list(Model_1 = glm(base_formula, data = val_data, family = binomial))
results_df <- data.frame()

for (i in seq_along(additional_vars)) {
  var <- additional_vars[i]
  formula <- as.formula(paste("diagnosis ~ texture_mean + concavity_mean +", var))
  model_list[[paste0("Model_", i + 1)]] <- glm(formula, data = val_data, family = binomial)
}

for (i in seq_along(model_list)) {
  model_name <- names(model_list)[i]
  model <- model_list[[i]]
  val_data[[paste0("predicted_", model_name)]] <- predict(model, newdata = val_data, type = "response")
  predicted_class <- factor(ifelse(val_data[[paste0("predicted_", model_name)]] > 0.5, "M", "B"),
                            levels = levels(val_data$diagnosis))
  cm <- confusionMatrix(predicted_class, val_data$diagnosis)
  acc <- cm$overall["Accuracy"]
  auc_val <- auc(val_data$diagnosis, val_data[[paste0("predicted_", model_name)]])
  results_df <- rbind(results_df, data.frame(Model = model_name, AIC = AIC(model),
                                             Deviance = model$deviance, Accuracy = round(acc, 4), AUC = round(auc_val, 4)))
}
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## Setting levels: control = B, case = M
## Setting direction: controls < cases
results_df
##             Model      AIC Deviance Accuracy    AUC
## Accuracy  Model_1 96.23843 90.23843   0.8407 0.9209
## Accuracy1 Model_2 31.59856 23.59856   0.9558 0.9920
## Accuracy2 Model_3 96.80707 88.80707   0.8496 0.9188
## Accuracy3 Model_4 31.02575 23.02575   0.9646 0.9933
## Accuracy4 Model_5 98.09865 90.09865   0.8319 0.9165
## Accuracy5 Model_6 96.61776 88.61776   0.8496 0.9249

Model Comparison Plot

results_df$Model <- as.character(1:nrow(results_df))
results_long <- pivot_longer(results_df, cols = c(Accuracy, AUC, AIC), names_to = "Metric", values_to = "Value")

ggplot(results_long, aes(x = Model, y = Value, fill = Metric)) +
  geom_col(position = "dodge") +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(title = "Model Comparison: Accuracy, AUC, and AIC",
       x = "Model", y = "Metric Value") +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Final Model on Test Set

final_model <- glm(diagnosis ~ texture_mean + concavity_mean + radius_mean, data = val_data, family = binomial)

# Predictions
test_data$predicted_prob <- predict(final_model, newdata = test_data, type = "response")
test_data$predicted_class <- factor(ifelse(test_data$predicted_prob > 0.5, "M", "B"), levels = levels(test_data$diagnosis))
confusionMatrix(test_data$predicted_class, test_data$diagnosis)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 66  6
##          M  5 36
##                                           
##                Accuracy : 0.9027          
##                  95% CI : (0.8325, 0.9504)
##     No Information Rate : 0.6283          
##     P-Value [Acc > NIR] : 3.429e-11       
##                                           
##                   Kappa : 0.7906          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9296          
##             Specificity : 0.8571          
##          Pos Pred Value : 0.9167          
##          Neg Pred Value : 0.8780          
##              Prevalence : 0.6283          
##          Detection Rate : 0.5841          
##    Detection Prevalence : 0.6372          
##       Balanced Accuracy : 0.8934          
##                                           
##        'Positive' Class : B               
## 
roc_final <- roc(test_data$diagnosis, test_data$predicted_prob, levels = c("B", "M"))
## Setting direction: controls < cases
plot.roc(roc_final, main = "Final Model: ROC Curve (Test Set)")

auc(roc_final)
## Area under the curve: 0.9567

Effect Visualization of Texture Mean

texture_seq <- seq(min(test_data$texture_mean), max(test_data$texture_mean), length.out = 100)
new_data <- data.frame(
  texture_mean = texture_seq,
  concavity_mean = mean(test_data$concavity_mean),
  radius_mean = mean(test_data$radius_mean)
)
new_data$predicted_prob <- predict(final_model, newdata = new_data, type = "response")

ggplot(new_data, aes(x = texture_mean, y = predicted_prob)) +
  geom_line(color = "#0073C2FF", size = 1.2) +
  labs(
    title = "Effect of Texture Mean on Probability of Malignancy",
    x = "Texture Mean",
    y = "Predicted Probability"
  ) +
  annotate("text", x = max(new_data$texture_mean) - 1, y = 0.05, hjust = 1, vjust = 0, size = 4.5,
           label = paste(
             "logit(p) = -35.05 + 0.404*texture_mean\n",
             "          + 29.67*concavity_mean\n",
             "          + 1.62*radius_mean"
           )) +
  theme_minimal(base_size = 14)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

R Markdown