Introduction

This document presents an analysis of medical diagnosis data.
The goal is to identify key predictors influencing disease outcomes and evaluate the diagnostic accuracy of different clinical variables.


Setup and Data Import


Data Cleaning and Preprocessing

# Work on a copy
data <- data_orig

# Make column names consistent
data <- janitor::clean_names(data)

# Remove exact duplicates (rows)
dedup_before <- nrow(data)
data <- distinct(data)
dedup_after <- nrow(data)
cat('Removed', dedup_before - dedup_after, 'duplicate rows\n')
## Removed 0 duplicate rows
# Identify variable types
num_vars <- names(select_if(data, is.numeric))
cat_vars <- names(select_if(data, ~!is.numeric(.)))

# Missingness overview
summary(data)
##  cancer_site_grouping calendar_period_of_diagnosis     sex           
##  Length:36            Length:36                    Length:36         
##  Class :character     Class :character             Class :character  
##  Mode  :character     Mode  :character             Mode  :character  
##                                                                      
##                                                                      
##                                                                      
##   age_group            method          time_since_diagnosis cases_analysed
##  Length:36          Length:36          Length:36            Min.   :2535  
##  Class :character   Class :character   Class :character     1st Qu.:3522  
##  Mode  :character   Mode  :character   Mode  :character     Median :3710  
##                                                             Mean   :4685  
##                                                             3rd Qu.:6797  
##                                                             Max.   :7730  
##  deaths_up_to_timepoint censored_up_to_timepoint
##  Min.   : 899           Min.   : 0.000          
##  1st Qu.:1034           1st Qu.: 0.000          
##  Median :1098           Median : 1.000          
##  Mean   :1403           Mean   : 4.056          
##  3rd Qu.:2036           3rd Qu.: 1.250          
##  Max.   :2264           Max.   :56.000          
##  patients_remaining_at_risk_at_t overall_survival_percent
##  Min.   :1480                    Min.   :58.40           
##  1st Qu.:2493                    1st Qu.:69.12           
##  Median :2654                    Median :70.40           
##  Mean   :3278                    Mean   :69.80           
##  3rd Qu.:4719                    3rd Qu.:71.60           
##  Max.   :5635                    Max.   :73.70           
##  se_for_overall_survival lower_95_percent_ci_for_overall_survival
##  Min.   :0.700           Min.   :56.50                           
##  1st Qu.:0.800           1st Qu.:67.92                           
##  Median :1.000           Median :69.20                           
##  Mean   :1.008           Mean   :68.44                           
##  3rd Qu.:1.100           3rd Qu.:70.38                           
##  Max.   :1.700           Max.   :72.30                           
##  upper_95_percent_ci_for_overall_survival
##  Min.   :60.30                           
##  1st Qu.:70.38                           
##  Median :71.75                           
##  Mean   :71.19                           
##  3rd Qu.:72.72                           
##  Max.   :75.10
miss_summary <- naniar::miss_var_summary(data)
miss_summary
## # A tibble: 14 × 3
##    variable                                 n_miss pct_miss
##    <chr>                                     <int>    <num>
##  1 cancer_site_grouping                          0        0
##  2 calendar_period_of_diagnosis                  0        0
##  3 sex                                           0        0
##  4 age_group                                     0        0
##  5 method                                        0        0
##  6 time_since_diagnosis                          0        0
##  7 cases_analysed                                0        0
##  8 deaths_up_to_timepoint                        0        0
##  9 censored_up_to_timepoint                      0        0
## 10 patients_remaining_at_risk_at_t               0        0
## 11 overall_survival_percent                      0        0
## 12 se_for_overall_survival                       0        0
## 13 lower_95_percent_ci_for_overall_survival      0        0
## 14 upper_95_percent_ci_for_overall_survival      0        0
# Simple missing-value treatment (demonstration):
# - Numeric: median imputation
# - Character/factor: mode imputation

mode_fun <- function(x){
  ux <- unique(x[!is.na(x)])
  ux[which.max(tabulate(match(x, ux)))]
}

for (v in num_vars){
  if (any(is.na(data[[v]]))){
    data[[v]][is.na(data[[v]])] <- median(data[[v]], na.rm = TRUE)
  }
}

for (v in cat_vars){
  if (any(is.na(data[[v]]))){
    if (is.character(data[[v]])) data[[v]] <- as.factor(data[[v]])
    data[[v]][is.na(data[[v]])] <- mode_fun(data[[v]])
  }
}

# Convert character columns to factors
data <- data %>% mutate(across(where(is.character), as.factor))

# Attempt to detect an outcome variable automatically:
possible_outcomes <- names(data)[tolower(names(data)) %in% c('diagnosis','outcome','status','disease','target','label')]

# Fallback: find any binary variable (two unique values)
if(length(possible_outcomes)==0){
  bin_vars <- sapply(data, function(x) length(unique(na.omit(x)))==2)
  possible_outcomes <- names(data)[bin_vars]
}

cat('Candidate outcome variables:', paste(possible_outcomes, collapse=', '), '\n')
## Candidate outcome variables:
# Choose first candidate as outcome (user can change later)
outcome_var <- if(length(possible_outcomes)>0) possible_outcomes[1] else NULL

if(!is.null(outcome_var)){
  cat('Using', outcome_var, 'as outcome variable.\n')
  data[[outcome_var]] <- as.factor(data[[outcome_var]])
} else {
  cat('No clear binary outcome detected. You may need to define one manually.\n')
}
## No clear binary outcome detected. You may need to define one manually.
# Brief cleaned data glimpse
glimpse(data)
## Rows: 36
## Columns: 14
## $ cancer_site_grouping                     <fct> All malignant neoplasms exclu…
## $ calendar_period_of_diagnosis             <fct> Jan-Mar 2018, Apr-Jun 2018, J…
## $ sex                                      <fct> Males, Males, Males, Males, M…
## $ age_group                                <fct> 15-99, 15-99, 15-99, 15-99, 1…
## $ method                                   <fct> Non-standardised, Non-standar…
## $ time_since_diagnosis                     <fct> 1 year, 1 year, 1 year, 1 yea…
## $ cases_analysed                           <dbl> 3544, 3853, 3583, 3685, 3646,…
## $ deaths_up_to_timepoint                   <dbl> 1050, 1070, 1090, 1103, 1096,…
## $ censored_up_to_timepoint                 <dbl> 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,…
## $ patients_remaining_at_risk_at_t          <dbl> 2493, 2782, 2492, 2581, 2550,…
## $ overall_survival_percent                 <dbl> 70.4, 72.2, 69.6, 70.1, 69.9,…
## $ se_for_overall_survival                  <dbl> 1.1, 1.0, 1.1, 1.1, 1.1, 1.1,…
## $ lower_95_percent_ci_for_overall_survival <dbl> 68.9, 70.8, 68.1, 68.6, 68.5,…
## $ upper_95_percent_ci_for_overall_survival <dbl> 71.9, 73.7, 71.1, 71.6, 71.4,…

Exploratory Data Analysis (EDA)

# ---------------------------------------------

# Enhanced and Colorful Visualizations for EDA

# ---------------------------------------------

library(RColorBrewer)
library(viridis)
library(ggthemes)

# Choose vibrant color palettes

hist_fill <- "steelblue"
bar_palette <- scale_fill_brewer(palette = "Set2")
corr_colors <- colorRampPalette(c("#2E8B57", "#FFFFFF", "#8B0000"))(200)

# 1. Numeric distribution plots

if(length(num_vars) > 0){
for (v in head(num_vars, 6)){
print(
ggplot(data, aes_string(x = v)) +
geom_histogram(bins = 30, fill = hist_fill, color = "white", alpha = 0.8) +
ggtitle(paste("Distribution of", v)) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(color = "#2C3E50", face = "bold", hjust = 0.5),
panel.grid.minor = element_blank()
)
)
}
}

# 2. Barplots for categorical variables

if(length(cat_vars) > 0){
for (v in head(cat_vars, 6)){
print(
ggplot(data, aes_string(x = v, fill = v)) +
geom_bar(alpha = 0.9) +
bar_palette +
ggtitle(paste("Counts of", v)) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(color = "#34495E", face = "bold", hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1)
)
)
}
}

# 3. Correlation matrix (colorful heatmap)

if(length(num_vars) > 1){
numdat <- select(data, all_of(num_vars))
corr <- cor(numdat, use = 'pairwise.complete.obs')
corrplot::corrplot(
corr,
method = "color",
tl.cex = 0.8,
col = corr_colors,
addCoef.col = "black",
number.cex = 0.6,
title = "\nCorrelation Matrix of Numeric Variables"
)
}

# 4. Pairwise scatter plots (colored by first categorical var if exists)

if(length(num_vars) >= 3){
if(length(cat_vars) >= 1){
print(GGally::ggpairs(select(data, head(num_vars, 4), all_of(cat_vars[1])), aes(color = !!sym(cat_vars[1]), alpha = 0.7)) +
ggtitle("Pairwise Relationships Between Numeric Variables"))
} else {
print(GGally::ggpairs(select(data, head(num_vars, 4))) +
ggtitle("Pairwise Relationships Between Numeric Variables"))
}
}


Feature Engineering

# Example: create age groups if age exists
if('age' %in% tolower(names(data))){
  age_col <- names(data)[tolower(names(data))=='age'][1]
  data <- data %>% mutate(age_group = cut(.data[[age_col]], breaks = c(-Inf,30,50,65,Inf), labels = c('<=30','31-50','51-65','65+')))
}

# Example: scale numeric predictors (for algorithms that need it)
scale_vars <- num_vars
if(length(scale_vars)>0){
  data <- data %>% mutate(across(all_of(scale_vars), scale))
}

# Create final modeling dataset: remove ID-like columns
id_like <- names(data)[grepl('id|record|case', tolower(names(data)))]
model_data <- data %>% select(-any_of(id_like))

# Remove near-zero variance predictors
nzv <- caret::nearZeroVar(model_data, saveMetrics = TRUE)
if(any(nzv$nzv)){
  model_data <- model_data %>% select(-which(nzv$nzv))
}

glimpse(model_data)
## Rows: 36
## Columns: 9
## $ calendar_period_of_diagnosis             <fct> Jan-Mar 2018, Apr-Jun 2018, J…
## $ sex                                      <fct> Males, Males, Males, Males, M…
## $ deaths_up_to_timepoint                   <dbl[,1]> <matrix[26 x 1]>
## $ censored_up_to_timepoint                 <dbl[,1]> <matrix[26 x 1]>
## $ patients_remaining_at_risk_at_t          <dbl[,1]> <matrix[26 x 1]>
## $ overall_survival_percent                 <dbl[,1]> <matrix[26 x 1]>
## $ se_for_overall_survival                  <dbl[,1]> <matrix[26 x 1]>
## $ lower_95_percent_ci_for_overall_survival <dbl[,1]> <matrix[26 x 1]>
## $ upper_95_percent_ci_for_overall_survival <dbl[,1]> <matrix[26 x 1]>

Modeling: Classification (binary outcome)

if(!is.null(outcome_var) && length(unique(model_data[[outcome_var]]))==2){
  set.seed(123)
  # Partition
  train_index <- createDataPartition(model_data[[outcome_var]], p = 0.75, list = FALSE)
  train <- model_data[train_index,]
  test  <- model_data[-train_index,]

  # Train logistic regression
  formula <- as.formula(paste(outcome_var, '~ .'))
  glm_fit <- glm(formula, data = train, family = binomial)
  summary(glm_fit)

  # Predictions and performance on test
  pred_prob <- predict(glm_fit, newdata = test, type = 'response')
  # Choose 0.5 threshold
  pred_class <- factor(ifelse(pred_prob >= 0.5, levels(train[[outcome_var]])[2], levels(train[[outcome_var]])[1]), levels = levels(train[[outcome_var]]))

  conf_mat <- caret::confusionMatrix(pred_class, test[[outcome_var]])
  print(conf_mat)

  # ROC and AUC
  roc_obj <- pROC::roc(as.numeric(test[[outcome_var]]), as.numeric(pred_prob))
  print(paste('AUC (logistic):', round(pROC::auc(roc_obj),3)))
  plot(roc_obj, main = 'ROC - Logistic Regression')

  # Random forest for comparison
  rf_fit <- randomForest::randomForest(formula, data = train, ntree = 200, importance = TRUE)
  rf_pred_prob <- predict(rf_fit, newdata = test, type = 'prob')[,2]
  rf_pred_class <- predict(rf_fit, newdata = test, type = 'response')
  conf_rf <- caret::confusionMatrix(rf_pred_class, test[[outcome_var]])
  print(conf_rf)
  rf_roc <- pROC::roc(as.numeric(test[[outcome_var]]), as.numeric(rf_pred_prob))
  print(paste('AUC (RF):', round(pROC::auc(rf_roc),3)))
  plot(rf_roc, main = 'ROC - Random Forest')

  # Variable importance
  varImpPlot(rf_fit, main = 'Random Forest Variable Importance')

} else {
  cat('Binary outcome not available or not detected. Modeling section skipped.\n')
}
## Binary outcome not available or not detected. Modeling section skipped.

Results Summary and Recommendations

# Summarise key model metrics if available
if(exists('conf_mat') & exists('conf_rf')){
  metrics <- list(
    logistic = list(confusion = conf_mat$table, accuracy = conf_mat$overall['Accuracy'], auc = pROC::auc(roc_obj)),
    random_forest = list(confusion = conf_rf$table, accuracy = conf_rf$overall['Accuracy'], auc = pROC::auc(rf_roc))
  )
  print(metrics)
}

Save outputs and session info

# Save cleaned data and models (optional)
saveRDS(data, file = 'cleaned_data.rds')
if(exists('glm_fit')) saveRDS(glm_fit, file = 'glm_fit.rds')
if(exists('rf_fit')) saveRDS(rf_fit, file = 'rf_fit.rds')

# Session info for reproducibility
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Kolkata
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] viridis_0.6.5        viridisLite_0.4.2    RColorBrewer_1.1-3  
##  [4] ggthemes_5.1.0       forcats_1.0.0        GGally_2.4.0        
##  [7] naniar_1.1.0         janitor_2.2.1        pROC_1.19.0.1       
## [10] randomForest_4.7-1.2 caret_7.0-1          lattice_0.22-7      
## [13] corrplot_0.95        ggplot2_4.0.0        dplyr_1.1.4         
## [16] readr_2.1.5         
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4051.111    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        digest_0.6.37       
##  [7] rpart_4.1.24         timechange_0.3.0     lifecycle_1.0.4     
## [10] survival_3.8-3       magrittr_2.0.3       compiler_4.5.1      
## [13] rlang_1.1.6          sass_0.4.10          tools_4.5.1         
## [16] utf8_1.2.6           yaml_2.3.10          data.table_1.17.8   
## [19] knitr_1.50           labeling_0.4.3       bit_4.6.0           
## [22] plyr_1.8.9           withr_3.0.2          purrr_1.1.0         
## [25] nnet_7.3-20          grid_4.5.1           stats4_4.5.1        
## [28] future_1.67.0        globals_0.18.0       scales_1.4.0        
## [31] iterators_1.0.14     MASS_7.3-65          cli_3.6.5           
## [34] crayon_1.5.3         rmarkdown_2.29       generics_0.1.4      
## [37] rstudioapi_0.17.1    future.apply_1.20.0  reshape2_1.4.4      
## [40] tzdb_0.5.0           cachem_1.1.0         stringr_1.5.1       
## [43] splines_4.5.1        parallel_4.5.1       vctrs_0.6.5         
## [46] hardhat_1.4.2        Matrix_1.7-3         jsonlite_2.0.0      
## [49] hms_1.1.3            bit64_4.6.0-1        visdat_0.6.0        
## [52] listenv_0.9.1        foreach_1.5.2        gower_1.0.2         
## [55] jquerylib_0.1.4      tidyr_1.3.1          recipes_1.3.1       
## [58] glue_1.8.0           parallelly_1.45.1    ggstats_0.11.0      
## [61] codetools_0.2-20     lubridate_1.9.4      stringi_1.8.7       
## [64] gtable_0.3.6         tibble_3.3.0         pillar_1.11.0       
## [67] htmltools_0.5.8.1    ipred_0.9-15         lava_1.8.2          
## [70] R6_2.6.1             vroom_1.6.5          evaluate_1.0.4      
## [73] snakecase_0.11.1     bslib_0.9.0          class_7.3-23        
## [76] Rcpp_1.1.0           gridExtra_2.3        nlme_3.1-168        
## [79] prodlim_2025.04.28   xfun_0.54            ModelMetrics_1.2.2.2
## [82] pkgconfig_2.0.3