This document presents an analysis of medical diagnosis data.
The goal is to identify key predictors influencing disease
outcomes and evaluate the diagnostic accuracy
of different clinical variables.
# Work on a copy
data <- data_orig
# Make column names consistent
data <- janitor::clean_names(data)
# Remove exact duplicates (rows)
dedup_before <- nrow(data)
data <- distinct(data)
dedup_after <- nrow(data)
cat('Removed', dedup_before - dedup_after, 'duplicate rows\n')
## Removed 0 duplicate rows
# Identify variable types
num_vars <- names(select_if(data, is.numeric))
cat_vars <- names(select_if(data, ~!is.numeric(.)))
# Missingness overview
summary(data)
##  cancer_site_grouping calendar_period_of_diagnosis     sex           
##  Length:36            Length:36                    Length:36         
##  Class :character     Class :character             Class :character  
##  Mode  :character     Mode  :character             Mode  :character  
##                                                                      
##                                                                      
##                                                                      
##   age_group            method          time_since_diagnosis cases_analysed
##  Length:36          Length:36          Length:36            Min.   :2535  
##  Class :character   Class :character   Class :character     1st Qu.:3522  
##  Mode  :character   Mode  :character   Mode  :character     Median :3710  
##                                                             Mean   :4685  
##                                                             3rd Qu.:6797  
##                                                             Max.   :7730  
##  deaths_up_to_timepoint censored_up_to_timepoint
##  Min.   : 899           Min.   : 0.000          
##  1st Qu.:1034           1st Qu.: 0.000          
##  Median :1098           Median : 1.000          
##  Mean   :1403           Mean   : 4.056          
##  3rd Qu.:2036           3rd Qu.: 1.250          
##  Max.   :2264           Max.   :56.000          
##  patients_remaining_at_risk_at_t overall_survival_percent
##  Min.   :1480                    Min.   :58.40           
##  1st Qu.:2493                    1st Qu.:69.12           
##  Median :2654                    Median :70.40           
##  Mean   :3278                    Mean   :69.80           
##  3rd Qu.:4719                    3rd Qu.:71.60           
##  Max.   :5635                    Max.   :73.70           
##  se_for_overall_survival lower_95_percent_ci_for_overall_survival
##  Min.   :0.700           Min.   :56.50                           
##  1st Qu.:0.800           1st Qu.:67.92                           
##  Median :1.000           Median :69.20                           
##  Mean   :1.008           Mean   :68.44                           
##  3rd Qu.:1.100           3rd Qu.:70.38                           
##  Max.   :1.700           Max.   :72.30                           
##  upper_95_percent_ci_for_overall_survival
##  Min.   :60.30                           
##  1st Qu.:70.38                           
##  Median :71.75                           
##  Mean   :71.19                           
##  3rd Qu.:72.72                           
##  Max.   :75.10
miss_summary <- naniar::miss_var_summary(data)
miss_summary
## # A tibble: 14 × 3
##    variable                                 n_miss pct_miss
##    <chr>                                     <int>    <num>
##  1 cancer_site_grouping                          0        0
##  2 calendar_period_of_diagnosis                  0        0
##  3 sex                                           0        0
##  4 age_group                                     0        0
##  5 method                                        0        0
##  6 time_since_diagnosis                          0        0
##  7 cases_analysed                                0        0
##  8 deaths_up_to_timepoint                        0        0
##  9 censored_up_to_timepoint                      0        0
## 10 patients_remaining_at_risk_at_t               0        0
## 11 overall_survival_percent                      0        0
## 12 se_for_overall_survival                       0        0
## 13 lower_95_percent_ci_for_overall_survival      0        0
## 14 upper_95_percent_ci_for_overall_survival      0        0
# Simple missing-value treatment (demonstration):
# - Numeric: median imputation
# - Character/factor: mode imputation
mode_fun <- function(x){
  ux <- unique(x[!is.na(x)])
  ux[which.max(tabulate(match(x, ux)))]
}
for (v in num_vars){
  if (any(is.na(data[[v]]))){
    data[[v]][is.na(data[[v]])] <- median(data[[v]], na.rm = TRUE)
  }
}
for (v in cat_vars){
  if (any(is.na(data[[v]]))){
    if (is.character(data[[v]])) data[[v]] <- as.factor(data[[v]])
    data[[v]][is.na(data[[v]])] <- mode_fun(data[[v]])
  }
}
# Convert character columns to factors
data <- data %>% mutate(across(where(is.character), as.factor))
# Attempt to detect an outcome variable automatically:
possible_outcomes <- names(data)[tolower(names(data)) %in% c('diagnosis','outcome','status','disease','target','label')]
# Fallback: find any binary variable (two unique values)
if(length(possible_outcomes)==0){
  bin_vars <- sapply(data, function(x) length(unique(na.omit(x)))==2)
  possible_outcomes <- names(data)[bin_vars]
}
cat('Candidate outcome variables:', paste(possible_outcomes, collapse=', '), '\n')
## Candidate outcome variables:
# Choose first candidate as outcome (user can change later)
outcome_var <- if(length(possible_outcomes)>0) possible_outcomes[1] else NULL
if(!is.null(outcome_var)){
  cat('Using', outcome_var, 'as outcome variable.\n')
  data[[outcome_var]] <- as.factor(data[[outcome_var]])
} else {
  cat('No clear binary outcome detected. You may need to define one manually.\n')
}
## No clear binary outcome detected. You may need to define one manually.
# Brief cleaned data glimpse
glimpse(data)
## Rows: 36
## Columns: 14
## $ cancer_site_grouping                     <fct> All malignant neoplasms exclu…
## $ calendar_period_of_diagnosis             <fct> Jan-Mar 2018, Apr-Jun 2018, J…
## $ sex                                      <fct> Males, Males, Males, Males, M…
## $ age_group                                <fct> 15-99, 15-99, 15-99, 15-99, 1…
## $ method                                   <fct> Non-standardised, Non-standar…
## $ time_since_diagnosis                     <fct> 1 year, 1 year, 1 year, 1 yea…
## $ cases_analysed                           <dbl> 3544, 3853, 3583, 3685, 3646,…
## $ deaths_up_to_timepoint                   <dbl> 1050, 1070, 1090, 1103, 1096,…
## $ censored_up_to_timepoint                 <dbl> 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,…
## $ patients_remaining_at_risk_at_t          <dbl> 2493, 2782, 2492, 2581, 2550,…
## $ overall_survival_percent                 <dbl> 70.4, 72.2, 69.6, 70.1, 69.9,…
## $ se_for_overall_survival                  <dbl> 1.1, 1.0, 1.1, 1.1, 1.1, 1.1,…
## $ lower_95_percent_ci_for_overall_survival <dbl> 68.9, 70.8, 68.1, 68.6, 68.5,…
## $ upper_95_percent_ci_for_overall_survival <dbl> 71.9, 73.7, 71.1, 71.6, 71.4,…
# ---------------------------------------------
# Enhanced and Colorful Visualizations for EDA
# ---------------------------------------------
library(RColorBrewer)
library(viridis)
library(ggthemes)
# Choose vibrant color palettes
hist_fill <- "steelblue"
bar_palette <- scale_fill_brewer(palette = "Set2")
corr_colors <- colorRampPalette(c("#2E8B57", "#FFFFFF", "#8B0000"))(200)
# 1. Numeric distribution plots
if(length(num_vars) > 0){
for (v in head(num_vars, 6)){
print(
ggplot(data, aes_string(x = v)) +
geom_histogram(bins = 30, fill = hist_fill, color = "white", alpha = 0.8) +
ggtitle(paste("Distribution of", v)) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(color = "#2C3E50", face = "bold", hjust = 0.5),
panel.grid.minor = element_blank()
)
)
}
}
# 2. Barplots for categorical variables
if(length(cat_vars) > 0){
for (v in head(cat_vars, 6)){
print(
ggplot(data, aes_string(x = v, fill = v)) +
geom_bar(alpha = 0.9) +
bar_palette +
ggtitle(paste("Counts of", v)) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(color = "#34495E", face = "bold", hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1)
)
)
}
}
# 3. Correlation matrix (colorful heatmap)
if(length(num_vars) > 1){
numdat <- select(data, all_of(num_vars))
corr <- cor(numdat, use = 'pairwise.complete.obs')
corrplot::corrplot(
corr,
method = "color",
tl.cex = 0.8,
col = corr_colors,
addCoef.col = "black",
number.cex = 0.6,
title = "\nCorrelation Matrix of Numeric Variables"
)
}
# 4. Pairwise scatter plots (colored by first categorical var if exists)
if(length(num_vars) >= 3){
if(length(cat_vars) >= 1){
print(GGally::ggpairs(select(data, head(num_vars, 4), all_of(cat_vars[1])), aes(color = !!sym(cat_vars[1]), alpha = 0.7)) +
ggtitle("Pairwise Relationships Between Numeric Variables"))
} else {
print(GGally::ggpairs(select(data, head(num_vars, 4))) +
ggtitle("Pairwise Relationships Between Numeric Variables"))
}
}
# Example: create age groups if age exists
if('age' %in% tolower(names(data))){
  age_col <- names(data)[tolower(names(data))=='age'][1]
  data <- data %>% mutate(age_group = cut(.data[[age_col]], breaks = c(-Inf,30,50,65,Inf), labels = c('<=30','31-50','51-65','65+')))
}
# Example: scale numeric predictors (for algorithms that need it)
scale_vars <- num_vars
if(length(scale_vars)>0){
  data <- data %>% mutate(across(all_of(scale_vars), scale))
}
# Create final modeling dataset: remove ID-like columns
id_like <- names(data)[grepl('id|record|case', tolower(names(data)))]
model_data <- data %>% select(-any_of(id_like))
# Remove near-zero variance predictors
nzv <- caret::nearZeroVar(model_data, saveMetrics = TRUE)
if(any(nzv$nzv)){
  model_data <- model_data %>% select(-which(nzv$nzv))
}
glimpse(model_data)
## Rows: 36
## Columns: 9
## $ calendar_period_of_diagnosis             <fct> Jan-Mar 2018, Apr-Jun 2018, J…
## $ sex                                      <fct> Males, Males, Males, Males, M…
## $ deaths_up_to_timepoint                   <dbl[,1]> <matrix[26 x 1]>
## $ censored_up_to_timepoint                 <dbl[,1]> <matrix[26 x 1]>
## $ patients_remaining_at_risk_at_t          <dbl[,1]> <matrix[26 x 1]>
## $ overall_survival_percent                 <dbl[,1]> <matrix[26 x 1]>
## $ se_for_overall_survival                  <dbl[,1]> <matrix[26 x 1]>
## $ lower_95_percent_ci_for_overall_survival <dbl[,1]> <matrix[26 x 1]>
## $ upper_95_percent_ci_for_overall_survival <dbl[,1]> <matrix[26 x 1]>
if(!is.null(outcome_var) && length(unique(model_data[[outcome_var]]))==2){
  set.seed(123)
  # Partition
  train_index <- createDataPartition(model_data[[outcome_var]], p = 0.75, list = FALSE)
  train <- model_data[train_index,]
  test  <- model_data[-train_index,]
  # Train logistic regression
  formula <- as.formula(paste(outcome_var, '~ .'))
  glm_fit <- glm(formula, data = train, family = binomial)
  summary(glm_fit)
  # Predictions and performance on test
  pred_prob <- predict(glm_fit, newdata = test, type = 'response')
  # Choose 0.5 threshold
  pred_class <- factor(ifelse(pred_prob >= 0.5, levels(train[[outcome_var]])[2], levels(train[[outcome_var]])[1]), levels = levels(train[[outcome_var]]))
  conf_mat <- caret::confusionMatrix(pred_class, test[[outcome_var]])
  print(conf_mat)
  # ROC and AUC
  roc_obj <- pROC::roc(as.numeric(test[[outcome_var]]), as.numeric(pred_prob))
  print(paste('AUC (logistic):', round(pROC::auc(roc_obj),3)))
  plot(roc_obj, main = 'ROC - Logistic Regression')
  # Random forest for comparison
  rf_fit <- randomForest::randomForest(formula, data = train, ntree = 200, importance = TRUE)
  rf_pred_prob <- predict(rf_fit, newdata = test, type = 'prob')[,2]
  rf_pred_class <- predict(rf_fit, newdata = test, type = 'response')
  conf_rf <- caret::confusionMatrix(rf_pred_class, test[[outcome_var]])
  print(conf_rf)
  rf_roc <- pROC::roc(as.numeric(test[[outcome_var]]), as.numeric(rf_pred_prob))
  print(paste('AUC (RF):', round(pROC::auc(rf_roc),3)))
  plot(rf_roc, main = 'ROC - Random Forest')
  # Variable importance
  varImpPlot(rf_fit, main = 'Random Forest Variable Importance')
} else {
  cat('Binary outcome not available or not detected. Modeling section skipped.\n')
}
## Binary outcome not available or not detected. Modeling section skipped.
# Summarise key model metrics if available
if(exists('conf_mat') & exists('conf_rf')){
  metrics <- list(
    logistic = list(confusion = conf_mat$table, accuracy = conf_mat$overall['Accuracy'], auc = pROC::auc(roc_obj)),
    random_forest = list(confusion = conf_rf$table, accuracy = conf_rf$overall['Accuracy'], auc = pROC::auc(rf_roc))
  )
  print(metrics)
}
# Save cleaned data and models (optional)
saveRDS(data, file = 'cleaned_data.rds')
if(exists('glm_fit')) saveRDS(glm_fit, file = 'glm_fit.rds')
if(exists('rf_fit')) saveRDS(rf_fit, file = 'rf_fit.rds')
# Session info for reproducibility
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Kolkata
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] viridis_0.6.5        viridisLite_0.4.2    RColorBrewer_1.1-3  
##  [4] ggthemes_5.1.0       forcats_1.0.0        GGally_2.4.0        
##  [7] naniar_1.1.0         janitor_2.2.1        pROC_1.19.0.1       
## [10] randomForest_4.7-1.2 caret_7.0-1          lattice_0.22-7      
## [13] corrplot_0.95        ggplot2_4.0.0        dplyr_1.1.4         
## [16] readr_2.1.5         
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4051.111    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        digest_0.6.37       
##  [7] rpart_4.1.24         timechange_0.3.0     lifecycle_1.0.4     
## [10] survival_3.8-3       magrittr_2.0.3       compiler_4.5.1      
## [13] rlang_1.1.6          sass_0.4.10          tools_4.5.1         
## [16] utf8_1.2.6           yaml_2.3.10          data.table_1.17.8   
## [19] knitr_1.50           labeling_0.4.3       bit_4.6.0           
## [22] plyr_1.8.9           withr_3.0.2          purrr_1.1.0         
## [25] nnet_7.3-20          grid_4.5.1           stats4_4.5.1        
## [28] future_1.67.0        globals_0.18.0       scales_1.4.0        
## [31] iterators_1.0.14     MASS_7.3-65          cli_3.6.5           
## [34] crayon_1.5.3         rmarkdown_2.29       generics_0.1.4      
## [37] rstudioapi_0.17.1    future.apply_1.20.0  reshape2_1.4.4      
## [40] tzdb_0.5.0           cachem_1.1.0         stringr_1.5.1       
## [43] splines_4.5.1        parallel_4.5.1       vctrs_0.6.5         
## [46] hardhat_1.4.2        Matrix_1.7-3         jsonlite_2.0.0      
## [49] hms_1.1.3            bit64_4.6.0-1        visdat_0.6.0        
## [52] listenv_0.9.1        foreach_1.5.2        gower_1.0.2         
## [55] jquerylib_0.1.4      tidyr_1.3.1          recipes_1.3.1       
## [58] glue_1.8.0           parallelly_1.45.1    ggstats_0.11.0      
## [61] codetools_0.2-20     lubridate_1.9.4      stringi_1.8.7       
## [64] gtable_0.3.6         tibble_3.3.0         pillar_1.11.0       
## [67] htmltools_0.5.8.1    ipred_0.9-15         lava_1.8.2          
## [70] R6_2.6.1             vroom_1.6.5          evaluate_1.0.4      
## [73] snakecase_0.11.1     bslib_0.9.0          class_7.3-23        
## [76] Rcpp_1.1.0           gridExtra_2.3        nlme_3.1-168        
## [79] prodlim_2025.04.28   xfun_0.54            ModelMetrics_1.2.2.2
## [82] pkgconfig_2.0.3