This document presents an analysis of medical diagnosis data.
The goal is to identify key predictors influencing disease
outcomes and evaluate the diagnostic accuracy
of different clinical variables.
# Work on a copy
data <- data_orig
# Make column names consistent
data <- janitor::clean_names(data)
# Remove exact duplicates (rows)
dedup_before <- nrow(data)
data <- distinct(data)
dedup_after <- nrow(data)
cat('Removed', dedup_before - dedup_after, 'duplicate rows\n')
## Removed 0 duplicate rows
# Identify variable types
num_vars <- names(select_if(data, is.numeric))
cat_vars <- names(select_if(data, ~!is.numeric(.)))
# Missingness overview
summary(data)
## cancer_site_grouping calendar_period_of_diagnosis sex
## Length:36 Length:36 Length:36
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## age_group method time_since_diagnosis cases_analysed
## Length:36 Length:36 Length:36 Min. :2535
## Class :character Class :character Class :character 1st Qu.:3522
## Mode :character Mode :character Mode :character Median :3710
## Mean :4685
## 3rd Qu.:6797
## Max. :7730
## deaths_up_to_timepoint censored_up_to_timepoint
## Min. : 899 Min. : 0.000
## 1st Qu.:1034 1st Qu.: 0.000
## Median :1098 Median : 1.000
## Mean :1403 Mean : 4.056
## 3rd Qu.:2036 3rd Qu.: 1.250
## Max. :2264 Max. :56.000
## patients_remaining_at_risk_at_t overall_survival_percent
## Min. :1480 Min. :58.40
## 1st Qu.:2493 1st Qu.:69.12
## Median :2654 Median :70.40
## Mean :3278 Mean :69.80
## 3rd Qu.:4719 3rd Qu.:71.60
## Max. :5635 Max. :73.70
## se_for_overall_survival lower_95_percent_ci_for_overall_survival
## Min. :0.700 Min. :56.50
## 1st Qu.:0.800 1st Qu.:67.92
## Median :1.000 Median :69.20
## Mean :1.008 Mean :68.44
## 3rd Qu.:1.100 3rd Qu.:70.38
## Max. :1.700 Max. :72.30
## upper_95_percent_ci_for_overall_survival
## Min. :60.30
## 1st Qu.:70.38
## Median :71.75
## Mean :71.19
## 3rd Qu.:72.72
## Max. :75.10
miss_summary <- naniar::miss_var_summary(data)
miss_summary
## # A tibble: 14 × 3
## variable n_miss pct_miss
## <chr> <int> <num>
## 1 cancer_site_grouping 0 0
## 2 calendar_period_of_diagnosis 0 0
## 3 sex 0 0
## 4 age_group 0 0
## 5 method 0 0
## 6 time_since_diagnosis 0 0
## 7 cases_analysed 0 0
## 8 deaths_up_to_timepoint 0 0
## 9 censored_up_to_timepoint 0 0
## 10 patients_remaining_at_risk_at_t 0 0
## 11 overall_survival_percent 0 0
## 12 se_for_overall_survival 0 0
## 13 lower_95_percent_ci_for_overall_survival 0 0
## 14 upper_95_percent_ci_for_overall_survival 0 0
# Simple missing-value treatment (demonstration):
# - Numeric: median imputation
# - Character/factor: mode imputation
mode_fun <- function(x){
ux <- unique(x[!is.na(x)])
ux[which.max(tabulate(match(x, ux)))]
}
for (v in num_vars){
if (any(is.na(data[[v]]))){
data[[v]][is.na(data[[v]])] <- median(data[[v]], na.rm = TRUE)
}
}
for (v in cat_vars){
if (any(is.na(data[[v]]))){
if (is.character(data[[v]])) data[[v]] <- as.factor(data[[v]])
data[[v]][is.na(data[[v]])] <- mode_fun(data[[v]])
}
}
# Convert character columns to factors
data <- data %>% mutate(across(where(is.character), as.factor))
# Attempt to detect an outcome variable automatically:
possible_outcomes <- names(data)[tolower(names(data)) %in% c('diagnosis','outcome','status','disease','target','label')]
# Fallback: find any binary variable (two unique values)
if(length(possible_outcomes)==0){
bin_vars <- sapply(data, function(x) length(unique(na.omit(x)))==2)
possible_outcomes <- names(data)[bin_vars]
}
cat('Candidate outcome variables:', paste(possible_outcomes, collapse=', '), '\n')
## Candidate outcome variables:
# Choose first candidate as outcome (user can change later)
outcome_var <- if(length(possible_outcomes)>0) possible_outcomes[1] else NULL
if(!is.null(outcome_var)){
cat('Using', outcome_var, 'as outcome variable.\n')
data[[outcome_var]] <- as.factor(data[[outcome_var]])
} else {
cat('No clear binary outcome detected. You may need to define one manually.\n')
}
## No clear binary outcome detected. You may need to define one manually.
# Brief cleaned data glimpse
glimpse(data)
## Rows: 36
## Columns: 14
## $ cancer_site_grouping <fct> All malignant neoplasms exclu…
## $ calendar_period_of_diagnosis <fct> Jan-Mar 2018, Apr-Jun 2018, J…
## $ sex <fct> Males, Males, Males, Males, M…
## $ age_group <fct> 15-99, 15-99, 15-99, 15-99, 1…
## $ method <fct> Non-standardised, Non-standar…
## $ time_since_diagnosis <fct> 1 year, 1 year, 1 year, 1 yea…
## $ cases_analysed <dbl> 3544, 3853, 3583, 3685, 3646,…
## $ deaths_up_to_timepoint <dbl> 1050, 1070, 1090, 1103, 1096,…
## $ censored_up_to_timepoint <dbl> 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,…
## $ patients_remaining_at_risk_at_t <dbl> 2493, 2782, 2492, 2581, 2550,…
## $ overall_survival_percent <dbl> 70.4, 72.2, 69.6, 70.1, 69.9,…
## $ se_for_overall_survival <dbl> 1.1, 1.0, 1.1, 1.1, 1.1, 1.1,…
## $ lower_95_percent_ci_for_overall_survival <dbl> 68.9, 70.8, 68.1, 68.6, 68.5,…
## $ upper_95_percent_ci_for_overall_survival <dbl> 71.9, 73.7, 71.1, 71.6, 71.4,…
# ---------------------------------------------
# Enhanced and Colorful Visualizations for EDA
# ---------------------------------------------
library(RColorBrewer)
library(viridis)
library(ggthemes)
# Choose vibrant color palettes
hist_fill <- "steelblue"
bar_palette <- scale_fill_brewer(palette = "Set2")
corr_colors <- colorRampPalette(c("#2E8B57", "#FFFFFF", "#8B0000"))(200)
# 1. Numeric distribution plots
if(length(num_vars) > 0){
for (v in head(num_vars, 6)){
print(
ggplot(data, aes_string(x = v)) +
geom_histogram(bins = 30, fill = hist_fill, color = "white", alpha = 0.8) +
ggtitle(paste("Distribution of", v)) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(color = "#2C3E50", face = "bold", hjust = 0.5),
panel.grid.minor = element_blank()
)
)
}
}
# 2. Barplots for categorical variables
if(length(cat_vars) > 0){
for (v in head(cat_vars, 6)){
print(
ggplot(data, aes_string(x = v, fill = v)) +
geom_bar(alpha = 0.9) +
bar_palette +
ggtitle(paste("Counts of", v)) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(color = "#34495E", face = "bold", hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1)
)
)
}
}
# 3. Correlation matrix (colorful heatmap)
if(length(num_vars) > 1){
numdat <- select(data, all_of(num_vars))
corr <- cor(numdat, use = 'pairwise.complete.obs')
corrplot::corrplot(
corr,
method = "color",
tl.cex = 0.8,
col = corr_colors,
addCoef.col = "black",
number.cex = 0.6,
title = "\nCorrelation Matrix of Numeric Variables"
)
}
# 4. Pairwise scatter plots (colored by first categorical var if exists)
if(length(num_vars) >= 3){
if(length(cat_vars) >= 1){
print(GGally::ggpairs(select(data, head(num_vars, 4), all_of(cat_vars[1])), aes(color = !!sym(cat_vars[1]), alpha = 0.7)) +
ggtitle("Pairwise Relationships Between Numeric Variables"))
} else {
print(GGally::ggpairs(select(data, head(num_vars, 4))) +
ggtitle("Pairwise Relationships Between Numeric Variables"))
}
}
# Example: create age groups if age exists
if('age' %in% tolower(names(data))){
age_col <- names(data)[tolower(names(data))=='age'][1]
data <- data %>% mutate(age_group = cut(.data[[age_col]], breaks = c(-Inf,30,50,65,Inf), labels = c('<=30','31-50','51-65','65+')))
}
# Example: scale numeric predictors (for algorithms that need it)
scale_vars <- num_vars
if(length(scale_vars)>0){
data <- data %>% mutate(across(all_of(scale_vars), scale))
}
# Create final modeling dataset: remove ID-like columns
id_like <- names(data)[grepl('id|record|case', tolower(names(data)))]
model_data <- data %>% select(-any_of(id_like))
# Remove near-zero variance predictors
nzv <- caret::nearZeroVar(model_data, saveMetrics = TRUE)
if(any(nzv$nzv)){
model_data <- model_data %>% select(-which(nzv$nzv))
}
glimpse(model_data)
## Rows: 36
## Columns: 9
## $ calendar_period_of_diagnosis <fct> Jan-Mar 2018, Apr-Jun 2018, J…
## $ sex <fct> Males, Males, Males, Males, M…
## $ deaths_up_to_timepoint <dbl[,1]> <matrix[26 x 1]>
## $ censored_up_to_timepoint <dbl[,1]> <matrix[26 x 1]>
## $ patients_remaining_at_risk_at_t <dbl[,1]> <matrix[26 x 1]>
## $ overall_survival_percent <dbl[,1]> <matrix[26 x 1]>
## $ se_for_overall_survival <dbl[,1]> <matrix[26 x 1]>
## $ lower_95_percent_ci_for_overall_survival <dbl[,1]> <matrix[26 x 1]>
## $ upper_95_percent_ci_for_overall_survival <dbl[,1]> <matrix[26 x 1]>
if(!is.null(outcome_var) && length(unique(model_data[[outcome_var]]))==2){
set.seed(123)
# Partition
train_index <- createDataPartition(model_data[[outcome_var]], p = 0.75, list = FALSE)
train <- model_data[train_index,]
test <- model_data[-train_index,]
# Train logistic regression
formula <- as.formula(paste(outcome_var, '~ .'))
glm_fit <- glm(formula, data = train, family = binomial)
summary(glm_fit)
# Predictions and performance on test
pred_prob <- predict(glm_fit, newdata = test, type = 'response')
# Choose 0.5 threshold
pred_class <- factor(ifelse(pred_prob >= 0.5, levels(train[[outcome_var]])[2], levels(train[[outcome_var]])[1]), levels = levels(train[[outcome_var]]))
conf_mat <- caret::confusionMatrix(pred_class, test[[outcome_var]])
print(conf_mat)
# ROC and AUC
roc_obj <- pROC::roc(as.numeric(test[[outcome_var]]), as.numeric(pred_prob))
print(paste('AUC (logistic):', round(pROC::auc(roc_obj),3)))
plot(roc_obj, main = 'ROC - Logistic Regression')
# Random forest for comparison
rf_fit <- randomForest::randomForest(formula, data = train, ntree = 200, importance = TRUE)
rf_pred_prob <- predict(rf_fit, newdata = test, type = 'prob')[,2]
rf_pred_class <- predict(rf_fit, newdata = test, type = 'response')
conf_rf <- caret::confusionMatrix(rf_pred_class, test[[outcome_var]])
print(conf_rf)
rf_roc <- pROC::roc(as.numeric(test[[outcome_var]]), as.numeric(rf_pred_prob))
print(paste('AUC (RF):', round(pROC::auc(rf_roc),3)))
plot(rf_roc, main = 'ROC - Random Forest')
# Variable importance
varImpPlot(rf_fit, main = 'Random Forest Variable Importance')
} else {
cat('Binary outcome not available or not detected. Modeling section skipped.\n')
}
## Binary outcome not available or not detected. Modeling section skipped.
# Summarise key model metrics if available
if(exists('conf_mat') & exists('conf_rf')){
metrics <- list(
logistic = list(confusion = conf_mat$table, accuracy = conf_mat$overall['Accuracy'], auc = pROC::auc(roc_obj)),
random_forest = list(confusion = conf_rf$table, accuracy = conf_rf$overall['Accuracy'], auc = pROC::auc(rf_roc))
)
print(metrics)
}
# Save cleaned data and models (optional)
saveRDS(data, file = 'cleaned_data.rds')
if(exists('glm_fit')) saveRDS(glm_fit, file = 'glm_fit.rds')
if(exists('rf_fit')) saveRDS(rf_fit, file = 'rf_fit.rds')
# Session info for reproducibility
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Asia/Kolkata
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] viridis_0.6.5 viridisLite_0.4.2 RColorBrewer_1.1-3
## [4] ggthemes_5.1.0 forcats_1.0.0 GGally_2.4.0
## [7] naniar_1.1.0 janitor_2.2.1 pROC_1.19.0.1
## [10] randomForest_4.7-1.2 caret_7.0-1 lattice_0.22-7
## [13] corrplot_0.95 ggplot2_4.0.0 dplyr_1.1.4
## [16] readr_2.1.5
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4051.111 farver_2.1.2
## [4] S7_0.2.0 fastmap_1.2.0 digest_0.6.37
## [7] rpart_4.1.24 timechange_0.3.0 lifecycle_1.0.4
## [10] survival_3.8-3 magrittr_2.0.3 compiler_4.5.1
## [13] rlang_1.1.6 sass_0.4.10 tools_4.5.1
## [16] utf8_1.2.6 yaml_2.3.10 data.table_1.17.8
## [19] knitr_1.50 labeling_0.4.3 bit_4.6.0
## [22] plyr_1.8.9 withr_3.0.2 purrr_1.1.0
## [25] nnet_7.3-20 grid_4.5.1 stats4_4.5.1
## [28] future_1.67.0 globals_0.18.0 scales_1.4.0
## [31] iterators_1.0.14 MASS_7.3-65 cli_3.6.5
## [34] crayon_1.5.3 rmarkdown_2.29 generics_0.1.4
## [37] rstudioapi_0.17.1 future.apply_1.20.0 reshape2_1.4.4
## [40] tzdb_0.5.0 cachem_1.1.0 stringr_1.5.1
## [43] splines_4.5.1 parallel_4.5.1 vctrs_0.6.5
## [46] hardhat_1.4.2 Matrix_1.7-3 jsonlite_2.0.0
## [49] hms_1.1.3 bit64_4.6.0-1 visdat_0.6.0
## [52] listenv_0.9.1 foreach_1.5.2 gower_1.0.2
## [55] jquerylib_0.1.4 tidyr_1.3.1 recipes_1.3.1
## [58] glue_1.8.0 parallelly_1.45.1 ggstats_0.11.0
## [61] codetools_0.2-20 lubridate_1.9.4 stringi_1.8.7
## [64] gtable_0.3.6 tibble_3.3.0 pillar_1.11.0
## [67] htmltools_0.5.8.1 ipred_0.9-15 lava_1.8.2
## [70] R6_2.6.1 vroom_1.6.5 evaluate_1.0.4
## [73] snakecase_0.11.1 bslib_0.9.0 class_7.3-23
## [76] Rcpp_1.1.0 gridExtra_2.3 nlme_3.1-168
## [79] prodlim_2025.04.28 xfun_0.54 ModelMetrics_1.2.2.2
## [82] pkgconfig_2.0.3