This project analyzes the AIBL dataset to classify participants into Healthy Controls (HC) and Non-HC (MCI or AD) using a Random Forest model in R. The process includes data preparation, feature engineering, model training, evaluation, and hyperparameter tuning using packages such as tidyverse, caret, and randomForest.
library(tidyverse)
library(caret)
library(randomForest)
library(stringr)
library(ggplot2)
neurobat <- read.csv("aibl_neurobat_01-Jun-2018.csv", stringsAsFactors = FALSE)
pdxconv <- read.csv("aibl_pdxconv_01-Jun-2018.csv", stringsAsFactors = FALSE)
mmse <- read.csv("aibl_mmse_01-Jun-2018.csv", stringsAsFactors = FALSE)
cdr <- read.csv("aibl_cdr_01-Jun-2018.csv", stringsAsFactors = FALSE)
labdata <- read.csv("aibl_labdata_01-Jun-2018.csv", stringsAsFactors = FALSE)
apoeres <- read.csv("aibl_apoeres_01-Jun-2018.csv", stringsAsFactors = FALSE)
medhist <- read.csv("aibl_medhist_01-Jun-2018.csv", stringsAsFactors = FALSE)
ptdemog <- read.csv("aibl_ptdemog_01-Jun-2018.csv", stringsAsFactors = FALSE)
cdr <- filter(cdr, VISCODE == "bl")
labdata <- filter(labdata, VISCODE == "bl")
mmse <- filter(mmse, VISCODE == "bl")
neurobat <- filter(neurobat, VISCODE == "bl")
pdxconv <- filter(pdxconv, VISCODE == "bl")
aibl_total <- cdr %>%
inner_join(labdata, by = c("RID", "SITEID", "VISCODE")) %>%
inner_join(mmse, by = c("RID", "SITEID", "VISCODE")) %>%
inner_join(neurobat, by = c("RID", "SITEID", "VISCODE")) %>%
inner_join(pdxconv, by = c("RID", "SITEID", "VISCODE"))
medhist1 <- medhist[, !(names(medhist) %in% c("SITEID", "VISCODE"))]
apoeres1 <- apoeres[, !(names(apoeres) %in% c("SITEID", "VISCODE"))]
ptdemog1 <- ptdemog[, !(names(ptdemog) %in% c("SITEID", "VISCODE"))]
aibl_total <- aibl_total %>%
left_join(medhist1, by = "RID") %>%
left_join(apoeres1, by = "RID") %>%
left_join(ptdemog1, by = "RID")
aibl_total <- aibl_total %>%
mutate(
Gender = recode(PTGENDER, `1` = "Male", `2` = "Female"),
BirthYear = as.numeric(str_extract(PTDOB, "\\d{4}")),
Age = 2018 - BirthYear,
DiagnosisGroup = case_when(
DXCURREN == 1 ~ "HC",
DXCURREN %in% c(2, 3) ~ "Non-HC",
TRUE ~ NA_character_
),
APOE4 = rowSums(select(., APGEN1, APGEN2) == 4, na.rm = TRUE)
) %>%
filter(!is.na(DiagnosisGroup))
aibl_model_data <- aibl_total %>%
select(-c(RID, SITEID, VISCODE, EXAMDATE.x, EXAMDATE.y, EXAMDATE,
PTGENDER, PTDOB, DXCURREN, APTESTDT)) %>%
mutate_if(is.character, as.factor)
str(aibl_model_data)
## 'data.frame': 858 obs. of 33 variables:
## $ CDGLOBAL : num 0 0 0 0.5 1 0 0.5 0 0.5 0 ...
## $ AXT117 : num 1.26 1.31 1.33 1.37 6.99 -4 2.44 1.32 1.77 -4 ...
## $ BAT126 : num 484 403 430 362 346 ...
## $ HMT3 : num 4.39 3.87 4.13 5.64 4.29 4.36 3.97 4.42 4.76 4.86 ...
## $ HMT7 : num 5.5 5.2 6.8 5.7 4.8 5.2 3.8 4.8 5.1 6.1 ...
## $ HMT13 : num 220 254 327 140 164 219 179 326 182 298 ...
## $ HMT40 : num 14.4 12.6 13.1 16.5 15 13.8 12.8 13.6 15.1 14 ...
## $ HMT100 : num 32.8 32.5 31.7 29.3 34.9 31.7 32.2 30.8 31.6 28.9 ...
## $ HMT102 : num 34.3 34.2 34.2 34 35.1 34.5 35 34.9 34.2 33.3 ...
## $ RCT6 : num 37.8 18 52.3 33 33.6 ...
## $ RCT11 : num 90.1 129.7 86.5 108.1 126.1 ...
## $ RCT20 : num 174 251 217 162 159 ...
## $ RCT392 : num 0.916 0.792 1.018 1.131 1.131 ...
## $ MMSCORE : int 30 30 27 30 21 29 28 30 28 27 ...
## $ LIMMTOTAL : int 16 9 9 10 3 9 3 10 9 12 ...
## $ LDELTOTAL : int 14 11 2 0 0 7 4 8 7 11 ...
## $ MHPSYCH : int 0 1 0 0 0 0 0 1 0 0 ...
## $ MH2NEURL : int 1 0 0 0 0 0 0 1 0 0 ...
## $ MH4CARD : int 0 0 1 1 0 1 1 0 1 0 ...
## $ MH6HEPAT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MH8MUSCL : int 0 0 1 0 0 0 1 1 1 1 ...
## $ MH9ENDO : int 0 0 0 1 0 0 0 0 0 0 ...
## $ MH10GAST : int 0 1 0 0 0 1 0 1 0 1 ...
## $ MH12RENA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MH16SMOK : int -4 0 1 0 1 1 0 1 1 0 ...
## $ MH17MALI : int 0 0 0 0 0 0 0 0 0 1 ...
## $ APGEN1 : int 3 4 3 3 4 3 4 4 3 4 ...
## $ APGEN2 : int 3 3 3 3 4 3 3 2 3 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 2 2 2 1 2 1 ...
## $ BirthYear : num 1941 1939 1922 1933 1925 ...
## $ Age : num 77 79 96 85 93 101 90 77 98 92 ...
## $ DiagnosisGroup: Factor w/ 2 levels "HC","Non-HC": 1 1 1 2 2 1 2 1 2 1 ...
## $ APOE4 : num 0 1 0 0 2 0 1 1 0 1 ...
table(aibl_model_data$DiagnosisGroup)
##
## HC Non-HC
## 609 249
aggregate(Age ~ DiagnosisGroup, data = aibl_model_data, summary)
## DiagnosisGroup Age.Min. Age.1st Qu. Age.Median Age.Mean Age.3rd Qu.
## 1 HC 65.00000 75.00000 79.00000 80.01478 84.00000
## 2 Non-HC 61.00000 77.00000 83.00000 82.42570 88.00000
## Age.Max.
## 1 101.00000
## 2 103.00000
prop.table(table(aibl_model_data$Gender, aibl_model_data$DiagnosisGroup), 2)
##
## HC Non-HC
## Female 0.5599343 0.5100402
## Male 0.4400657 0.4899598
prop.table(table(aibl_model_data$APOE4 > 0, aibl_model_data$DiagnosisGroup), 2)
##
## HC Non-HC
## FALSE 0.7208539 0.4698795
## TRUE 0.2791461 0.5301205
aggregate(MMSCORE ~ DiagnosisGroup, data = aibl_model_data, mean)
## DiagnosisGroup MMSCORE
## 1 HC 28.71757
## 2 Non-HC 24.12851
aggregate(CDGLOBAL ~ DiagnosisGroup, data = aibl_model_data, mean)
## DiagnosisGroup CDGLOBAL
## 1 HC 0.02298851
## 2 Non-HC 0.67068273
aggregate(LIMMTOTAL ~ DiagnosisGroup, data = aibl_model_data, mean)
## DiagnosisGroup LIMMTOTAL
## 1 HC 12.56158
## 2 Non-HC 5.60241
aggregate(LDELTOTAL ~ DiagnosisGroup, data = aibl_model_data, mean)
## DiagnosisGroup LDELTOTAL
## 1 HC 11.155993
## 2 Non-HC 3.040161
biomarker_cols <- c("AXT117", "BAT126", "HMT3", "HMT7", "HMT13",
"HMT40", "HMT100", "HMT102", "RCT6",
"RCT11", "RCT20", "RCT392")
missing_biomarkers <- sapply(aibl_model_data[biomarker_cols], function(x) mean(is.na(x)) * 100)
round(missing_biomarkers, 2)
## AXT117 BAT126 HMT3 HMT7 HMT13 HMT40 HMT100 HMT102 RCT6 RCT11 RCT20
## 0 0 0 0 0 0 0 0 0 0 0
## RCT392
## 0
pre_proc <- preProcess(aibl_model_data, method = c("medianImpute"))
data_imputed <- predict(pre_proc, aibl_model_data)
set.seed(123)
split_index <- createDataPartition(data_imputed$DiagnosisGroup, p = 0.7, list = FALSE)
train_data <- data_imputed[split_index, ]
test_data <- data_imputed[-split_index, ]
rf_model <- randomForest(DiagnosisGroup ~ ., data = train_data, ntree = 100, importance = TRUE)
pred <- predict(rf_model, newdata = test_data)
conf_matrix <- confusionMatrix(pred, test_data$DiagnosisGroup)
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction HC Non-HC
## HC 178 6
## Non-HC 4 68
##
## Accuracy : 0.9609
## 95% CI : (0.9293, 0.9811)
## No Information Rate : 0.7109
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9042
##
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.9780
## Specificity : 0.9189
## Pos Pred Value : 0.9674
## Neg Pred Value : 0.9444
## Prevalence : 0.7109
## Detection Rate : 0.6953
## Detection Prevalence : 0.7188
## Balanced Accuracy : 0.9485
##
## 'Positive' Class : HC
##
importance_df <- as.data.frame(importance(rf_model))
importance_df$Feature <- rownames(importance_df)
top_features <- importance_df[order(-importance_df$MeanDecreaseGini), ][1:10, ]
ggplot(top_features, aes(x = reorder(Feature, MeanDecreaseGini), y = MeanDecreaseGini)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(title = "Top 10 Feature Importances", x = "Feature", y = "Mean Decrease in Gini") +
theme_minimal()
## Confusion Matrix Visualisation
conf_df <- as.data.frame(conf_matrix$table)
ggplot(conf_df, aes(Prediction, Reference, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 6) +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = "Confusion Matrix", x = "Predicted", y = "Actual") +
theme_minimal()
## Precision, Recall, F1 Score
metrics <- data.frame(
Metric = rep(c("Precision", "Recall", "F1-Score"), 2),
Class = rep(c("HC", "Non-HC"), each = 3),
Value = c(
178 / (178 + 6),
178 / (178 + 4),
2 * (0.967 * 0.978) / (0.967 + 0.978),
68 / (68 + 4),
68 / (68 + 6),
2 * (0.944 * 0.919) / (0.944 + 0.919)
)
)
metrics$Value <- round(metrics$Value, 3)
ggplot(metrics, aes(x = Metric, y = Value, fill = Class)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = Value), vjust = -0.3, position = position_dodge(0.9), size = 3.5) +
labs(title = "Precision, Recall, and F1-Score by Class", y = "Score", x = NULL) +
ylim(0, 1.05) +
theme_minimal()
## Hyperparameter Tuning with Caret
ctrl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)
tune_grid <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12))
set.seed(123)
rf_tuned <- train(
DiagnosisGroup ~ .,
data = train_data,
method = "rf",
metric = "Accuracy",
trControl = ctrl,
tuneGrid = tune_grid,
ntree = 100
)
## + Fold1: mtry= 2
## - Fold1: mtry= 2
## + Fold1: mtry= 4
## - Fold1: mtry= 4
## + Fold1: mtry= 6
## - Fold1: mtry= 6
## + Fold1: mtry= 8
## - Fold1: mtry= 8
## + Fold1: mtry=10
## - Fold1: mtry=10
## + Fold1: mtry=12
## - Fold1: mtry=12
## + Fold2: mtry= 2
## - Fold2: mtry= 2
## + Fold2: mtry= 4
## - Fold2: mtry= 4
## + Fold2: mtry= 6
## - Fold2: mtry= 6
## + Fold2: mtry= 8
## - Fold2: mtry= 8
## + Fold2: mtry=10
## - Fold2: mtry=10
## + Fold2: mtry=12
## - Fold2: mtry=12
## + Fold3: mtry= 2
## - Fold3: mtry= 2
## + Fold3: mtry= 4
## - Fold3: mtry= 4
## + Fold3: mtry= 6
## - Fold3: mtry= 6
## + Fold3: mtry= 8
## - Fold3: mtry= 8
## + Fold3: mtry=10
## - Fold3: mtry=10
## + Fold3: mtry=12
## - Fold3: mtry=12
## + Fold4: mtry= 2
## - Fold4: mtry= 2
## + Fold4: mtry= 4
## - Fold4: mtry= 4
## + Fold4: mtry= 6
## - Fold4: mtry= 6
## + Fold4: mtry= 8
## - Fold4: mtry= 8
## + Fold4: mtry=10
## - Fold4: mtry=10
## + Fold4: mtry=12
## - Fold4: mtry=12
## + Fold5: mtry= 2
## - Fold5: mtry= 2
## + Fold5: mtry= 4
## - Fold5: mtry= 4
## + Fold5: mtry= 6
## - Fold5: mtry= 6
## + Fold5: mtry= 8
## - Fold5: mtry= 8
## + Fold5: mtry=10
## - Fold5: mtry=10
## + Fold5: mtry=12
## - Fold5: mtry=12
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 12 on full training set
rf_tuned
## Random Forest
##
## 602 samples
## 32 predictor
## 2 classes: 'HC', 'Non-HC'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 482, 482, 481, 482, 481
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9119697 0.7780485
## 4 0.9319008 0.8337297
## 6 0.9352204 0.8441862
## 8 0.9352204 0.8440296
## 10 0.9302342 0.8323479
## 12 0.9368871 0.8494611
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
plot(rf_tuned)
pred_tuned <- predict(rf_tuned, newdata = test_data)
confusionMatrix(pred_tuned, test_data$DiagnosisGroup)
## Confusion Matrix and Statistics
##
## Reference
## Prediction HC Non-HC
## HC 176 4
## Non-HC 6 70
##
## Accuracy : 0.9609
## 95% CI : (0.9293, 0.9811)
## No Information Rate : 0.7109
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9057
##
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.9670
## Specificity : 0.9459
## Pos Pred Value : 0.9778
## Neg Pred Value : 0.9211
## Prevalence : 0.7109
## Detection Rate : 0.6875
## Detection Prevalence : 0.7031
## Balanced Accuracy : 0.9565
##
## 'Positive' Class : HC
##