Introduction

This project analyzes the AIBL dataset to classify participants into Healthy Controls (HC) and Non-HC (MCI or AD) using a Random Forest model in R. The process includes data preparation, feature engineering, model training, evaluation, and hyperparameter tuning using packages such as tidyverse, caret, and randomForest.

Load required libraries

library(tidyverse)
library(caret)
library(randomForest)
library(stringr)
library(ggplot2)

Load and prepare dataset

Load datasets

neurobat <- read.csv("aibl_neurobat_01-Jun-2018.csv", stringsAsFactors = FALSE)
pdxconv  <- read.csv("aibl_pdxconv_01-Jun-2018.csv", stringsAsFactors = FALSE)
mmse     <- read.csv("aibl_mmse_01-Jun-2018.csv", stringsAsFactors = FALSE)
cdr      <- read.csv("aibl_cdr_01-Jun-2018.csv", stringsAsFactors = FALSE)
labdata  <- read.csv("aibl_labdata_01-Jun-2018.csv", stringsAsFactors = FALSE)
apoeres  <- read.csv("aibl_apoeres_01-Jun-2018.csv", stringsAsFactors = FALSE)
medhist  <- read.csv("aibl_medhist_01-Jun-2018.csv", stringsAsFactors = FALSE)
ptdemog  <- read.csv("aibl_ptdemog_01-Jun-2018.csv", stringsAsFactors = FALSE)

Filter and Merge Dataset

cdr      <- filter(cdr, VISCODE == "bl")
labdata  <- filter(labdata, VISCODE == "bl")
mmse     <- filter(mmse, VISCODE == "bl")
neurobat <- filter(neurobat, VISCODE == "bl")
pdxconv  <- filter(pdxconv, VISCODE == "bl")

aibl_total <- cdr %>%
  inner_join(labdata,  by = c("RID", "SITEID", "VISCODE")) %>%
  inner_join(mmse,     by = c("RID", "SITEID", "VISCODE")) %>%
  inner_join(neurobat, by = c("RID", "SITEID", "VISCODE")) %>%
  inner_join(pdxconv,  by = c("RID", "SITEID", "VISCODE"))

medhist1  <- medhist[, !(names(medhist) %in% c("SITEID", "VISCODE"))]
apoeres1  <- apoeres[, !(names(apoeres) %in% c("SITEID", "VISCODE"))]
ptdemog1  <- ptdemog[, !(names(ptdemog) %in% c("SITEID", "VISCODE"))]

aibl_total <- aibl_total %>%
  left_join(medhist1, by = "RID") %>%
  left_join(apoeres1, by = "RID") %>%
  left_join(ptdemog1, by = "RID")

Feature Engineering

aibl_total <- aibl_total %>%
  mutate(
    Gender = recode(PTGENDER, `1` = "Male", `2` = "Female"),
    BirthYear = as.numeric(str_extract(PTDOB, "\\d{4}")),
    Age = 2018 - BirthYear,
    DiagnosisGroup = case_when(
      DXCURREN == 1 ~ "HC",
      DXCURREN %in% c(2, 3) ~ "Non-HC",
      TRUE ~ NA_character_
    ),
    APOE4 = rowSums(select(., APGEN1, APGEN2) == 4, na.rm = TRUE)
  ) %>%
  filter(!is.na(DiagnosisGroup))

Prepare data for modelling

aibl_model_data <- aibl_total %>%
  select(-c(RID, SITEID, VISCODE, EXAMDATE.x, EXAMDATE.y, EXAMDATE,
            PTGENDER, PTDOB, DXCURREN, APTESTDT)) %>%
  mutate_if(is.character, as.factor)

str(aibl_model_data)
## 'data.frame':    858 obs. of  33 variables:
##  $ CDGLOBAL      : num  0 0 0 0.5 1 0 0.5 0 0.5 0 ...
##  $ AXT117        : num  1.26 1.31 1.33 1.37 6.99 -4 2.44 1.32 1.77 -4 ...
##  $ BAT126        : num  484 403 430 362 346 ...
##  $ HMT3          : num  4.39 3.87 4.13 5.64 4.29 4.36 3.97 4.42 4.76 4.86 ...
##  $ HMT7          : num  5.5 5.2 6.8 5.7 4.8 5.2 3.8 4.8 5.1 6.1 ...
##  $ HMT13         : num  220 254 327 140 164 219 179 326 182 298 ...
##  $ HMT40         : num  14.4 12.6 13.1 16.5 15 13.8 12.8 13.6 15.1 14 ...
##  $ HMT100        : num  32.8 32.5 31.7 29.3 34.9 31.7 32.2 30.8 31.6 28.9 ...
##  $ HMT102        : num  34.3 34.2 34.2 34 35.1 34.5 35 34.9 34.2 33.3 ...
##  $ RCT6          : num  37.8 18 52.3 33 33.6 ...
##  $ RCT11         : num  90.1 129.7 86.5 108.1 126.1 ...
##  $ RCT20         : num  174 251 217 162 159 ...
##  $ RCT392        : num  0.916 0.792 1.018 1.131 1.131 ...
##  $ MMSCORE       : int  30 30 27 30 21 29 28 30 28 27 ...
##  $ LIMMTOTAL     : int  16 9 9 10 3 9 3 10 9 12 ...
##  $ LDELTOTAL     : int  14 11 2 0 0 7 4 8 7 11 ...
##  $ MHPSYCH       : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ MH2NEURL      : int  1 0 0 0 0 0 0 1 0 0 ...
##  $ MH4CARD       : int  0 0 1 1 0 1 1 0 1 0 ...
##  $ MH6HEPAT      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MH8MUSCL      : int  0 0 1 0 0 0 1 1 1 1 ...
##  $ MH9ENDO       : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ MH10GAST      : int  0 1 0 0 0 1 0 1 0 1 ...
##  $ MH12RENA      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MH16SMOK      : int  -4 0 1 0 1 1 0 1 1 0 ...
##  $ MH17MALI      : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ APGEN1        : int  3 4 3 3 4 3 4 4 3 4 ...
##  $ APGEN2        : int  3 3 3 3 4 3 3 2 3 3 ...
##  $ Gender        : Factor w/ 2 levels "Female","Male": 2 1 1 2 2 2 2 1 2 1 ...
##  $ BirthYear     : num  1941 1939 1922 1933 1925 ...
##  $ Age           : num  77 79 96 85 93 101 90 77 98 92 ...
##  $ DiagnosisGroup: Factor w/ 2 levels "HC","Non-HC": 1 1 1 2 2 1 2 1 2 1 ...
##  $ APOE4         : num  0 1 0 0 2 0 1 1 0 1 ...
table(aibl_model_data$DiagnosisGroup)
## 
##     HC Non-HC 
##    609    249

Descriptive Statistics

aggregate(Age ~ DiagnosisGroup, data = aibl_model_data, summary)
##   DiagnosisGroup  Age.Min. Age.1st Qu. Age.Median  Age.Mean Age.3rd Qu.
## 1             HC  65.00000    75.00000   79.00000  80.01478    84.00000
## 2         Non-HC  61.00000    77.00000   83.00000  82.42570    88.00000
##    Age.Max.
## 1 101.00000
## 2 103.00000
prop.table(table(aibl_model_data$Gender, aibl_model_data$DiagnosisGroup), 2)
##         
##                 HC    Non-HC
##   Female 0.5599343 0.5100402
##   Male   0.4400657 0.4899598
prop.table(table(aibl_model_data$APOE4 > 0, aibl_model_data$DiagnosisGroup), 2)
##        
##                HC    Non-HC
##   FALSE 0.7208539 0.4698795
##   TRUE  0.2791461 0.5301205
aggregate(MMSCORE ~ DiagnosisGroup, data = aibl_model_data, mean)
##   DiagnosisGroup  MMSCORE
## 1             HC 28.71757
## 2         Non-HC 24.12851
aggregate(CDGLOBAL ~ DiagnosisGroup, data = aibl_model_data, mean)
##   DiagnosisGroup   CDGLOBAL
## 1             HC 0.02298851
## 2         Non-HC 0.67068273
aggregate(LIMMTOTAL ~ DiagnosisGroup, data = aibl_model_data, mean)
##   DiagnosisGroup LIMMTOTAL
## 1             HC  12.56158
## 2         Non-HC   5.60241
aggregate(LDELTOTAL ~ DiagnosisGroup, data = aibl_model_data, mean)
##   DiagnosisGroup LDELTOTAL
## 1             HC 11.155993
## 2         Non-HC  3.040161

Missing Data Check

biomarker_cols <- c("AXT117", "BAT126", "HMT3", "HMT7", "HMT13", 
                    "HMT40", "HMT100", "HMT102", "RCT6", 
                    "RCT11", "RCT20", "RCT392")
missing_biomarkers <- sapply(aibl_model_data[biomarker_cols], function(x) mean(is.na(x)) * 100)
round(missing_biomarkers, 2)
## AXT117 BAT126   HMT3   HMT7  HMT13  HMT40 HMT100 HMT102   RCT6  RCT11  RCT20 
##      0      0      0      0      0      0      0      0      0      0      0 
## RCT392 
##      0

Model Training: Random Forest

pre_proc <- preProcess(aibl_model_data, method = c("medianImpute"))
data_imputed <- predict(pre_proc, aibl_model_data)

set.seed(123)
split_index <- createDataPartition(data_imputed$DiagnosisGroup, p = 0.7, list = FALSE)
train_data <- data_imputed[split_index, ]
test_data  <- data_imputed[-split_index, ]

rf_model <- randomForest(DiagnosisGroup ~ ., data = train_data, ntree = 100, importance = TRUE)

pred <- predict(rf_model, newdata = test_data)
conf_matrix <- confusionMatrix(pred, test_data$DiagnosisGroup)
conf_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  HC Non-HC
##     HC     178      6
##     Non-HC   4     68
##                                           
##                Accuracy : 0.9609          
##                  95% CI : (0.9293, 0.9811)
##     No Information Rate : 0.7109          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9042          
##                                           
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.9780          
##             Specificity : 0.9189          
##          Pos Pred Value : 0.9674          
##          Neg Pred Value : 0.9444          
##              Prevalence : 0.7109          
##          Detection Rate : 0.6953          
##    Detection Prevalence : 0.7188          
##       Balanced Accuracy : 0.9485          
##                                           
##        'Positive' Class : HC              
## 

Feature Importance

importance_df <- as.data.frame(importance(rf_model))
importance_df$Feature <- rownames(importance_df)
top_features <- importance_df[order(-importance_df$MeanDecreaseGini), ][1:10, ]

ggplot(top_features, aes(x = reorder(Feature, MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 10 Feature Importances", x = "Feature", y = "Mean Decrease in Gini") +
  theme_minimal()

## Confusion Matrix Visualisation

conf_df <- as.data.frame(conf_matrix$table)
ggplot(conf_df, aes(Prediction, Reference, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 6) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Confusion Matrix", x = "Predicted", y = "Actual") +
  theme_minimal()

## Precision, Recall, F1 Score

metrics <- data.frame(
  Metric = rep(c("Precision", "Recall", "F1-Score"), 2),
  Class = rep(c("HC", "Non-HC"), each = 3),
  Value = c(
    178 / (178 + 6),
    178 / (178 + 4),
    2 * (0.967 * 0.978) / (0.967 + 0.978),
    68 / (68 + 4),
    68 / (68 + 6),
    2 * (0.944 * 0.919) / (0.944 + 0.919)
  )
)
metrics$Value <- round(metrics$Value, 3)

ggplot(metrics, aes(x = Metric, y = Value, fill = Class)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(aes(label = Value), vjust = -0.3, position = position_dodge(0.9), size = 3.5) +
  labs(title = "Precision, Recall, and F1-Score by Class", y = "Score", x = NULL) +
  ylim(0, 1.05) +
  theme_minimal()

## Hyperparameter Tuning with Caret

ctrl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)
tune_grid <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12))

set.seed(123)
rf_tuned <- train(
  DiagnosisGroup ~ .,
  data = train_data,
  method = "rf",
  metric = "Accuracy",
  trControl = ctrl,
  tuneGrid = tune_grid,
  ntree = 100
)
## + Fold1: mtry= 2 
## - Fold1: mtry= 2 
## + Fold1: mtry= 4 
## - Fold1: mtry= 4 
## + Fold1: mtry= 6 
## - Fold1: mtry= 6 
## + Fold1: mtry= 8 
## - Fold1: mtry= 8 
## + Fold1: mtry=10 
## - Fold1: mtry=10 
## + Fold1: mtry=12 
## - Fold1: mtry=12 
## + Fold2: mtry= 2 
## - Fold2: mtry= 2 
## + Fold2: mtry= 4 
## - Fold2: mtry= 4 
## + Fold2: mtry= 6 
## - Fold2: mtry= 6 
## + Fold2: mtry= 8 
## - Fold2: mtry= 8 
## + Fold2: mtry=10 
## - Fold2: mtry=10 
## + Fold2: mtry=12 
## - Fold2: mtry=12 
## + Fold3: mtry= 2 
## - Fold3: mtry= 2 
## + Fold3: mtry= 4 
## - Fold3: mtry= 4 
## + Fold3: mtry= 6 
## - Fold3: mtry= 6 
## + Fold3: mtry= 8 
## - Fold3: mtry= 8 
## + Fold3: mtry=10 
## - Fold3: mtry=10 
## + Fold3: mtry=12 
## - Fold3: mtry=12 
## + Fold4: mtry= 2 
## - Fold4: mtry= 2 
## + Fold4: mtry= 4 
## - Fold4: mtry= 4 
## + Fold4: mtry= 6 
## - Fold4: mtry= 6 
## + Fold4: mtry= 8 
## - Fold4: mtry= 8 
## + Fold4: mtry=10 
## - Fold4: mtry=10 
## + Fold4: mtry=12 
## - Fold4: mtry=12 
## + Fold5: mtry= 2 
## - Fold5: mtry= 2 
## + Fold5: mtry= 4 
## - Fold5: mtry= 4 
## + Fold5: mtry= 6 
## - Fold5: mtry= 6 
## + Fold5: mtry= 8 
## - Fold5: mtry= 8 
## + Fold5: mtry=10 
## - Fold5: mtry=10 
## + Fold5: mtry=12 
## - Fold5: mtry=12 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 12 on full training set
rf_tuned
## Random Forest 
## 
## 602 samples
##  32 predictor
##   2 classes: 'HC', 'Non-HC' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 482, 482, 481, 482, 481 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9119697  0.7780485
##    4    0.9319008  0.8337297
##    6    0.9352204  0.8441862
##    8    0.9352204  0.8440296
##   10    0.9302342  0.8323479
##   12    0.9368871  0.8494611
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
plot(rf_tuned)

pred_tuned <- predict(rf_tuned, newdata = test_data)
confusionMatrix(pred_tuned, test_data$DiagnosisGroup)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  HC Non-HC
##     HC     176      4
##     Non-HC   6     70
##                                           
##                Accuracy : 0.9609          
##                  95% CI : (0.9293, 0.9811)
##     No Information Rate : 0.7109          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9057          
##                                           
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.9670          
##             Specificity : 0.9459          
##          Pos Pred Value : 0.9778          
##          Neg Pred Value : 0.9211          
##              Prevalence : 0.7109          
##          Detection Rate : 0.6875          
##    Detection Prevalence : 0.7031          
##       Balanced Accuracy : 0.9565          
##                                           
##        'Positive' Class : HC              
##