MSI Status Prediction RF Model

Author

Sehyun Oh

Published

January 10, 2025

Abstract
Using the combined cancer samples for training
RAVmodel <- getModel('C2', load=TRUE)
[1] "downloading"

1. Data Preprocessing

## Samples with MSI status info
combined_data <- readRDS("data/combined_data.rds") # combined cancer samples from three TCGA datasets
target_attr <- "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"
combined_data_msi <- combined_data[,combined_data[[target_attr]] %in% c("msi-h", "mss")]

## Predictor RAVs
RAVs_combinedTCGA <- c(
    517, 220, 2109, 1303, 324, 438, 868, # RAVs that have statistically significant pairwise wilcoxon p-values of mss vs msi-h
    834, 190, 1166, # RAVs with significant KW test statistic (p-value < 0.05) for COAD
    2344, # significant KW test value for STAD, includes 324, 868, 517 above
    357) # UCEC KW test value (p-value = 0.056)
RAVmodel_sub <- RAVmodel[,RAVs_combinedTCGA]

## Calculate sample scores
sampleScore <- calculateScore(assay(combined_data_msi), RAVmodel_sub)

The data object with the predictors (i.e., sample scores from 12 RAVs) and response (i.e., MSI status) variable.

data <- as.data.frame(sampleScore)
data$status <- colData(combined_data_msi)[[target_attr]]
data$status <- ifelse(data$status == "msi-h", "MSI", "MSS")
data$status <- factor(data$status, levels = c("MSS", "MSI")) # Convert outcome to factor

Split data into training and testing sets

# Randomly select 70% of samples for training
set.seed(1)
num_sample <- nrow(data)
index <- sample(seq_len(num_sample), round(num_sample*0.7))

train_data <- data[index, ]
test_data <- data[-index, ]
table(colData(combined_data)$admin.disease_code)

coad stad ucec 
 283  415  176 
disease_code <- colData(combined_data)[c("patientID", "admin.disease_code")]

## Split test datasets per cancer types
coad_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "coad") %>% pull(patientID)
test_data_coad <- test_data[rownames(test_data) %in% coad_id,]
stad_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "stad") %>% pull(patientID)
test_data_stad <- test_data[rownames(test_data) %in% stad_id,]
ucec_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "ucec") %>% pull(patientID)
test_data_ucec <- test_data[rownames(test_data) %in% ucec_id,]

2. Handle class imbalance using ROSE

ROSE is used for handling class imbalance (alternatives include SMOTE from the DMwR package)

# balanced_train <- ROSE(status ~ ., data = train_data)$data

3. Basic Logistic Regression

Both logistic regression and random forest are implemented for comparison

# ## With cross-validation
# ctrl <- trainControl(method = "cv", 
#                      number = 5,
#                      classProbs = TRUE,
#                      summaryFunction = twoClassSummary)
# 
# ## Train logistic regression
# log_model <- train(status ~ .,
#                    data = balanced_train,
#                    method = "glm",
#                    family = "binomial",
#                    trControl = ctrl,
#                    metric = "ROC")

4. Random Forest with class weights

Feature importance is assessed through random forest

rf_model <- randomForest(status ~ .,
                         data = train_data,
                         ntree = 500,
                         classwt = c("MSS" = 1, "MSI" = 3))

5. Model Evaluation

Performance evaluation includes ROC curves and confusion matrices

Combined test data

## Predictions on test set
rf_pred <- predict(rf_model, test_data, type = "prob")

## ROC curves
roc_rf <- roc(test_data$status, rf_pred[, "MSI"])
Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
## Plot ROC curves
# plot(roc_rf, col = "blue")
plot_roc_pROC(test_data$status, rf_pred[, "MSI"], 
              paste0("MSI Status ROC (n = ", nrow(test_data),")")) # update the plot title
Setting levels: control = MSS, case = MSI
Setting direction: controls < cases

TCGA-COAD test data

dat <- test_data_coad
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"], 
              paste0("TCGA-COAD MSI Status ROC (n = ", nrow(dat),")")) # update the plot title
Setting levels: control = MSS, case = MSI
Setting direction: controls < cases

roc_coad <- roc(dat$status, rf_pred[, "MSI"])
Setting levels: control = MSS, case = MSI
Setting direction: controls < cases

TCGA-STAD test data

dat <- test_data_stad
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"], 
              paste0("TCGA-STAD MSI Status ROC (n = ", nrow(dat),")")) # update the plot title
Setting levels: control = MSS, case = MSI
Setting direction: controls < cases

roc_stad <- roc(dat$status, rf_pred[, "MSI"])
Setting levels: control = MSS, case = MSI
Setting direction: controls < cases

TCGA-UCEC test data

dat <- test_data_ucec
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"], 
              paste0("TCGA-UCEC MSI Status ROC (n = ", nrow(dat),")")) # update the plot title
Setting levels: control = MSS, case = MSI
Setting direction: controls < cases

roc_ucec <- roc(dat$status, rf_pred[, "MSI"])
Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
## Combined ROC curves
plot(roc_rf, col = "blue")
lines(roc_coad, col = "red")
lines(roc_stad, col = "orange")
# lines(roc_ucec, col = "yellow")

legend("bottomright", legend = c("All", "TCGA-COAD", "TCGA-STAD"),
       col = c("blue", "red", "orange"), lwd = 2)

6. Feature Importance (for Random Forest)

importance(rf_model)
        MeanDecreaseGini
RAV517         35.372148
RAV220          8.684906
RAV2109        18.565805
RAV1303         9.636089
RAV324         13.026021
RAV438         11.328097
RAV868         12.726725
RAV834         42.848759
RAV190          9.745338
RAV1166        11.657381
RAV2344        13.221008
RAV357          8.587273
varImpPlot(rf_model)

7. Confusion Matrix

confusionMatrix(predict(rf_model, test_data), test_data$status) # combined 
Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS 155  25
       MSI  14  29
                                          
               Accuracy : 0.8251          
                 95% CI : (0.7688, 0.8726)
    No Information Rate : 0.7578          
    P-Value [Acc > NIR] : 0.00985         
                                          
                  Kappa : 0.488           
                                          
 Mcnemar's Test P-Value : 0.10931         
                                          
            Sensitivity : 0.9172          
            Specificity : 0.5370          
         Pos Pred Value : 0.8611          
         Neg Pred Value : 0.6744          
             Prevalence : 0.7578          
         Detection Rate : 0.6951          
   Detection Prevalence : 0.8072          
      Balanced Accuracy : 0.7271          
                                          
       'Positive' Class : MSS             
                                          
confusionMatrix(predict(rf_model, test_data_coad), test_data_coad$status) # TCGA-COAD (n = 80)
Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS  59   2
       MSI   5  14
                                         
               Accuracy : 0.9125         
                 95% CI : (0.828, 0.9641)
    No Information Rate : 0.8            
    P-Value [Acc > NIR] : 0.005272       
                                         
                  Kappa : 0.7445         
                                         
 Mcnemar's Test P-Value : 0.449692       
                                         
            Sensitivity : 0.9219         
            Specificity : 0.8750         
         Pos Pred Value : 0.9672         
         Neg Pred Value : 0.7368         
             Prevalence : 0.8000         
         Detection Rate : 0.7375         
   Detection Prevalence : 0.7625         
      Balanced Accuracy : 0.8984         
                                         
       'Positive' Class : MSS            
                                         
confusionMatrix(predict(rf_model, test_data_stad), test_data_stad$status) # TCGA-STAD (n = 99)
Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS  70   9
       MSI   7  13
                                          
               Accuracy : 0.8384          
                 95% CI : (0.7509, 0.9047)
    No Information Rate : 0.7778          
    P-Value [Acc > NIR] : 0.08846         
                                          
                  Kappa : 0.5168          
                                          
 Mcnemar's Test P-Value : 0.80259         
                                          
            Sensitivity : 0.9091          
            Specificity : 0.5909          
         Pos Pred Value : 0.8861          
         Neg Pred Value : 0.6500          
             Prevalence : 0.7778          
         Detection Rate : 0.7071          
   Detection Prevalence : 0.7980          
      Balanced Accuracy : 0.7500          
                                          
       'Positive' Class : MSS             
                                          
confusionMatrix(predict(rf_model, test_data_ucec), test_data_ucec$status) # TCGA-UCEC (n = 44)
Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS  26  14
       MSI   2   2
                                          
               Accuracy : 0.6364          
                 95% CI : (0.4777, 0.7759)
    No Information Rate : 0.6364          
    P-Value [Acc > NIR] : 0.56766         
                                          
                  Kappa : 0.0638          
                                          
 Mcnemar's Test P-Value : 0.00596         
                                          
            Sensitivity : 0.9286          
            Specificity : 0.1250          
         Pos Pred Value : 0.6500          
         Neg Pred Value : 0.5000          
             Prevalence : 0.6364          
         Detection Rate : 0.5909          
   Detection Prevalence : 0.9091          
      Balanced Accuracy : 0.5268          
                                          
       'Positive' Class : MSS