MSI Status Prediction RF Model

Author

Sehyun Oh

Published

January 10, 2025

Abstract

Using the TCGA-COAD cancer samples for training

RAVmodel <- getModel('C2', load=TRUE)

[1] "downloading"

1. Data Preprocessing

## Samples with MSI status info
combined_data <- readRDS("data/combined_data.rds") # combined cancer samples from three TCGA datasets
target_attr <- "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"
combined_data_msi <- combined_data[,combined_data[[target_attr]] %in% c("msi-h", "mss")]

## Predictor RAVs
RAVs_combinedTCGA <- c(
    517, 220, 2109, 1303, 324, 438, 868, # RAVs that have statistically significant pairwise wilcoxon p-values of mss vs msi-h
    834, 190, 1166, # RAVs with significant KW test statistic (p-value < 0.05) for COAD
    2344, # significant KW test value for STAD, includes 324, 868, 517 above
    357) # UCEC KW test value (p-value = 0.056)
RAVmodel_sub <- RAVmodel[,RAVs_combinedTCGA]

## Calculate sample scores
sampleScore <- calculateScore(assay(combined_data_msi), RAVmodel_sub)

The data object with the predictors (i.e., sample scores from 12 RAVs) and response (i.e., MSI status) variable.

data <- as.data.frame(sampleScore)
data$status <- colData(combined_data_msi)[[target_attr]]
data$status <- ifelse(data$status == "msi-h", "MSI", "MSS")
data$status <- factor(data$status, levels = c("MSS", "MSI")) # Convert outcome to factor

Split data into training and testing sets

table(colData(combined_data)$admin.disease_code)


coad stad ucec 
 283  415  176

disease_code <- colData(combined_data)[c("patientID", "admin.disease_code")]

## Train dataset: TCGA-COAD
coad_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "coad") %>% pull(patientID)
train_coad <- data[rownames(data) %in% coad_id,]

## Test datasets
stad_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "stad") %>% pull(patientID)
test_stad <- data[rownames(data) %in% stad_id,]
ucec_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "ucec") %>% pull(patientID)
test_ucec <- data[rownames(data) %in% ucec_id,]

2. Handle class imbalance using `ROSE`

ROSE is used for handling class imbalance (alternatives include SMOTE from the DMwR package)

# balanced_train <- ROSE(status ~ ., data = train_coad)$data

3. Basic Logistic Regression

Both logistic regression and random forest are implemented for comparison

# ## With cross-validation
# ctrl <- trainControl(method = "cv", 
#                      number = 5,
#                      classProbs = TRUE,
#                      summaryFunction = twoClassSummary)
# 
# ## Train logistic regression
# log_model <- train(status ~ .,
#                    data = balanced_train,
#                    method = "glm",
#                    family = "binomial",
#                    trControl = ctrl,
#                    metric = "ROC")

4. Random Forest with class weights

Feature importance is assessed through random forest

rf_model <- randomForest(status ~ .,
                         data = train_coad,
                         ntree = 500,
                         classwt = c("MSS" = 1, "MSI" = 3))

5. Model Evaluation

Performance evaluation includes ROC curves and confusion matrices

TCGA-STAD test data

dat <- test_stad
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"], 
              paste0("TCGA-STAD MSI Status ROC (n = ", nrow(dat),")")) # update the plot title

Setting levels: control = MSS, case = MSI

Setting direction: controls < cases

roc_stad <- roc(dat$status, rf_pred[, "MSI"])

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases

TCGA-UCEC test data

dat <- test_ucec
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"], 
              paste0("TCGA-UCEC MSI Status ROC (n = ", nrow(dat),")")) # update the plot title

Setting levels: control = MSS, case = MSI

Setting direction: controls < cases

roc_ucec <- roc(dat$status, rf_pred[, "MSI"])

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases

## Combined ROC curves
# plot(roc_rf, col = "blue")
# lines(roc_coad, col = "red")
plot(roc_stad, col = "orange")
lines(roc_ucec, col = "yellow")

legend("bottomright", legend = c("TCGA-STAD", "TCGA-UCEC"),
       col = c("orange", "yellow"), lwd = 2)

6. Feature Importance (for Random Forest)

importance(rf_model)

        MeanDecreaseGini
RAV517         16.517806
RAV220          3.324617
RAV2109         2.865768
RAV1303         4.550412
RAV324          3.570059
RAV438          2.955644
RAV868          2.790610
RAV834         31.745404
RAV190          3.415508
RAV1166         8.949654
RAV2344         3.510100
RAV357          2.797849

varImpPlot(rf_model)

7. Confusion Matrix

confusionMatrix(predict(rf_model, test_stad), test_stad$status) # TCGA-STAD (n = 99)

Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS 250  55
       MSI  26  25
                                         
               Accuracy : 0.7725         
                 95% CI : (0.7253, 0.815)
    No Information Rate : 0.7753         
    P-Value [Acc > NIR] : 0.579905       
                                         
                  Kappa : 0.2505         
                                         
 Mcnemar's Test P-Value : 0.001864       
                                         
            Sensitivity : 0.9058         
            Specificity : 0.3125         
         Pos Pred Value : 0.8197         
         Neg Pred Value : 0.4902         
             Prevalence : 0.7753         
         Detection Rate : 0.7022         
   Detection Prevalence : 0.8567         
      Balanced Accuracy : 0.6091         
                                         
       'Positive' Class : MSS

confusionMatrix(predict(rf_model, test_ucec), test_ucec$status) # TCGA-UCEC (n = 44)

Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS 101  43
       MSI   8   4
                                          
               Accuracy : 0.6731          
                 95% CI : (0.5935, 0.7459)
    No Information Rate : 0.6987          
    P-Value [Acc > NIR] : 0.7851          
                                          
                  Kappa : 0.0149          
                                          
 Mcnemar's Test P-Value : 1.927e-06       
                                          
            Sensitivity : 0.92661         
            Specificity : 0.08511         
         Pos Pred Value : 0.70139         
         Neg Pred Value : 0.33333         
             Prevalence : 0.69872         
         Detection Rate : 0.64744         
   Detection Prevalence : 0.92308         
      Balanced Accuracy : 0.50586         
                                          
       'Positive' Class : MSS