Deliverable 2: Random Forest Classification Model - Predicting Microsatellite Status

Author

Britney Pheng

Prepartion

Packages

knitr::opts_chunk$set(echo = TRUE)

suppressPackageStartupMessages({
  # BiocManager
  library(GenomicSuperSignature)
  library(curatedTCGAData)
  library(MultiAssayExperiment)
  library(TCGAutils)
  library(ComplexHeatmap)
  
  # CRAN
  library(tidyverse) # includes dplyr, ggplot2, magrittr, tidyr
  library(magick)
  library(wordcloud)
  library(ztable)
  library(metafolio)
  library(randomForest)
  library(caret)
})
Warning: package 'matrixStats' was built under R version 4.3.3
Warning: package 'GenomeInfoDb' was built under R version 4.3.3
Warning: package 'magick' was built under R version 4.3.3
Warning: package 'randomForest' was built under R version 4.3.3

Load RAVmodel

RAVmodel <- getModel('C2', load=TRUE)

Load combined_data (COAD, STAD, UCEC cancer sample data)

combined_data <- readRDS("data/combinedTCGAdata/combined_data.rds")

tcga_dat <- combined_data[,combined_data@colData@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status %in% c('msi-h', 'mss')]

combined_data_meta <- colData(tcga_dat)

Split combined_data for training

# Randomly select 70% of samples for training
set.seed(1)
num_sample <- ncol(tcga_dat)
train_sample_ind <- sample(seq_len(num_sample), round(num_sample*0.7))

meta_train <- combined_data_meta[train_sample_ind,] 
data_train <- tcga_dat[, train_sample_ind] # 20,501 genes x 612 samples x 3205 attributes
# Remove batch effect variables from metadata table
batch_var <- "analyte|portion|procurement|aliquot|uuid|barcode"
batch_ind <- grep(batch_var, colnames(meta_train))

meta_train <- meta_train[,-batch_ind] # 612 samples x 1064 metadata attributes
meta_test <- combined_data_meta[-train_sample_ind,-batch_ind]
data_test <- tcga_dat[, -train_sample_ind]

Split Variable Types for Train Data

## Check for data types in listData
var_type <- meta_train@listData
unique(sapply(var_type, type))
[1] "character" "integer"   "double"    "logical"  
## Separate training data's metadata into two subsets: 
## character variables (~ categorical) and numeric variables (~ continuous)
charcTb <- meta_train[, sapply(var_type, class) == 'character']
numTb <- meta_train[, sapply(var_type, class) %in% c('numeric', 'integer')]

Split Variable Types for Test Data

## Check for data types in listData
var_type_testdata <- meta_test@listData

## Separate test data's metadata into two subsets: 
## character variables (~ categorical) and numeric variables (~ continuous)
charcTb_testdata <- meta_test[, sapply(var_type_testdata, class) == 'character']
numTb_testdata <- meta_test[, sapply(var_type_testdata, class) %in% c('numeric', 'integer')]

Sample Scores

## Calculate validation scores
sampleScore <- calculateScore(data_train, RAVmodel)
rownames(sampleScore) <- gsub("\\.", "-", rownames(sampleScore))

## Test data: calculate validation scores
sampleScore_testdata <- calculateScore(data_test, RAVmodel)
rownames(sampleScore_testdata) <- gsub("\\.", "-", rownames(sampleScore_testdata))

Sample scores for all RAVs

## Training Data
validate_data <- validate(data_train, RAVmodel)
#heatmapTable(validate_data, RAVmodel)

# validated_ind <- validatedSignatures(validate_data,
#                                      RAVmodel,
#                                      num.out = 4764, # We want to validate all RAVs so we can select which ones to include in the RF model
#                                      #scoreCutoff = 0.45,
#                                      indexOnly = TRUE)
# saveRDS(validated_ind, file="/Users/bpheng/GSS-Britney-Fieldwork-2024/traindata_validatedRAVs.rds")
validated_ind <- readRDS("/Users/bpheng/GSS-Britney-Fieldwork-2024/traindata_validatedRAVs.rds")
#Already saved as an RDS file, called above

## Subset sampleScore
sampleScore_sub <- sampleScore[,validated_ind] %>% as.data.frame()

## Test Data
validate_testdata <- validate(data_test, RAVmodel)
# heatmapTable(validate_testdata, RAVmodel, num.out = 15)
# 
# validated_ind_testdata <- validatedSignatures(validate_testdata,
#                                      RAVmodel,
#                                      num.out = 4764,
#                                      #scoreCutoff = 0.45,
#                                      indexOnly = TRUE)
# 
# saveRDS(validated_ind_testdata, file="/Users/bpheng/GSS-Britney-Fieldwork-2024/testdata_validatedRAVs.rds")

validated_ind_testdata <- readRDS("/Users/bpheng/GSS-Britney-Fieldwork-2024/testdata_validatedRAVs.rds")

## Subset sampleScore
sampleScore_sub_testdata <- sampleScore_testdata[,validated_ind_testdata] %>% as.data.frame()

Clean up factor level name for microsatellite instability - high to “msi”

## Convert character variables into the factor data type
factorTb <- charcTb
factorTb[sapply(factorTb, is.character)] <- lapply(factorTb[sapply(factorTb, is.character)], factor)

levels(factorTb@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)[levels(factorTb@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status) == "msi-h"] <- "msi"
## Convert character variables into the factor data type
factorTb_testdata <- charcTb_testdata
factorTb_testdata[sapply(factorTb_testdata, is.character)] <- lapply(factorTb_testdata[sapply(factorTb_testdata, is.character)], factor)

levels(factorTb_testdata@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)[levels(factorTb_testdata@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status) == "msi-h"] <- "msi"

Use top validated RAVs for TCGA-COAD,STAD, UCEC

RAVs_combinedTCGA <- c(517, 220, 2109, 1303, 324, 438, 868, #RAVs that have statistically significant pairwise wilcoxon p-values of mss vs msi-h
                       834, 190, 1166, #RAVs with significant KW test statistic (p-value < 0.05) for COAD
                       2344, #significant KW test value for STAD, includes 324, 868, 517 above
                       357) #UCEC KW test value (p-value = 0.056)

validated_RAVs <- paste("RAV", RAVs_combinedTCGA, sep="")
#validated_RAVs <- c("RAV357", "RAV27", "RAV834", "RAV190", "RAV1166", "RAV517",
                    #"RAV2344", "RAV324", "RAV438", "RAV220", "RAV868", "RAV1008", "RAV625")
target_attr <- "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"

labels <- factorTb[[target_attr]]
nonNALabels <- which(!is.na(labels))
data <- sampleScore_sub[,validated_RAVs]

train_data <- data[nonNALabels,]
train_labels <- labels[nonNALabels]
train_data$MSI_Status <- train_labels

Random forest classification model

rf <- randomForest(MSI_Status~., data=train_data, proximity=TRUE)
print(rf)

Call:
 randomForest(formula = MSI_Status ~ ., data = train_data, proximity = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 17.82%
Confusion matrix:
    msi mss class.error
msi  63  62  0.49600000
mss  31 366  0.07808564
p1 <- predict(rf, train_data)
#confusionMatrix(p1, train_data$MSI_Status)

confusion_matrix <- table(p1, train_data$MSI_Status)
confusion_matrix
     
p1    msi mss
  msi 125   0
  mss   0 397
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(accuracy)
[1] 1
importance(rf)
        MeanDecreaseGini
RAV517         32.884866
RAV220         10.589733
RAV2109        16.042686
RAV1303        10.312380
RAV324         11.622534
RAV438         10.265993
RAV868         12.059423
RAV834         39.521919
RAV190         10.838033
RAV1166        11.921161
RAV2344        13.844759
RAV357          9.036766
varImpPlot(rf)

predictions1 <- predict(rf, newdata = train_data, predict.all = TRUE)

tree_predictions <- predictions1$individual  # Extract tree predictions

# Calculate accuracy for each tree (for classification)
tree_accuracy <- apply(tree_predictions, 2, function(tree_pred) {
  mean(tree_pred == train_data$MSI_Status)
})

# print(tree_accuracy)
# sort(tree_accuracy)

# Calculate weights proportional to accuracy
weights <- tree_accuracy / sum(tree_accuracy)

# Combine predictions using weighted average
weighted_predictions <- apply(tree_predictions, 1, function(preds) {
  weighted_pred <- tapply(weights, preds, sum)
  names(which.max(weighted_pred))
})

# Convert weighted predictions to factor
weighted_predictions <- as.factor(weighted_predictions)

confusion_matrix <- table(weighted_predictions, train_data$MSI_Status)
confusion_matrix
                    
weighted_predictions msi mss
                 msi 125   0
                 mss   0 397
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(accuracy)
[1] 1

Using Test Data with the RF Model

labels <- factorTb_testdata[[target_attr]]
nonNALabels <- which(!is.na(labels))
data <- sampleScore_sub_testdata[,validated_RAVs]

test_data <- data[nonNALabels,]
test_labels <- labels[nonNALabels]

test_data$MSI_Status <- test_labels
p2 <- predict(rf, test_data)
confusionMatrix(p2, test_data$MSI_Status)
Confusion Matrix and Statistics

          Reference
Prediction msi mss
       msi  31  14
       mss  23 155
                                          
               Accuracy : 0.8341          
                 95% CI : (0.7786, 0.8804)
    No Information Rate : 0.7578          
    P-Value [Acc > NIR] : 0.003787        
                                          
                  Kappa : 0.5208          
                                          
 Mcnemar's Test P-Value : 0.188445        
                                          
            Sensitivity : 0.5741          
            Specificity : 0.9172          
         Pos Pred Value : 0.6889          
         Neg Pred Value : 0.8708          
             Prevalence : 0.2422          
         Detection Rate : 0.1390          
   Detection Prevalence : 0.2018          
      Balanced Accuracy : 0.7456          
                                          
       'Positive' Class : msi             
                                          
library("rpart")
library("rpart.plot")

#Fit new model with rpart
train_msi_model <- rpart(MSI_Status ~., data = train_data)
test_msi_model <- rpart(MSI_Status ~., data = test_data)
rpart.plot(train_msi_model)

rpart.plot(test_msi_model)

Test RF model with individual TCGA data cancer types: COAD, UCEC, STAD

COAD

coad.prediction <- predict(rf, new_coad_data)
confusionMatrix(coad.prediction, new_coad_data$msi)
Confusion Matrix and Statistics

          Reference
Prediction msi mss
       msi  37   7
       mss  15 174
                                          
               Accuracy : 0.9056          
                 95% CI : (0.8605, 0.9399)
    No Information Rate : 0.7768          
    P-Value [Acc > NIR] : 2.01e-07        
                                          
                  Kappa : 0.7119          
                                          
 Mcnemar's Test P-Value : 0.1356          
                                          
            Sensitivity : 0.7115          
            Specificity : 0.9613          
         Pos Pred Value : 0.8409          
         Neg Pred Value : 0.9206          
             Prevalence : 0.2232          
         Detection Rate : 0.1588          
   Detection Prevalence : 0.1888          
      Balanced Accuracy : 0.8364          
                                          
       'Positive' Class : msi             
                                          

UCEC

ucec.prediction <- predict(rf, new_ucec_data)
confusionMatrix(ucec.prediction, new_ucec_data$msi)
Confusion Matrix and Statistics

          Reference
Prediction msi mss
       msi  15  10
       mss  32  99
                                         
               Accuracy : 0.7308         
                 95% CI : (0.654, 0.7986)
    No Information Rate : 0.6987         
    P-Value [Acc > NIR] : 0.217495       
                                         
                  Kappa : 0.2623         
                                         
 Mcnemar's Test P-Value : 0.001194       
                                         
            Sensitivity : 0.31915        
            Specificity : 0.90826        
         Pos Pred Value : 0.60000        
         Neg Pred Value : 0.75573        
             Prevalence : 0.30128        
         Detection Rate : 0.09615        
   Detection Prevalence : 0.16026        
      Balanced Accuracy : 0.61370        
                                         
       'Positive' Class : msi            
                                         

STAD

stad.prediction <- predict(rf, new_stad_data)
confusionMatrix(stad.prediction, new_stad_data$msi)
Confusion Matrix and Statistics

          Reference
Prediction msi mss
       msi  48  26
       mss  32 250
                                          
               Accuracy : 0.8371          
                 95% CI : (0.7945, 0.8739)
    No Information Rate : 0.7753          
    P-Value [Acc > NIR] : 0.002418        
                                          
                  Kappa : 0.5196          
                                          
 Mcnemar's Test P-Value : 0.511482        
                                          
            Sensitivity : 0.6000          
            Specificity : 0.9058          
         Pos Pred Value : 0.6486          
         Neg Pred Value : 0.8865          
             Prevalence : 0.2247          
         Detection Rate : 0.1348          
   Detection Prevalence : 0.2079          
      Balanced Accuracy : 0.7529          
                                          
       'Positive' Class : msi             
                                          

Test with curatedCRCData

library(curatedCRCData)
data(package="curatedCRCData")

curatedCRC Dataset 1: GSE13067_eset

data(GSE13067_eset)

library(SummarizedExperiment)

mySummarizedExperiment <- makeSummarizedExperimentFromExpressionSet(GSE13067_eset)
assay(mySummarizedExperiment) <- log2(assay(mySummarizedExperiment) + 1)

#mySummarizedExperiment@colData$msi
mySummarizedExperiment@colData@listData[["msi"]] <- gsub("MSS", "mss", mySummarizedExperiment@colData@listData[["msi"]] )
mySummarizedExperiment@colData@listData[["msi"]] <- gsub("MSI", "msi", mySummarizedExperiment@colData@listData[["msi"]] )

GSE13067.meta <- colData(mySummarizedExperiment)
GSE13067.sampleScore <- calculateScore(mySummarizedExperiment, RAVmodel) %>% as.data.frame()
GSE13067.prediction <- predict(rf, new_data)
confusionMatrix(GSE13067.prediction, new_data$msi)
Confusion Matrix and Statistics

          Reference
Prediction msi mss
       msi   8   0
       mss   3  63
                                          
               Accuracy : 0.9595          
                 95% CI : (0.8861, 0.9916)
    No Information Rate : 0.8514          
    P-Value [Acc > NIR] : 0.00297         
                                          
                  Kappa : 0.8195          
                                          
 Mcnemar's Test P-Value : 0.24821         
                                          
            Sensitivity : 0.7273          
            Specificity : 1.0000          
         Pos Pred Value : 1.0000          
         Neg Pred Value : 0.9545          
             Prevalence : 0.1486          
         Detection Rate : 0.1081          
   Detection Prevalence : 0.1081          
      Balanced Accuracy : 0.8636          
                                          
       'Positive' Class : msi             
                                          

curatedCRC Dataset 2: GSE13294_eset

data(GSE13294_eset)
GSE13294_eset <- makeSummarizedExperimentFromExpressionSet(GSE13294_eset)
assay(GSE13294_eset) <- log2(assay(GSE13294_eset) + 1)
Warning: NaNs produced
GSE13294_eset@colData@listData[["msi"]] <- gsub("MSS", "mss", GSE13294_eset@colData@listData[["msi"]] )
GSE13294_eset@colData@listData[["msi"]] <- gsub("MSI", "msi", GSE13294_eset@colData@listData[["msi"]] )

GSE13294.meta <- colData(GSE13294_eset)
GSE13294.sampleScore <- calculateScore(GSE13294_eset, RAVmodel) %>% as.data.frame()
GSE13294.prediction <- predict(rf, new_data)
confusionMatrix(GSE13294.prediction, new_data$msi)
Confusion Matrix and Statistics

          Reference
Prediction msi mss
       msi  25   1
       mss  53  76
                                         
               Accuracy : 0.6516         
                 95% CI : (0.571, 0.7263)
    No Information Rate : 0.5032         
    P-Value [Acc > NIR] : 0.0001358      
                                         
                  Kappa : 0.3062         
                                         
 Mcnemar's Test P-Value : 3.915e-12      
                                         
            Sensitivity : 0.3205         
            Specificity : 0.9870         
         Pos Pred Value : 0.9615         
         Neg Pred Value : 0.5891         
             Prevalence : 0.5032         
         Detection Rate : 0.1613         
   Detection Prevalence : 0.1677         
      Balanced Accuracy : 0.6538         
                                         
       'Positive' Class : msi