RAVmodel <- getModel('C2', load=TRUE)

[1] "downloading"
## Samples with MSI status info
combined_data <- readRDS("data/combined_data.rds") # combined cancer samples from three TCGA datasets
target_attr <- "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"
combined_data_msi <- combined_data[,combined_data[[target_attr]] %in% c("msi-h", "mss")]
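Before building predictors, it can help to confirm how many MSS and MSI-H samples remain after the subsetting. A small check, added here only for illustration (not part of the original code):

## Class balance of the MSI status label after subsetting
table(colData(combined_data_msi)[[target_attr]])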
## Predictor RAVs
RAVs_combinedTCGA <- c(
  517, 220, 2109, 1303, 324, 438, 868, # RAVs with statistically significant pairwise Wilcoxon p-values for MSS vs. MSI-H
  834, 190, 1166, # RAVs with a significant Kruskal-Wallis (KW) test statistic (p-value < 0.05) for COAD
  2344, # RAV with a significant KW test statistic for STAD; the STAD set also includes RAVs 324, 868, and 517 above
  357)  # RAV with a borderline KW test result for UCEC (p-value = 0.056)
RAVmodel_sub <- RAVmodel[,RAVs_combinedTCGA]
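To see what biology each predictor RAV captures, the annotation accessors in GenomicSuperSignature can be queried. A quick look-up sketch, assuming the package's annotateRAV() interface; RAV 834 is used only as an example from the list above:

library(GenomicSuperSignature)

## Top enriched pathways associated with one of the predictor RAVs (illustrative)
annotateRAV(RAVmodel, ind = 834)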
## Calculate sample scores
sampleScore <- calculateScore(assay(combined_data_msi), RAVmodel_sub)

Next, build the data object with the predictors (i.e., sample scores from the 12 RAVs) and the response variable (i.e., MSI status).
data <- as.data.frame(sampleScore)
data$status <- colData(combined_data_msi)[[target_attr]]
data$status <- ifelse(data$status == "msi-h", "MSI", "MSS")
data$status <- factor(data$status, levels = c("MSS", "MSI")) # Convert outcome to factor

Randomly select 70% of the samples for training.
set.seed(1)
num_sample <- nrow(data)
index <- sample(seq_len(num_sample), round(num_sample*0.7))
train_data <- data[index, ]
test_data <- data[-index, ]

table(colData(combined_data)$admin.disease_code)
coad stad ucec 
 283  415  176 
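Note that the 70/30 split above is random with respect to MSI status. If preserving the MSS/MSI proportions in both sets is a concern, a stratified split is an alternative; a sketch with caret::createDataPartition() (an option, not what was run here):

library(caret)

set.seed(1)
train_idx <- createDataPartition(data$status, p = 0.7, list = FALSE) # stratified on MSI status
train_data_strat <- data[train_idx, ]
test_data_strat  <- data[-train_idx, ]
table(train_data_strat$status) # class proportions close to those of the full data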
disease_code <- colData(combined_data)[c("patientID", "admin.disease_code")]
## Split test datasets per cancer types
coad_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "coad") %>% pull(patientID)
test_data_coad <- test_data[rownames(test_data) %in% coad_id,]
stad_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "stad") %>% pull(patientID)
test_data_stad <- test_data[rownames(test_data) %in% stad_id,]
ucec_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "ucec") %>% pull(patientID)
test_data_ucec <- test_data[rownames(test_data) %in% ucec_id,]

ROSE

ROSE is used for handling class imbalance (alternatives include SMOTE from the DMwR package).
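The balancing step itself is left commented out below; a minimal runnable sketch of what it would look like, assuming the ROSE package is installed (the table() calls are only there to show the effect):

library(ROSE)

table(train_data$status) # imbalanced: many more MSS than MSI samples
set.seed(1)
balanced_train <- ROSE(status ~ ., data = train_data)$data
table(balanced_train$status) # roughly 50/50 after ROSE resampling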
# balanced_train <- ROSE(status ~ ., data = train_data)$data

Both logistic regression and random forest are implemented for comparison; the cross-validated logistic regression below is kept commented out, and the random forest is the model evaluated in the rest of this section.
# ## With cross-validation
# ctrl <- trainControl(method = "cv",
#                      number = 5,
#                      classProbs = TRUE,
#                      summaryFunction = twoClassSummary)
#
# ## Train logistic regression
# log_model <- train(status ~ .,
#                    data = balanced_train,
#                    method = "glm",
#                    family = "binomial",
#                    trControl = ctrl,
#                    metric = "ROC")

Feature importance is assessed through the random forest.
rf_model <- randomForest(status ~ .,
                         data = train_data,
                         ntree = 500,
                         classwt = c("MSS" = 1, "MSI" = 3))

Performance evaluation includes ROC curves and confusion matrices.
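The plot_roc_pROC() helper used below is defined elsewhere in this analysis; a minimal sketch of what such a wrapper could look like with the pROC package (the exact arguments and styling are assumptions):

library(pROC)

## Hypothetical stand-in for the plot_roc_pROC() helper: fit a ROC curve and plot it with a title
plot_roc_pROC <- function(labels, scores, title = "ROC") {
  roc_obj <- roc(labels, scores) # pROC reports the levels/direction it picks, as seen in the output below
  plot(roc_obj, col = "blue",
       main = paste0(title, ", AUC = ", round(auc(roc_obj), 3)))
  invisible(roc_obj)
}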
## Predictions on test set
rf_pred <- predict(rf_model, test_data, type = "prob")
## ROC curves
roc_rf <- roc(test_data$status, rf_pred[, "MSI"])

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
## Plot ROC curves
# plot(roc_rf, col = "blue")
plot_roc_pROC(test_data$status, rf_pred[, "MSI"],
              paste0("MSI Status ROC (n = ", nrow(test_data), ")")) # update the plot title

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
dat <- test_data_coad
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"],
              paste0("TCGA-COAD MSI Status ROC (n = ", nrow(dat), ")")) # update the plot title

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
roc_coad <- roc(dat$status, rf_pred[, "MSI"])

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
dat <- test_data_stad
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"],
              paste0("TCGA-STAD MSI Status ROC (n = ", nrow(dat), ")")) # update the plot title

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
roc_stad <- roc(dat$status, rf_pred[, "MSI"])

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
dat <- test_data_ucec
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"],
              paste0("TCGA-UCEC MSI Status ROC (n = ", nrow(dat), ")")) # update the plot title

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
roc_ucec <- roc(dat$status, rf_pred[, "MSI"])

Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
## Combined ROC curves
plot(roc_rf, col = "blue")
lines(roc_coad, col = "red")
lines(roc_stad, col = "orange")
# lines(roc_ucec, col = "yellow")
legend("bottomright", legend = c("All", "TCGA-COAD", "TCGA-STAD"),
col = c("blue", "red", "orange"), lwd = 2)importance(rf_model) MeanDecreaseGini
RAV517 35.372148
RAV220 8.684906
RAV2109 18.565805
RAV1303 9.636089
RAV324 13.026021
RAV438 11.328097
RAV868 12.726725
RAV834 42.848759
RAV190 9.745338
RAV1166 11.657381
RAV2344 13.221008
RAV357 8.587273
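For reporting, the same importance scores can be pulled into a sorted data frame; a small convenience step that is not part of the original code:

imp <- as.data.frame(importance(rf_model))
imp[order(imp$MeanDecreaseGini, decreasing = TRUE), , drop = FALSE] # RAV834 and RAV517 rank highest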
varImpPlot(rf_model)

confusionMatrix(predict(rf_model, test_data), test_data$status) # combined

Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS 155  25
       MSI  14  29
Accuracy : 0.8251
95% CI : (0.7688, 0.8726)
No Information Rate : 0.7578
P-Value [Acc > NIR] : 0.00985
Kappa : 0.488
Mcnemar's Test P-Value : 0.10931
Sensitivity : 0.9172
Specificity : 0.5370
Pos Pred Value : 0.8611
Neg Pred Value : 0.6744
Prevalence : 0.7578
Detection Rate : 0.6951
Detection Prevalence : 0.8072
Balanced Accuracy : 0.7271
'Positive' Class : MSS
confusionMatrix(predict(rf_model, test_data_coad), test_data_coad$status) # TCGA-COAD (n = 80)

Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS  59   2
       MSI   5  14
Accuracy : 0.9125
95% CI : (0.828, 0.9641)
No Information Rate : 0.8
P-Value [Acc > NIR] : 0.005272
Kappa : 0.7445
Mcnemar's Test P-Value : 0.449692
Sensitivity : 0.9219
Specificity : 0.8750
Pos Pred Value : 0.9672
Neg Pred Value : 0.7368
Prevalence : 0.8000
Detection Rate : 0.7375
Detection Prevalence : 0.7625
Balanced Accuracy : 0.8984
'Positive' Class : MSS
confusionMatrix(predict(rf_model, test_data_stad), test_data_stad$status) # TCGA-STAD (n = 99)

Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS  70   9
       MSI   7  13
Accuracy : 0.8384
95% CI : (0.7509, 0.9047)
No Information Rate : 0.7778
P-Value [Acc > NIR] : 0.08846
Kappa : 0.5168
Mcnemar's Test P-Value : 0.80259
Sensitivity : 0.9091
Specificity : 0.5909
Pos Pred Value : 0.8861
Neg Pred Value : 0.6500
Prevalence : 0.7778
Detection Rate : 0.7071
Detection Prevalence : 0.7980
Balanced Accuracy : 0.7500
'Positive' Class : MSS
confusionMatrix(predict(rf_model, test_data_ucec), test_data_ucec$status) # TCGA-UCEC (n = 44)

Confusion Matrix and Statistics

          Reference
Prediction MSS MSI
       MSS  26  14
       MSI   2   2
Accuracy : 0.6364
95% CI : (0.4777, 0.7759)
No Information Rate : 0.6364
P-Value [Acc > NIR] : 0.56766
Kappa : 0.0638
Mcnemar's Test P-Value : 0.00596
Sensitivity : 0.9286
Specificity : 0.1250
Pos Pred Value : 0.6500
Neg Pred Value : 0.5000
Prevalence : 0.6364
Detection Rate : 0.5909
Detection Prevalence : 0.9091
Balanced Accuracy : 0.5268
'Positive' Class : MSS
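To compare the four test sets at a glance, the confusion-matrix statistics and AUCs computed above can be gathered into one table; a sketch (the helper and its column choices are illustrative, reusing the objects already created):

## Illustrative wrap-up: one row of headline metrics per test set
summarize_split <- function(dat, roc_obj, label) {
  cm <- confusionMatrix(predict(rf_model, dat), dat$status)
  data.frame(dataset     = label,
             n           = nrow(dat),
             accuracy    = unname(cm$overall["Accuracy"]),
             sensitivity = unname(cm$byClass["Sensitivity"]), # with respect to the MSS ('positive') class, as above
             specificity = unname(cm$byClass["Specificity"]),
             auc         = as.numeric(auc(roc_obj)))
}

rbind(summarize_split(test_data,      roc_rf,   "Combined"),
      summarize_split(test_data_coad, roc_coad, "TCGA-COAD"),
      summarize_split(test_data_stad, roc_stad, "TCGA-STAD"),
      summarize_split(test_data_ucec, roc_ucec, "TCGA-UCEC"))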