RAVmodel <- getModel('C2', load=TRUE)[1] "downloading"
RAVmodel <- getModel('C2', load=TRUE)[1] "downloading"
## Samples with MSI status info
combined_data <- readRDS("data/combined_data.rds") # combined cancer samples from three TCGA datasets
target_attr <- "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"
combined_data_msi <- combined_data[,combined_data[[target_attr]] %in% c("msi-h", "mss")]
## Predictor RAVs
RAVs_combinedTCGA <- c(
517, 220, 2109, 1303, 324, 438, 868, # RAVs that have statistically significant pairwise wilcoxon p-values of mss vs msi-h
834, 190, 1166, # RAVs with significant KW test statistic (p-value < 0.05) for COAD
2344, # significant KW test value for STAD, includes 324, 868, 517 above
357) # UCEC KW test value (p-value = 0.056)
RAVmodel_sub <- RAVmodel[,RAVs_combinedTCGA]
## Calculate sample scores
sampleScore <- calculateScore(assay(combined_data_msi), RAVmodel_sub)The data object with the predictors (i.e., sample scores from 12 RAVs) and response (i.e., MSI status) variable.
data <- as.data.frame(sampleScore)
data$status <- colData(combined_data_msi)[[target_attr]]
data$status <- ifelse(data$status == "msi-h", "MSI", "MSS")
data$status <- factor(data$status, levels = c("MSS", "MSI")) # Convert outcome to factortable(colData(combined_data)$admin.disease_code)
coad stad ucec
283 415 176
disease_code <- colData(combined_data)[c("patientID", "admin.disease_code")]
## Train dataset: TCGA-COAD
coad_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "coad") %>% pull(patientID)
train_coad <- data[rownames(data) %in% coad_id,]
## Test datasets
stad_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "stad") %>% pull(patientID)
test_stad <- data[rownames(data) %in% stad_id,]
ucec_id <- disease_code %>% as.data.frame %>% dplyr::filter(admin.disease_code == "ucec") %>% pull(patientID)
test_ucec <- data[rownames(data) %in% ucec_id,]ROSEROSE is used for handling class imbalance (alternatives include SMOTE from the DMwR package)
# balanced_train <- ROSE(status ~ ., data = train_coad)$dataBoth logistic regression and random forest are implemented for comparison
# ## With cross-validation
# ctrl <- trainControl(method = "cv",
# number = 5,
# classProbs = TRUE,
# summaryFunction = twoClassSummary)
#
# ## Train logistic regression
# log_model <- train(status ~ .,
# data = balanced_train,
# method = "glm",
# family = "binomial",
# trControl = ctrl,
# metric = "ROC")Feature importance is assessed through random forest
rf_model <- randomForest(status ~ .,
data = train_coad,
ntree = 500,
classwt = c("MSS" = 1, "MSI" = 3))Performance evaluation includes ROC curves and confusion matrices
dat <- test_stad
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"],
paste0("TCGA-STAD MSI Status ROC (n = ", nrow(dat),")")) # update the plot titleSetting levels: control = MSS, case = MSI
Setting direction: controls < cases
roc_stad <- roc(dat$status, rf_pred[, "MSI"])Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
dat <- test_ucec
rf_pred <- predict(rf_model, dat, type = "prob")
plot_roc_pROC(dat$status, rf_pred[, "MSI"],
paste0("TCGA-UCEC MSI Status ROC (n = ", nrow(dat),")")) # update the plot titleSetting levels: control = MSS, case = MSI
Setting direction: controls < cases
roc_ucec <- roc(dat$status, rf_pred[, "MSI"])Setting levels: control = MSS, case = MSI
Setting direction: controls < cases
## Combined ROC curves
# plot(roc_rf, col = "blue")
# lines(roc_coad, col = "red")
plot(roc_stad, col = "orange")
lines(roc_ucec, col = "yellow")
legend("bottomright", legend = c("TCGA-STAD", "TCGA-UCEC"),
col = c("orange", "yellow"), lwd = 2)importance(rf_model) MeanDecreaseGini
RAV517 16.517806
RAV220 3.324617
RAV2109 2.865768
RAV1303 4.550412
RAV324 3.570059
RAV438 2.955644
RAV868 2.790610
RAV834 31.745404
RAV190 3.415508
RAV1166 8.949654
RAV2344 3.510100
RAV357 2.797849
varImpPlot(rf_model)confusionMatrix(predict(rf_model, test_stad), test_stad$status) # TCGA-STAD (n = 99)Confusion Matrix and Statistics
Reference
Prediction MSS MSI
MSS 250 55
MSI 26 25
Accuracy : 0.7725
95% CI : (0.7253, 0.815)
No Information Rate : 0.7753
P-Value [Acc > NIR] : 0.579905
Kappa : 0.2505
Mcnemar's Test P-Value : 0.001864
Sensitivity : 0.9058
Specificity : 0.3125
Pos Pred Value : 0.8197
Neg Pred Value : 0.4902
Prevalence : 0.7753
Detection Rate : 0.7022
Detection Prevalence : 0.8567
Balanced Accuracy : 0.6091
'Positive' Class : MSS
confusionMatrix(predict(rf_model, test_ucec), test_ucec$status) # TCGA-UCEC (n = 44)Confusion Matrix and Statistics
Reference
Prediction MSS MSI
MSS 101 43
MSI 8 4
Accuracy : 0.6731
95% CI : (0.5935, 0.7459)
No Information Rate : 0.6987
P-Value [Acc > NIR] : 0.7851
Kappa : 0.0149
Mcnemar's Test P-Value : 1.927e-06
Sensitivity : 0.92661
Specificity : 0.08511
Pos Pred Value : 0.70139
Neg Pred Value : 0.33333
Prevalence : 0.69872
Detection Rate : 0.64744
Detection Prevalence : 0.92308
Balanced Accuracy : 0.50586
'Positive' Class : MSS