## Check for data types in listDatavar_type <- meta_train@listDataunique(sapply(var_type, type))
[1] "character" "integer" "double" "logical"
## Separate training data's metadata into two subsets: ## character variables (~ categorical) and numeric variables (~ continuous)charcTb <- meta_train[, sapply(var_type, class) =='character']numTb <- meta_train[, sapply(var_type, class) %in%c('numeric', 'integer')]
Split Variable Types for Test Data
## Check for data types in listDatavar_type_testdata <- meta_test@listData## Separate test data's metadata into two subsets: ## character variables (~ categorical) and numeric variables (~ continuous)charcTb_testdata <- meta_test[, sapply(var_type_testdata, class) =='character']numTb_testdata <- meta_test[, sapply(var_type_testdata, class) %in%c('numeric', 'integer')]
## Training Datavalidate_data <-validate(data_train, RAVmodel)#heatmapTable(validate_data, RAVmodel)# validated_ind <- validatedSignatures(validate_data,# RAVmodel,# num.out = 4764, # We want to validate all RAVs so we can select which ones to include in the RF model# #scoreCutoff = 0.45,# indexOnly = TRUE)# saveRDS(validated_ind, file="/Users/bpheng/GSS-Britney-Fieldwork-2024/traindata_validatedRAVs.rds")validated_ind <-readRDS("/Users/bpheng/GSS-Britney-Fieldwork-2024/traindata_validatedRAVs.rds")#Already saved as an RDS file, called above## Subset sampleScoresampleScore_sub <- sampleScore[,validated_ind] %>%as.data.frame()## Test Datavalidate_testdata <-validate(data_test, RAVmodel)# heatmapTable(validate_testdata, RAVmodel, num.out = 15)# # validated_ind_testdata <- validatedSignatures(validate_testdata,# RAVmodel,# num.out = 4764,# #scoreCutoff = 0.45,# indexOnly = TRUE)# # saveRDS(validated_ind_testdata, file="/Users/bpheng/GSS-Britney-Fieldwork-2024/testdata_validatedRAVs.rds")validated_ind_testdata <-readRDS("/Users/bpheng/GSS-Britney-Fieldwork-2024/testdata_validatedRAVs.rds")## Subset sampleScoresampleScore_sub_testdata <- sampleScore_testdata[,validated_ind_testdata] %>%as.data.frame()
Clean up factor level name for microsatellite instability - high to “msi”
## Convert character variables into the factor data typefactorTb <- charcTbfactorTb[sapply(factorTb, is.character)] <-lapply(factorTb[sapply(factorTb, is.character)], factor)levels(factorTb@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)[levels(factorTb@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status) =="msi-h"] <-"msi"
## Convert character variables into the factor data typefactorTb_testdata <- charcTb_testdatafactorTb_testdata[sapply(factorTb_testdata, is.character)] <-lapply(factorTb_testdata[sapply(factorTb_testdata, is.character)], factor)levels(factorTb_testdata@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)[levels(factorTb_testdata@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status) =="msi-h"] <-"msi"
Use top validated RAVs for TCGA-COAD,STAD, UCEC
RAVs_combinedTCGA <-c(517, 220, 2109, 1303, 324, 438, 868, #RAVs that have statistically significant pairwise wilcoxon p-values of mss vs msi-h834, 190, 1166, #RAVs with significant KW test statistic (p-value < 0.05) for COAD2344, #significant KW test value for STAD, includes 324, 868, 517 above357) #UCEC KW test value (p-value = 0.056)validated_RAVs <-paste("RAV", RAVs_combinedTCGA, sep="")#validated_RAVs <- c("RAV357", "RAV27", "RAV834", "RAV190", "RAV1166", "RAV517",#"RAV2344", "RAV324", "RAV438", "RAV220", "RAV868", "RAV1008", "RAV625")target_attr <-"patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"labels <- factorTb[[target_attr]]nonNALabels <-which(!is.na(labels))data <- sampleScore_sub[,validated_RAVs]train_data <- data[nonNALabels,]train_labels <- labels[nonNALabels]train_data$MSI_Status <- train_labels
Call:
randomForest(formula = MSI_Status ~ ., data = train_data, proximity = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3
OOB estimate of error rate: 17.82%
Confusion matrix:
msi mss class.error
msi 63 62 0.49600000
mss 31 366 0.07808564
Confusion Matrix and Statistics
Reference
Prediction msi mss
msi 31 14
mss 23 155
Accuracy : 0.8341
95% CI : (0.7786, 0.8804)
No Information Rate : 0.7578
P-Value [Acc > NIR] : 0.003787
Kappa : 0.5208
Mcnemar's Test P-Value : 0.188445
Sensitivity : 0.5741
Specificity : 0.9172
Pos Pred Value : 0.6889
Neg Pred Value : 0.8708
Prevalence : 0.2422
Detection Rate : 0.1390
Detection Prevalence : 0.2018
Balanced Accuracy : 0.7456
'Positive' Class : msi
library("rpart")library("rpart.plot")#Fit new model with rparttrain_msi_model <-rpart(MSI_Status ~., data = train_data)test_msi_model <-rpart(MSI_Status ~., data = test_data)
rpart.plot(train_msi_model)
rpart.plot(test_msi_model)
Test RF model with individual TCGA data cancer types: COAD, UCEC, STAD