Load RAVmodel

RAVmodel <- getModel('C2', load=TRUE)
## [1] "downloading"
coad <- curatedTCGAData(diseaseCode = 'COAD',
                        assays = 'RNA*',
                        version = '2.0.1',
                        dry.run = FALSE)

sampleTables(coad)
## $`COAD_RNASeq2Gene-20160128`
## 
##  01  02  06  11 
## 283   1   1  41 
## 
## $`COAD_RNASeq2GeneNorm_illuminaga-20160128`
## 
##  01 
## 191 
## 
## $`COAD_RNASeq2GeneNorm_illuminahiseq-20160128`
## 
##  01  02  06  11 
## 283   1   1  41 
## 
## $`COAD_RNASeqGene-20160128`
## 
## 01 
## 10
coad2 <- TCGAsplitAssays(coad, c("01", "11"))
## Warning: Some 'sampleCodes' not found in assays
coad_rna_cancer <- getWithColData(coad2,
                           '01_COAD_RNASeq2Gene-20160128')
## Warning: 'experiments' dropped; see 'drops()'
assay(coad_rna_cancer) <- log2(assay(coad_rna_cancer) + 1)

coad_rna_cancer <- coad_rna_cancer[,coad_rna_cancer@colData@listData$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status %in% c('msi-h', 'mss')]

table(coad_rna_cancer$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)
## 
## msi-h   mss 
##    52   181
table(coad_rna_cancer$MSI_status)
## 
## MSI-H   MSS 
##     6    15
table(coad_rna_cancer$patient.microsatellite_instability)
## 
##  no yes 
##  60   8
# MSI_status and patient.microsatellite_instability_test_results have the same values; ignore MSI_status
# coad_msi <- coad_rna_cancer %>%
#   as.data.frame() %>%
#   dplyr::select(patientID, patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status, patient.microsatellite_instability)
# head(coad_msi)

which(coad_rna_cancer$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status == "msi-h" &
           coad_rna_cancer$patient.microsatellite_instability == "no")
## [1]  64 160 161
coad_rna_cancer <- coad_rna_cancer[-c(64,160,161)]

#which(coad_rna_cancer$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status == "msi-l" &
#coad_rna_cancer$patient.microsatellite_instability == "no")
#coad_rna_cancer <- coad_rna_cancer[-c(83,86,87,120,146,197,198,200,205,218,239,253,266)]

coad_meta_cancer <- colData(coad_rna_cancer)
validate_coad_cancer <- validate(coad_rna_cancer, RAVmodel)
heatmapTable(validate_coad_cancer, RAVmodel, num.out=15)
## RAV61 can be filtered based on GSEA_C2
## RAV438 can be filtered based on GSEA_C2

# Select columns with >10% completeness
non_sparse_ind <- which(colSums(!is.na(coad_meta_cancer)) > round(nrow(coad_meta_cancer)/10))
meta_non_sparse <- coad_meta_cancer[non_sparse_ind] %>% subset (select = -patientID) # remove `patientID`
# Randomly select 80% of samples for training
set.seed(1)
num_sample <- ncol(coad_rna_cancer)
train_sample_ind <- sample(seq_len(num_sample), round(num_sample*0.8))

meta_train <- meta_non_sparse[train_sample_ind,] # 226 samples x 702 metadata attributes
coad_train <- coad_rna_cancer[, train_sample_ind] # 20,501 genes x 226 samples

Remove batch effect variables from metadata table

batch_var <- "analyte|portion|procurement|aliquot|uuid|barcode"
batch_ind <- grep(batch_var, colnames(meta_train))
meta_train <- meta_train[,-batch_ind] # 226 samples x 245 metadata attributes
meta_test <- meta_non_sparse[-train_sample_ind,] # 226 samples x 702 metadata attributes
coad_test <- coad_rna_cancer[, -train_sample_ind] # 20,501 genes x 57 samples

Based on variable types

## Check for data types in listData
var_type <- meta_train@listData
unique(sapply(var_type, type))
## [1] "integer"   "character" "double"
## Separate training data's metadata into two subsets: 
## character variables (~ categorical) and numeric variables (~ continuous)
charcTb <- meta_train[, sapply(var_type, class) == 'character'] # 261 samples x 182 metadata attributes (char)
numTb <- meta_train[, sapply(var_type, class) %in% c('numeric', 'integer')] # 261 samples x 108 metadata attributes (num)

Based on variable types; Test Data

## Check for data types in listData
var_type_2 <- meta_test@listData
unique(sapply(var_type_2, type))
## [1] "integer"   "character" "double"
## Separate training data's metadata into two subsets: 
## character variables (~ categorical) and numeric variables (~ continuous)
charcTb_2 <- meta_test[, sapply(var_type_2, class) == 'character'] # 261 samples x 182 metadata attributes (char)
numTb_2 <- meta_test[, sapply(var_type_2, class) %in% c('numeric', 'integer')] # 261 samples x 108 metadata attributes (num)

Sample Score

## Calculate validation scores
sampleScore <- calculateScore(coad_train, RAVmodel)
rownames(sampleScore) <- gsub("\\.", "-", rownames(sampleScore))

## Test data: calculate validation scores
sampleScore_2 <- calculateScore(coad_test, RAVmodel)
rownames(sampleScore_2) <- gsub("\\.", "-", rownames(sampleScore_2))

Sample scores only for the validated RAVs

## Training
validate_coad <- validate(coad_train, RAVmodel)
heatmapTable(validate_coad, RAVmodel)

validated_ind <- validatedSignatures(validate_coad, 
                                     RAVmodel, 
                                     num.out = 4764, 
                                     #scoreCutoff = 0.45, 
                                     indexOnly = TRUE)

## Subset sampleScore
sampleScore_sub <- sampleScore[,validated_ind] %>% as.data.frame()

## Test
validate_coad_2 <- validate(coad_test, RAVmodel)
heatmapTable(validate_coad_2, RAVmodel, num.out = 15)

validated_ind_2 <- validatedSignatures(validate_coad_2, 
                                     RAVmodel, 
                                     num.out = 4764, 
                                     #scoreCutoff = 0.45, 
                                     indexOnly = TRUE)
## Subset sampleScore
sampleScore_sub_2 <- sampleScore_2[,validated_ind_2] %>% as.data.frame()
## SummarizedExperiment object containing COAD train dataset 
#saveRDS(coad_train, "data/tcga_coad_cancer_train.rds") # 20,501 genes x 226 samples

## Sample scores for the train set (80% of the randomly selected COAD samples)
## Only the top 15 validated RAVs
#write.csv(sampleScore_sub, "data/sampleScore_train_cancer.csv") # 226samples x 15 RAVs

## Training set's metadata: character and numeric variables
#write.csv(charcTb, "data/meta_train_char_cancer.csv", row.names = TRUE)
#write.csv(numTb, "data/meta_train_num_cancer.csv", row.names = TRUE)
## Convert character variables into the factor data type
factorTb <- charcTb
factorTb[sapply(factorTb, is.character)] <- lapply(factorTb[sapply(factorTb, is.character)], factor)
## Convert character variables into the factor data type
factorTb_2 <- charcTb_2
factorTb_2[sapply(factorTb_2, is.character)] <- lapply(factorTb_2[sapply(factorTb_2, is.character)], factor)

Prediction models

Use top 15 validated RAVs for TCGA-COAD

validated_RAVs <- c("RAV188", "RAV1575", "RAV324", "RAV61", "RAV1008", 
                    "RAV833", "RAV190", "RAV868", "RAV64", "RAV220", 
                    "RAV834", "RAV438", "RAV192", "RAV1166", "RAV579")
target_attr <- "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"

labels <- factorTb[[target_attr]]
nonNALabels <- which(!is.na(labels))
data <- sampleScore_sub[,validated_RAVs]

train_data <- data[nonNALabels,]
train_labels <- labels[nonNALabels]
train_data$MSI_Status <- train_labels

Random Forest classification model

rf <- randomForest(MSI_Status~., data=train_data, proximity=TRUE)
print(rf)
## 
## Call:
##  randomForest(formula = MSI_Status ~ ., data = train_data, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 10.75%
## Confusion matrix:
##       msi-h mss class.error
## msi-h    24  14  0.36842105
## mss       6 142  0.04054054
p1 <- predict(rf, train_data)
confusionMatrix(p1, train_data$MSI_Status)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction msi-h mss
##      msi-h    38   0
##      mss       0 148
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9804, 1)
##     No Information Rate : 0.7957     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.2043     
##          Detection Rate : 0.2043     
##    Detection Prevalence : 0.2043     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : msi-h      
## 

Test Data

validated_RAVs_2 <- c("RAV61", "RAV64", "RAV188", "RAV190", "RAV220", 
                    "RAV324", "RAV438", "RAV579", "RAV832", "RAV833", 
                    "RAV834", "RAV868", "RAV1008", "RAV1575", "RAV2528")
target_attr <- "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"

labels <- factorTb_2[[target_attr]]
nonNALabels <- which(!is.na(labels))
data <- sampleScore_sub_2[,validated_RAVs] #Did not choose RAVs that were validated for the test data, used the RAVs validated for the testing data. We will need to then work on selecting which RAVs are actually validated for COAD/MSI

test_data <- data[nonNALabels,]
test_labels <- labels[nonNALabels]

test_data$MSI_Status <- test_labels
p2 <- predict(rf, test_data)
confusionMatrix(p2, test_data$MSI_Status)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction msi-h mss
##      msi-h     8   0
##      mss       6  33
##                                           
##                Accuracy : 0.8723          
##                  95% CI : (0.7426, 0.9517)
##     No Information Rate : 0.7021          
##     P-Value [Acc > NIR] : 0.005498        
##                                           
##                   Kappa : 0.6519          
##                                           
##  Mcnemar's Test P-Value : 0.041227        
##                                           
##             Sensitivity : 0.5714          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.8462          
##              Prevalence : 0.2979          
##          Detection Rate : 0.1702          
##    Detection Prevalence : 0.1702          
##       Balanced Accuracy : 0.7857          
##                                           
##        'Positive' Class : msi-h           
## 
# devtools::install_github("araastat/reprtree")
# library("reprtree")
#reprtree:::plot.getTree(rf, k=2, depth=4)
#reprtree:::plot.reprtree( ReprTree(rf, train_data, metric='d2'))

Decision Tree

#library("rpart")
library("rpart.plot")
## Loading required package: rpart
#fit new model with rpart
train_msi_model <- rpart(MSI_Status ~., data = train_data)

# plot the tree
rpart.plot(train_msi_model)

#BiocManager::install('seandavi/GEOquery')
#library(GEOquery)