Initial Setup

Load packages

suppressPackageStartupMessages({
  # BiocManager
  library(GenomicSuperSignature)
  library(curatedTCGAData)
  library(MultiAssayExperiment)
  library(TCGAutils)
  library(ComplexHeatmap)
  
  # CRAN
  library(tidyverse) # includes dplyr, ggplot2, magrittr, tidyr
  library(magick)
  library(wordcloud)
  library(ztable)
  library(metafolio)
})

Load RAVmodel

RAVmodel <- getModel('C2', load=TRUE)

## [1] "downloading"

Select COAD RNA metadata

coad <- curatedTCGAData(diseaseCode = 'COAD',
                        assays = 'RNA*',
                        version = '2.0.1',
                        dry.run = FALSE)

coad_rna <- getWithColData(coad,
                            'COAD_RNASeq2Gene-20160128')

assay(coad_rna) <- log2(assay(coad_rna) + 1)

table(coad$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

## 
## indeterminate         msi-h         msi-l           mss 
##             2            83            82           288

## Parse out cancer vs normal samples
sampleTables(coad)

## $`COAD_RNASeq2Gene-20160128`
## 
##  01  02  06  11 
## 283   1   1  41 
## 
## $`COAD_RNASeq2GeneNorm_illuminaga-20160128`
## 
##  01 
## 191 
## 
## $`COAD_RNASeq2GeneNorm_illuminahiseq-20160128`
## 
##  01  02  06  11 
## 283   1   1  41 
## 
## $`COAD_RNASeqGene-20160128`
## 
## 01 
## 10

coad2 <- TCGAsplitAssays(coad, c("01", "11"))

coad_rna_cancer <- getWithColData(coad2,
                           '01_COAD_RNASeq2Gene-20160128')
assay(coad_rna_cancer) <- log2(assay(coad_rna_cancer) + 1)
coad_cancer <- colData(coad_rna_cancer)

coad_rna_normal <- getWithColData(coad2,
                           '11_COAD_RNASeq2Gene-20160128')
assay(coad_rna_normal) <- log2(assay(coad_rna_normal) + 1)
coad_normal <- colData(coad_rna_normal)

table(coad_cancer$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

## 
## indeterminate         msi-h         msi-l           mss 
##             2            52            48           181

table(coad_normal$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

## 
## msi-h msi-l   mss 
##     9     7    25

normal_tb <- coad_normal %>%
  as.data.frame() %>%
  dplyr::select(patientID, patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

cancer_tb <- coad_cancer %>%
  as.data.frame() %>%
  dplyr::select(patientID, patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

combined_tb <- inner_join(normal_tb, cancer_tb, by="patientID") # All paired cancer and normal samples have the same MSI status

heatmapTable: COAD

validate_coad_rna <- validate(coad_rna, RAVmodel)
validate_coad_normal <- validate(coad_rna_normal, RAVmodel)
validate_coad_cancer <- validate(coad_rna_cancer, RAVmodel)

heatmapTable(validate_coad_rna, RAVmodel, num.out = 10)
heatmapTable(validate_coad_normal, RAVmodel, num.out = 10, column_title="COAD Normal Samples")

## RAV1187 can be filtered based on GSEA_C2

heatmapTable(validate_coad_cancer, RAVmodel, num.out=10, column_title="COAD Cancer Samples")

## RAV61 can be filtered based on GSEA_C2

Subset

Filter attributes

# Cancer
sparsity_summary <- table(colSums(is.na(coad_cancer)))
sparsity_summary

## 
##   0   1   2   3   4   5   6   7   8   9  10  11  13  14  15  17  18  19  20  23 
##  80  75  20  73   2  17   2   2   1  24   3   1  25   2   1   1   1   3   1   4 
##  25  30  32  33  35  36  37  38  40  41  43  44  45  48  50  54  56  57  58  59 
##   1   1   1   2  16   2   2   2  13   1   1   9   1   1   1   1  13   2   1   3 
##  63  64  65  66  67  68  69  72  80  81  95  96  97  98 100 105 106 112 139 148 
##   8   3   5   2   3   5   2   2   3   1   5   2   1   1   1   1   1   1   1   1 
## 149 154 176 177 178 179 183 184 188 189 193 198 199 204 205 206 208 209 210 211 
##   1   2   5   8   2   1   1   2   1   1   3   1   1   2   9   2   1   2   5   9 
## 212 213 214 215 217 218 220 223 224 225 226 227 228 230 231 232 233 234 235 236 
##   2   1   2   1   3   1   1   5   1   4   2   1   1   2  15   1   3   1  13   4 
## 237 238 239 240 241 242 244 246 247 248 249 250 251 252 253 254 255 256 257 258 
##   1   1   5  14   6   3   2   7   3   2   3   1   1  13   2  55   5  61  18  30 
## 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 
##  53   4  11   6   3  10  10  16   1  13   2  41   5  43  34  44  17   8  54 103 
## 279 280 281 282 283 
## 280 287 115 287 351

Sparsity Plot

# Cancer
plot(stack(sparsity_summary)$ind,
     stack(sparsity_summary)$values)

# Select columns with >10% completeness
# Cancer
keep_attribute_ind <- which(colSums(!is.na(coad_cancer)) > round(nrow(coad_cancer)/10))
meta_sub1 <- coad_cancer[keep_attribute_ind]
meta_sub1 <- subset(coad_cancer, select= -patientID)

# Randomly select for 140 rows
set.seed(1)
random_sample_ind <- sample(1:nrow(meta_sub1), 140)
meta_sub2 <- meta_sub1[random_sample_ind,]

# Check for data types in listData
unique(sapply(coad_cancer@listData, type))

## [1] "character" "integer"   "double"

charcTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
numTb <- meta_sub2[, sapply(meta_sub2, class) %in% c('numeric', 'integer')]

# View numeric variables that have <=4 unique values to determine if they should be converted to character variables
addToFactors <- c()
for (i in 1:length(numTb)) {
  if (length(table(numTb[i])) <= 4) {
    addToFactors <- c(addToFactors, i)
  }
}
#numTb[addToFactors] #None at this time
#charcTb <- c(charcTb, numTb[addToFactors])
#numTb <- numTb[-addToFactors]

# Calculate validation scores
sampleScore <- calculateScore(coad_rna_cancer, RAVmodel)

validated_ind <- validatedSignatures(validate_coad_cancer, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE) #Using Pearson Coefficient

## RAV61 can be filtered based on GSEA_C2

## RAV438 can be filtered based on GSEA_C2

## Subset sampleScore to join with MCPcounter
sampleScore_sub <- sampleScore[random_sample_ind, validated_ind] %>% as.data.frame()

Separate out Factor Variables by number of levels (1, 2, 3+) not including NA

# Convert to factor data type
factorTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']

#factorTb <- charcTb
factorTb[sapply(factorTb, is.character)] <- lapply(factorTb[sapply(factorTb, is.character)], factor, exclude = NULL)

#any(is.na(levels(factorTb[,2])))
single_factor_ind <- c()
binary_factor_ind <- c()
multi_factor_ind <- c()

# Testing factor grouping
for (i in 1:length(factorTb)) {
  if (nlevels(factorTb[,i]) == 1 | 
      (nlevels(factorTb[,i]) == 2 & any(is.na(levels(factorTb[,i]))))
      ) {
    single_factor_ind <- c(single_factor_ind, i)
    
  } else if (nlevels(factorTb[,i]) == 3 & any(is.na(levels(factorTb[,i]))) |
             (nlevels(factorTb[,i]) == 2 & !any(is.na(levels(factorTb[,i]))))
          ) {
    binary_factor_ind <- c(binary_factor_ind, i)
    
  } else {
    multi_factor_ind <- c(multi_factor_ind, i)
  }
}

new_factorTb <- factorTb[,multi_factor_ind]
binary_factor <- factorTb[,binary_factor_ind]
single_factor <- factorTb[,single_factor_ind]

Calculate Wilcoxon Test for Binomial Factor Variables

#Remove factors without enough y-observations

remove_index <- c()

for (i in seq_len(ncol(binary_factor))) {
  x <- summary(binary_factor[,i])[1:2]
  
  if (all(x > 1)) {
    #print(x)
  } else {
    remove_index <- c(remove_index, i)
    #print(paste("index", i, "does not have enough observations"))
  }
}

binary_factor_2 <- binary_factor[,-remove_index]

wilcox_test_res <- as.data.frame(matrix(nrow = ncol(binary_factor_2),
                                ncol = ncol(sampleScore_sub)))

rownames(wilcox_test_res) <- colnames(binary_factor_2)
colnames(wilcox_test_res) <- colnames(sampleScore_sub)

wtest_coad_wvalue <- wilcox_test_res
wtest_coad_pvalue <- wilcox_test_res

for (i in seq_len(ncol(sampleScore_sub))) {
  for (j in seq_len(ncol(binary_factor_2))) {
    ## wilcoxon test
    wilcox_test <- wilcox.test(sampleScore_sub[, i] ~ binary_factor_2[,j], alternative="two.sided")
    
      ## W value
      wval <- wilcox_test$statistic
      wtest_coad_wvalue[j, i] <- wval
    
    ## p-value
      pval <- wilcox_test$p.value
      wtest_coad_pvalue[j, i] <- pval
    
  }
}

batch_char_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(wtest_coad_wvalue))
wtest_coad_wvalue_2 <- wtest_coad_wvalue[-batch_char_ind,]
wtest_coad_pvalue_2 <- wtest_coad_pvalue[-batch_char_ind,]

heatmap(as.matrix(wtest_coad_wvalue_2), main = 'COAD Wilcoxon Test Cancer Binomial')

#Only view w-values with significant p-value (<0.01)
wtest_coad_sig_wvalue <- wilcox_test_res

 for (i in seq_len(ncol(sampleScore_sub))) {
   for (j in seq_len(ncol(binary_factor_2))) {
     if (wilcox.test(sampleScore_sub[, i] ~ binary_factor_2[,j], alternative="two.sided")$p.val < 0.01) {
       wtest_coad_sig_wvalue[j, i] <- wilcox.test(sampleScore_sub[, i] ~ binary_factor_2[,j],
                                                  alternative="two.sided")$statistic
     }
   }
 }

na_ind <- apply(wtest_coad_sig_wvalue, 1, function(x) all(is.na(x)))
wtest_coad_sig_wvalue <- wtest_coad_sig_wvalue[!na_ind, ]
batch_char_ind_2 <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(wtest_coad_sig_wvalue))
wtest_coad_sig_wvalue <- wtest_coad_sig_wvalue[-batch_char_ind_2, ]

options(ztable.type='html')
ztable(wtest_coad_sig_wvalue) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='Cancer Sample W-test with p-values < 0.01')

Cancer Sample W-test with p-values < 0.01
	RAV188	RAV1575	RAV324	RAV832	RAV61	RAV1008	RAV868	RAV64	RAV220	RAV834	RAV833	RAV438	RAV1166	RAV192
patient.braf_gene_analysis_performed		1094.00	362.00	NA		1086.00	349.00	333.00	1121.00			1142.00	352.00	347.00
patient.braf_gene_analysis_result				NA						36.00
patient.follow_ups.follow_up.2.new_tumor_events.new_tumor_event_after_initial_treatment				NA			58.00		269.00				69.00	48.00
patient.kras_gene_analysis_performed	1603.00	1698.00	718.00	NA	645.00	1623.00	728.00	544.00	1707.00				524.00	591.00
patient.lymphatic_invasion				NA							2078.00
patient.microsatellite_instability				NA						7.00
patient.tissue_prospective_collection_indicator		1682.00	2896.00	NA		1622.00	2861.00				2860.00
patient.tissue_retrospective_collection_indicator		2861.00	1647.00	NA		2921.00	1682.00				1683.00
patient.biospecimen_cqcf.path_confirm_report_attached	832.00		218.00	NA		829.00	211.00	223.00	827.00		204.00	811.00	201.00

Calculate Kruskal-Wallis Test for Multinomial Factor Variables

kruskal_wallis_res <- as.data.frame(matrix(nrow = ncol(new_factorTb),
                                ncol = ncol(sampleScore_sub)))

rownames(kruskal_wallis_res) <- colnames(new_factorTb)
colnames(kruskal_wallis_res) <- colnames(sampleScore_sub)

kwtest_coad_wvalue <- kruskal_wallis_res
kwtest_coad_pvalue <- kruskal_wallis_res

for (i in seq_len(ncol(sampleScore_sub))) {
  for (j in seq_len(ncol(new_factorTb))) {
    ## Kruskal-Wallis Test
    kruskal_test <- kruskal.test(sampleScore_sub[, i] ~ new_factorTb[,j])
    
    ## Kruskal-Wallis Chi-squared value
    kw_val <-  kruskal_test$statistic
    kwtest_coad_wvalue[j, i] <- kw_val

    ## p-value
    pval <-  kruskal_test$p.value
    kwtest_coad_pvalue[j, i] <- pval
  }
}

batch_char_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(kwtest_coad_wvalue))
kwtest_coad_wvalue <- kwtest_coad_wvalue[-batch_char_ind,]
kwtest_coad_pvalue <- kwtest_coad_pvalue[-batch_char_ind,]

heatmap(as.matrix(kwtest_coad_wvalue), main = 'COAD Kruskal-Wallis Test Cancer Multinomial')

#Only view kw-values with significant p-value (<0.01)
kwtest_coad_sig_wvalue <- kruskal_wallis_res

 for (i in seq_len(ncol(sampleScore_sub))) {
   for (j in seq_len(ncol(new_factorTb))) {
     if (kruskal.test(sampleScore_sub[, i] ~ new_factorTb[,j])$p.value < 0.01) {
       kwtest_coad_sig_wvalue[j, i] <- kruskal.test(sampleScore_sub[, i] ~ new_factorTb[,j])$statistic
     }
   }
 }

batch_char_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(kwtest_coad_sig_wvalue))
kwtest_coad_sig_wvalue <- kwtest_coad_sig_wvalue[-batch_char_ind, ]
na_ind <- apply(kwtest_coad_sig_wvalue, 1, function(x) all(is.na(x)))
kwtest_coad_sig_wvalue <- kwtest_coad_sig_wvalue[!na_ind, ]

options(ztable.type='html')
ztable(kwtest_coad_sig_wvalue) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='Cancer Sample Kruskal-Wallis Test with p-values < 0.01')

Cancer Sample Kruskal-Wallis Test with p-values < 0.01
	RAV188	RAV1575	RAV324	RAV832	RAV1008	RAV868	RAV64	RAV220	RAV834	RAV833	RAV438	RAV190	RAV1166	RAV192
patient.clinical_cqcf.frozen_specimen_anatomic_site				NA					29.64			NA
patient.icd_10				NA	23.12			20.77				NA
patient.icd_o_3_site				NA	23.12			20.77				NA
patient.tissue_source_site	39.26	38.02	46.85	NA	42.07	52.66	43.61	47.00		40.72	44.97	NA	46.60	48.51
patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status				NA					46.83			NA
patient.biospecimen_cqcf.frozen_specimen_anatomic_site				NA					21.43			NA
patient.biospecimen_cqcf.site_of_disease_description				NA					21.43			NA

Function to Display Categorical Attributes’ Score Plots

graph_categorical <- function(RAV1, RAV2, factor.df, sampleScore.df, phenotype, graph_title = "Score Plot", legend_title = "Legend") {
  
  # Sanity Check 1: Check if RAVs are available; if RAVs don't exist in sampleScore.df, the function will error: "undefined columns selected"
  # Sanity Check 2: phenotype is in factor.df
  if(nlevels(factor.df[[phenotype]]) == 0) {
    print("Phenotype is not in provided factor dataframe")
  }
  
  sampleScore1 <- paste0("RAV", RAV1)
  sampleScore2 <- paste0("RAV", RAV2)

  new_df <- sampleScore.df[, c(sampleScore1, sampleScore2)]
  colnames(new_df)[1] <- "sampleScore1"
  colnames(new_df)[2] <- "sampleScore2"

  new_df <- data.frame(new_df, factor.df)
  plot_data <- new_df[which(!is.na(factor(new_df[[phenotype]]))),] #Filter out rows of the phenotype that are N/A
  #print(plot_data[[phenotype]])

  colors <- gg_color_hue(length(unique(plot_data[[phenotype]]))) # Count number of factor levels excluding NA
  colors.toplot <- c(colors)

  pA <- ggplot(plot_data,
             aes(x = sampleScore1, y = sampleScore2, color = plot_data[[phenotype]])) +
        geom_point() +
        labs(title = graph_title) +
        scale_color_manual(values = colors.toplot, name = legend_title) +
        theme_bw() +
        theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
        geom_hline(yintercept = 0, linetype = 'dashed') +
        geom_vline(xintercept = 0, linetype = 'dashed') +
        xlab(sampleScore1) + ylab(sampleScore2)

print(pA)
}

graph_categorical(834, 1166, new_factorTb, sampleScore_sub,
                  "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status", 
                  "Score Plot for Microsatellite Status", "Microsatellite Status")

graph_categorical(834, 188, binary_factor_2, sampleScore_sub, 'patient.braf_gene_analysis_result', 'BRAF Gene Analysis Result')

## Warning: Use of `plot_data[[phenotype]]` is discouraged.
## ℹ Use `.data[[phenotype]]` instead.

EDA_COAD_MSI

Britney Pheng

2024-09-25

Initial Setup

Load packages

Load RAVmodel

Select COAD RNA metadata

heatmapTable: COAD

Subset

Filter attributes

Sparsity Plot

Separate out Factor Variables by number of levels (1, 2, 3+) not including NA

Calculate Wilcoxon Test for Binomial Factor Variables

Calculate Kruskal-Wallis Test for Multinomial Factor Variables

Function to Display Categorical Attributes’ Score Plots