Initial Setup

Load packages

suppressPackageStartupMessages({
  # BiocManager
  library(GenomicSuperSignature)
  library(curatedTCGAData)
  library(MultiAssayExperiment)
  library(TCGAutils)
  library(ComplexHeatmap)
  
  # CRAN
  library(tidyverse) # includes dplyr, ggplot2, magrittr, tidyr
  library(magick)
  library(wordcloud)
  library(ztable)
  library(metafolio)
})

Load RAVmodel

RAVmodel <- getModel('C2', load=TRUE)

## [1] "downloading"

Select COAD RNA metadata

coad <- curatedTCGAData(diseaseCode = 'COAD',
                        assays = 'RNA*',
                        version = '2.0.1',
                        dry.run = FALSE)

coad_rna <- getWithColData(coad,
                            'COAD_RNASeq2Gene-20160128')

assay(coad_rna) <- log2(assay(coad_rna) + 1)
#head(colData(coad_rna)[,1:4])

# Clean MSI Data
table(coad$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

## 
## indeterminate         msi-h         msi-l           mss 
##             2            83            82           288

# Conditions and Replacement Values
conditions <- c("indeterminate", "msi-l")
replacements <- c(NA, "mss")

coad$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status <-
  replace(coad$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status,
          coad$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status %in% conditions, replacements)

table(coad$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

## 
## msi-h   mss 
##    83   330

## Parse out cancer vs normal samples
sampleTables(coad)

## $`COAD_RNASeq2Gene-20160128`
## 
##  01  02  06  11 
## 283   1   1  41 
## 
## $`COAD_RNASeq2GeneNorm_illuminaga-20160128`
## 
##  01 
## 191 
## 
## $`COAD_RNASeq2GeneNorm_illuminahiseq-20160128`
## 
##  01  02  06  11 
## 283   1   1  41 
## 
## $`COAD_RNASeqGene-20160128`
## 
## 01 
## 10

coad2 <- TCGAsplitAssays(coad, c("01", "11"))

coad_rna_cancer <- getWithColData(coad2,
                           '01_COAD_RNASeq2Gene-20160128')
assay(coad_rna_cancer) <- log2(assay(coad_rna_cancer) + 1)
coad_cancer <- colData(coad_rna_cancer)

coad_rna_normal <- getWithColData(coad2,
                           '11_COAD_RNASeq2Gene-20160128')
assay(coad_rna_normal) <- log2(assay(coad_rna_normal) + 1)
coad_normal <- colData(coad_rna_normal)

table(coad_cancer$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

## 
## msi-h   mss 
##    52   206

table(coad_normal$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

## 
## msi-h   mss 
##     9    28

normal_tb <- coad_normal %>% 
  as.data.frame() %>%
  select(patientID, patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

cancer_tb <- coad_cancer %>% 
  as.data.frame() %>%
  select(patientID, patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)

combined_tb <- inner_join(normal_tb, cancer_tb, by="patientID")

heatmapTable: COAD

#validate_coad_rna <- validate(coad_rna, RAVmodel)
validate_coad_normal <- validate(coad_rna_normal, RAVmodel)
validate_coad_cancer <- validate(coad_rna_cancer, RAVmodel)

#heatmapTable(validate_coad_rna, RAVmodel, num.out = 10)
heatmapTable(validate_coad_normal, RAVmodel, num.out = 10, column_title="COAD Normal Samples")

## RAV1187 can be filtered based on GSEA_C2

heatmapTable(validate_coad_cancer, RAVmodel, num.out=10, column_title="COAD Cancer Samples")

## RAV61 can be filtered based on GSEA_C2

# Subset ## Filter attributes

# Cancer
sparsity_summary <- table(colSums(is.na(coad_cancer)))
sparsity_summary

## 
##   0   1   2   3   4   5   6   7   8   9  10  11  13  14  15  17  18  19  20  23 
##  79  75  20  73   2  17   2   2   1  24   3   1  25   2   1   1   1   3   1   4 
##  25  30  32  33  35  36  37  38  40  41  43  44  45  48  50  54  56  57  58  59 
##   2   1   1   2  16   2   2   2  13   1   1   9   1   1   1   1  13   2   1   3 
##  63  64  65  66  67  68  69  72  80  81  95  96  97  98 100 105 106 112 139 148 
##   8   3   5   2   3   5   2   2   3   1   5   2   1   1   1   1   1   1   1   1 
## 149 154 176 177 178 179 183 184 188 189 193 198 199 204 205 206 208 209 210 211 
##   1   2   5   8   2   1   1   2   1   1   3   1   1   2   9   2   1   2   5   9 
## 212 213 214 215 217 218 220 223 224 225 226 227 228 230 231 232 233 234 235 236 
##   2   1   2   1   3   1   1   5   1   4   2   1   1   2  15   1   3   1  13   4 
## 237 238 239 240 241 242 244 246 247 248 249 250 251 252 253 254 255 256 257 258 
##   1   1   5  14   6   3   2   7   3   2   3   1   1  13   2  55   5  61  18  30 
## 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 
##  53   4  11   6   3  10  10  16   1  13   2  41   5  43  34  44  17   8  54 103 
## 279 280 281 282 283 
## 280 287 115 287 351

# Normal
sparsity_summary_2 <- table(colSums(is.na(coad_normal)))
sparsity_summary_2

## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
## 228 153  11   5   4  24   5   4  11   4   1   3   1   2   2   1   3 109  13   4 
##  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39 
##  24  62  68  48  47  16  17   1  13  39  13  13  14  24  15  42  45  36 242 188 
##  40  41 
## 583 477

Sparsity Plot

# Cancer
plot(stack(sparsity_summary)$ind,
     stack(sparsity_summary)$values)

# Normal
plot(stack(sparsity_summary_2)$ind,
     stack(sparsity_summary_2)$values)

# Select columns with >10% completeness
# Cancer
keep_attribute_ind <- which(colSums(!is.na(coad_cancer)) > round(nrow(coad_cancer)/10))
meta_sub1 <- coad_cancer[keep_attribute_ind]
meta_sub1 <- subset(coad_cancer, select= -patientID)

# Normal
keep_attribute_ind_2 <- which(colSums(!is.na(coad_normal)) > round(nrow(coad_normal)/10))
meta_sub1_2 <- coad_normal[keep_attribute_ind]
meta_sub1_2 <- subset(coad_normal, select= -patientID)

# Randomly select for 140 rows
# Cancer
set.seed(1)
random_sample_ind <- sample(1:nrow(meta_sub1), 140)
meta_sub2 <- meta_sub1[random_sample_ind,]

# Normal
set.seed(2)
random_sample_ind_2 <- sample(1:nrow(meta_sub1_2), 20)
meta_sub2_2 <- meta_sub1_2[random_sample_ind_2,]

# Check for data types in listData
# Cancer
unique(sapply(coad_cancer@listData, type))

## [1] "character" "integer"   "double"

charcTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
numTb <- meta_sub2[, sapply(meta_sub2, class) %in% c('numeric', 'integer')]

# Sort numeric variables that have <=5 unique values as character variables
addToFactors <- c()
for (i in 1:length(numTb)) {
  if (length(table(numTb[i])) <= 3) {
    addToFactors <- c(addToFactors, i)
  }
}

charcTb <- c(charcTb, numTb[addToFactors])
numTb <- numTb[-addToFactors]

# Normal
unique(sapply(coad_normal@listData, type))

## [1] "character" "integer"   "double"

charcTb_2 <- meta_sub2_2[, sapply(meta_sub2_2, class) == 'character']
numTb_2 <- meta_sub2_2[, sapply(meta_sub2_2, class) %in% c('numeric', 'integer')]

# Sort numeric variables that have <=5 unique values as character variables
addToFactors_2 <- c()
for (i in 1:length(numTb_2)) {
  if (length(table(numTb_2[i])) <= 3) {
    addToFactors_2 <- c(addToFactors_2, i)
  }
}

charcTb_2 <- c(charcTb_2, numTb_2[addToFactors_2])
numTb_2 <- numTb_2[-addToFactors_2]

# Calculate validation scores
# Cancer
sampleScore <- calculateScore(coad_rna_cancer, RAVmodel)

# Normal
sampleScore_2 <- calculateScore(coad_rna_normal, RAVmodel)

# Cancer
validated_ind <- validatedSignatures(validate_coad_cancer, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE)

## RAV61 can be filtered based on GSEA_C2

## RAV438 can be filtered based on GSEA_C2

## Subset sampleScore to join with MCPcounter
sampleScore_sub <- sampleScore[random_sample_ind, validated_ind] %>% as.data.frame()

# Normal
validated_ind_2 <- validatedSignatures(validate_coad_normal, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE)

## Subset sampleScore to join with MCPcounter
sampleScore_sub_2 <- sampleScore[random_sample_ind_2, validated_ind_2] %>% as.data.frame()

Separate out Factor Variables by number of levels (1, 2, 3+) not including NA

# Convert to factor data type
# Cancer
factorTb <- meta_sub2[, sapply(meta_sub1, class) == 'character']

factorTb[sapply(factorTb, is.character)] <- lapply(factorTb[sapply(factorTb, is.character)], factor, exclude = NULL)

#any(is.na(levels(factorTb[,2])))
#!any(is.na(levels(factorTb[,6])))

single_fac_ind <- c()
binary_fac_ind <- c()
multi_fac_ind <- c()

# Testing factor grouping
for (i in 1:length(factorTb)) {
  if (nlevels(factorTb[,i]) == 1 | 
      (nlevels(factorTb[,i]) == 2 & 
       any(is.na(levels(factorTb[,i]))))
      ) {
    single_fac_ind <- c(single_fac_ind, i)
  } else if (nlevels(factorTb[,i]) == 3 & 
             any(is.na(levels(factorTb[,i]))) |
             
             (nlevels(factorTb[,i]) == 2 & 
              !any(is.na(levels(factorTb[,i]))))
          ) {
    binary_fac_ind <- c(binary_fac_ind, i)
  } else {
    multi_fac_ind <- c(multi_fac_ind, i)
  }
}

new_factorTb <- factorTb[,multi_fac_ind]
binary_factor <- factorTb[,binary_fac_ind]
single_factor <- factorTb[,single_fac_ind]


# Normal
factorTb_2 <- meta_sub2_2[, sapply(meta_sub1_2, class) == 'character']

factorTb_2[sapply(factorTb_2, is.character)] <- lapply(factorTb_2[sapply(factorTb_2, is.character)], factor, exclude = NULL)

single_fac_ind_2 <- c()
binary_fac_ind_2 <- c()
multi_fac_ind_2 <- c()

# Testing factor grouping
for (i in 1:length(factorTb_2)) {
  if (nlevels(factorTb_2[,i]) == 1 | 
      (nlevels(factorTb_2[,i]) == 2 & 
       any(is.na(levels(factorTb_2[,i]))))
      ) {
    single_fac_ind_2 <- c(single_fac_ind_2, i)
  } else if (nlevels(factorTb_2[,i]) == 3 & 
             any(is.na(levels(factorTb_2[,i]))) |
             
             (nlevels(factorTb_2[,i]) == 2 & 
              !any(is.na(levels(factorTb_2[,i]))))
          ) {
    binary_fac_ind_2 <- c(binary_fac_ind_2, i)
  } else {
    multi_fac_ind_2 <- c(multi_fac_ind_2, i)
  }
}

new_factorTb_norm <- factorTb_2[,multi_fac_ind_2]
binary_factor_norm <- factorTb_2[,binary_fac_ind_2]
single_factor_norm <- factorTb_2[,single_fac_ind_2]

Calculate Wilcoxon Test for binomial Character Variables

#Remove factors without enough y-observations

# Cancer
remove_index <- c()

for (i in seq_len(ncol(binary_factor))) {
  x <- summary(binary_factor[,i])[1:2]
  
  if (all(x > 1)) {
    #print(x)
  } else {
    remove_index <- c(remove_index, i)
    #print(paste("index", i, "does not have enough observations"))
  }
}

binary_factor_2 <- binary_factor[,-remove_index]

wilcox_test_res <- as.data.frame(matrix(nrow = ncol(binary_factor_2),
                                ncol = ncol(sampleScore_sub)))

rownames(wilcox_test_res) <- colnames(binary_factor_2)
colnames(wilcox_test_res) <- colnames(sampleScore_sub)

wtest_coad_wvalue <- wilcox_test_res
wtest_coad_pvalue <- wilcox_test_res

for (i in seq_len(ncol(sampleScore_sub))) {
  for (j in seq_len(ncol(binary_factor_2))) {
    ## wilcoxon test
    wilcox_test <- wilcox.test(sampleScore_sub[, i] ~ binary_factor_2[,j], alternative="two.sided")
    
      ## W value
      wval <- wilcox_test$statistic
      wtest_coad_wvalue[j, i] <- wval
    
    ## p-value
      pval <- wilcox_test$p.value
      wtest_coad_pvalue[j, i] <- pval
    
  }
}

batch_char_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(wtest_coad_wvalue))
wtest_coad_wvalue_2 <- wtest_coad_wvalue[-batch_char_ind,]
wtest_coad_pvalue_2 <- wtest_coad_pvalue[-batch_char_ind,]

# Normal
remove_index_2 <- c()

for (i in seq_len(ncol(binary_factor_norm))) {
  x <- summary(binary_factor_norm[,i])[1:2]
  
  if (all(x > 1)) {
    print(x)
  } else {
    remove_index_2 <- c(remove_index_2, i)
    print(x)
    print(paste("index", i, "does not have enough observations"))
  }
}

## female   male 
##     11      9 
##  no yes 
##  17   1 
## [1] "index 2 does not have enough observations"
##          colon adenocarcinoma colon mucinous adenocarcinoma 
##                            19                             1 
## [1] "index 3 does not have enough observations"
## r0 r2 
## 15  1 
## [1] "index 4 does not have enough observations"
## black or african american                     white 
##                         1                         8 
## [1] "index 5 does not have enough observations"
## consented  deceased 
##        18         2 
##       germany united states 
##            11             3 
##          colon adenocarcinoma colon mucinous adenocarcinoma 
##                            19                             1 
## [1] "index 8 does not have enough observations"
##                               no yes, history of prior malignancy 
##                               16                                4 
##  no yes 
##   6   1 
## [1] "index 10 does not have enough observations"
##               chemotherapy targeted molecular therapy 
##                          3                          1 
## [1] "index 11 does not have enough observations"
##  no yes 
##   3   1 
## [1] "index 12 does not have enough observations"
##   chemotherapy other, specify 
##              3              1 
## [1] "index 13 does not have enough observations"
##  leucovorin oxaliplatin 
##           2           2 
##   chemotherapy other, specify 
##              2              2 
##   chemotherapy other, specify 
##              2              1 
## [1] "index 16 does not have enough observations"
## intravenous (iv)             oral 
##                2                1 
## [1] "index 17 does not have enough observations"
##  no yes 
##   2   1 
## [1] "index 18 does not have enough observations"
## tcga-a6-2671-d6056 tcga-a6-2682-d6912 
##                  1                  1 
## [1] "index 19 does not have enough observations"
## 4c5e40d8-6a8a-4638-8b23-2461b1dcdd79 ce77ae94-e1dc-41e4-989a-8891fd070a58 
##                                    1                                    1 
## [1] "index 20 does not have enough observations"
##    5 fu avastin 
##       1       1 
## [1] "index 21 does not have enough observations"
## 2500-5136   300-325 
##         1         1 
## [1] "index 22 does not have enough observations"
##    adjuvant progression 
##           1           1 
## [1] "index 23 does not have enough observations"
##  no yes 
##   1   1 
## [1] "index 24 does not have enough observations"
##               chemotherapy targeted molecular therapy 
##                          1                          1 
## [1] "index 25 does not have enough observations"
## tcga-a6-2671-d6049 tcga-a6-2682-d6908 
##                  1                  1 
## [1] "index 26 does not have enough observations"
## 6d41c707-6c72-4ead-9e68-6677b9ae902d 891f0411-bdcc-468d-bc50-acc539d6bc11 
##                                    1                                    1 
## [1] "index 27 does not have enough observations"
##       5 fu leucovorin 
##          1          1 
## [1] "index 28 does not have enough observations"
## 2000-5000   700-735 
##         1         1 
## [1] "index 29 does not have enough observations"
##   adjuvant palliative 
##          1          1 
## [1] "index 30 does not have enough observations"
##   chemotherapy other, specify 
##              1              1 
## [1] "index 31 does not have enough observations"
##    mg mg/kg 
##     3     1 
## [1] "index 32 does not have enough observations"
## 1 2 
## 3 1 
## [1] "index 33 does not have enough observations"
##    mg mg/kg 
##     3     1 
## [1] "index 34 does not have enough observations"
##     additional new tumor event scheduled follow-up submission 
##                              3                              1 
## [1] "index 35 does not have enough observations"
## complete remission/response         progressive disease 
##                           1                           1 
## [1] "index 36 does not have enough observations"
##  no yes 
##   2   1 
## [1] "index 37 does not have enough observations"
##  no yes 
##   1   2 
## [1] "index 38 does not have enough observations"
##  no yes 
##   3   3 
## tumor free with tumor 
##          3          2 
##     additional new tumor event scheduled follow-up submission 
##                              1                              1 
## [1] "index 41 does not have enough observations"
##  no yes 
##   1   2 
## [1] "index 42 does not have enough observations"
## tumor free with tumor 
##          1          2 
## [1] "index 43 does not have enough observations"
## complete remission/response  partial remission/response 
##                           5                           1 
## [1] "index 44 does not have enough observations"
##  no yes 
##   1   1 
## [1] "index 45 does not have enough observations"
##  no yes 
##  10   3 
## tumor free with tumor 
##         11          3 
##  no yes 
##  13   5 
## complete remission/response  partial remission/response 
##                           5                           1 
## [1] "index 49 does not have enough observations"
##  no yes 
##  17   1 
## [1] "index 50 does not have enough observations"
## alive  dead 
##    13     5 
## female   male 
##     11      9 
##          colon adenocarcinoma colon mucinous adenocarcinoma 
##                            19                             1 
## [1] "index 53 does not have enough observations"
##  no yes 
##   9   8 
##  no yes 
##  19   1 
## [1] "index 55 does not have enough observations"
## 8140/3 8480/3 
##     19      1 
## [1] "index 56 does not have enough observations"
##  no yes 
##  16   2 
##  no yes 
##  11   2 
##  no yes 
##  11   5 
##  no yes 
##   4   3 
## tumor free with tumor 
##          5         13 
##  no yes 
##  15   5 
## black or african american                     white 
##                         1                         8 
## [1] "index 63 does not have enough observations"
## r0 r2 
## 15  1 
## [1] "index 64 does not have enough observations"
## 5th 6th 
##  11   6 
##  no yes 
##  17   1 
## [1] "index 66 does not have enough observations"
##  no yes 
##  13   7 
##  no yes 
##   7  13 
##  no yes 
##  13   3 
## alive  dead 
##    17     3 
## msi-h   mss 
##     5    14 
## case matched expression control                 ffpe validation 
##                              18                               2 
## consented  deceased 
##        18         2 
##       germany united states 
##            11             3 
##          colon adenocarcinoma colon mucinous adenocarcinoma 
##                            19                             1 
## [1] "index 75 does not have enough observations"
##  no yes 
##  11   9 
##               cryovial other (please specify) 
##                     18                      2 
## be127778-e160-4ae3-9e5a-13a16eae2e7a c671806f-013e-4d99-9841-cda5bd43eff1 
##                                    1                                    1 
## [1] "index 78 does not have enough observations"
##  no yes 
##  18   2 
## tcga-a6-2684.9dabf713-dd0c-4388-b69e-f1bbb08e4d21.pdf 
##                                                     1 
## tcga-a6-5659.16172149-a834-4f41-a786-5d87a69fc65c.pdf 
##                                                     1 
## [1] "index 80 does not have enough observations"
## 16172149-a834-4f41-a786-5d87a69fc65c 9dabf713-dd0c-4388-b69e-f1bbb08e4d21 
##                                    1                                    1 
## [1] "index 81 does not have enough observations"
##  no yes 
##   5   2 
##  no yes 
##  18   2 
## repli-g (qiagen) dna                  rna 
##                    9                   11 
##  r  w 
## 11  9 
##  no yes 
##  18   2 
## picogreen   uv spec 
##         8         1 
## [1] "index 87 does not have enough observations"
##  no yes 
##  18   2 
##  no yes 
##  18   2 
##  no yes 
##  18   2 
##  no yes 
##  18   2 
##  no yes 
##  10  10 
##  no yes 
##  18   2 
## picogreen   uv spec 
##         2        18 
##  no yes 
##  18   2 
## tcga-a6-2684-01c-08-bs8 tcga-a6-5659-01b-04-bs4 
##                       1                       1 
## [1] "index 96 does not have enough observations"
## 3557f6b1-2db8-48fa-87a0-86b293ba9ffc 981d2609-a2d8-4573-b704-5711d132abcd 
##                                    1                                    1 
## [1] "index 97 does not have enough observations"
## tcga-a6-2684-01c-08-bs8.981d2609-a2d8-4573-b704-5711d132abcd.svs 
##                                                                1 
## tcga-a6-5659-01b-04-bs4.3557f6b1-2db8-48fa-87a0-86b293ba9ffc.svs 
##                                                                1 
## [1] "index 98 does not have enough observations"
##  no yes 
##   9   2 
## 1758 a32z 
##    1    7 
## [1] "index 100 does not have enough observations"
## repli-g (qiagen) dna                  rna 
##                    2                    9 
## r w 
## 9 2 
## allprep rna extraction                repli-g 
##                      9                      2 
## tcga-a6-2684-10a-01d-2188-10 tcga-a6-5659-10a-01d-a46w-08 
##                            1                            1 
## [1] "index 104 does not have enough observations"
## 0575ab82-6307-4b28-b46f-394309950246 205dbcc1-d726-49bd-8987-ffdfe0bd82ee 
##                                    1                                    1 
## [1] "index 105 does not have enough observations"
## 2188 a46w 
##    1    1 
## [1] "index 106 does not have enough observations"
## e h 
## 1 1 
## [1] "index 107 does not have enough observations"
##  no yes 
##  10   1 
## [1] "index 108 does not have enough observations"
##         adna preparation type chemical lysis dna extraction 
##                             9                             2 
## blood derived normal  solid tissue normal 
##                    2                    9 
## tcga-a6-2684-11a tcga-a6-5659-11a 
##                1                1 
## [1] "index 111 does not have enough observations"
## 54aad2df-8ee1-474f-8d94-858cfca51970 bf2b2ca1-17b5-4a4d-bae8-46f571129706 
##                                    1                                    1 
## [1] "index 112 does not have enough observations"
## tcga-a6-2684-11a-01r-1757-13 tcga-a6-5659-11a-01r-1653-07 
##                            1                            1 
## [1] "index 113 does not have enough observations"
## 2897d2b0-0d34-4679-9b36-fc1385bfb5ee bdc32766-c722-43b9-946b-5204fd535f87 
##                                    1                                    1 
## [1] "index 114 does not have enough observations"
## c g 
## 1 1 
## [1] "index 115 does not have enough observations"
## tcga-a6-2684-11a-01r tcga-a6-5659-11a-01r 
##                    1                    1 
## [1] "index 116 does not have enough observations"
## 0d77fa62-02b5-46cb-974a-2d1d078e9f53 9ffa633a-c640-4cae-8e6e-2e4f9e57a85d 
##                                    1                                    1 
## [1] "index 117 does not have enough observations"
## tcga-a6-2684-11a-01d-1549-01 tcga-a6-5659-11a-01d-1649-01 
##                            1                            1 
## [1] "index 118 does not have enough observations"
## 2a11206d-f931-496c-aedb-0a522955d77f ee3881ea-2adb-424e-9a7b-1dd05dd93a34 
##                                    1                                    1 
## [1] "index 119 does not have enough observations"
## a c 
## 1 1 
## [1] "index 120 does not have enough observations"
## tcga-a6-2684-11a-01d-1551-05 tcga-a6-5659-11a-01d-1650-10 
##                            1                            1 
## [1] "index 121 does not have enough observations"
## 014243b3-34cd-4881-a197-6a79248cd89d b5eb58df-e753-4dda-b7d1-335de55750b9 
##                                    1                                    1 
## [1] "index 122 does not have enough observations"
## a c 
## 1 1 
## [1] "index 123 does not have enough observations"
## tcga-a6-2684-11a-01d-1554-10 tcga-a6-5659-11a-01d-1651-05 
##                            1                            1 
## [1] "index 124 does not have enough observations"
## 81311020-2894-49a1-90d0-6699b0141841 90152d16-ac20-46c2-81f7-c5ced81bba83 
##                                    1                                    1 
## [1] "index 125 does not have enough observations"
## c d 
## 1 1 
## [1] "index 126 does not have enough observations"
## tcga-a6-2684-11a-01d-1555-09 tcga-a6-5659-11a-01d-1862-23 
##                            1                            1 
## [1] "index 127 does not have enough observations"
## 83c9efe2-9bf9-432b-a8b0-45d7209e99a0 ff1bcb1b-7746-47b0-8665-c987d4df5995 
##                                    1                                    1 
## [1] "index 128 does not have enough observations"
## d e 
## 1 1 
## [1] "index 129 does not have enough observations"
## tcga-a6-2684-11a-01d-1547-02 tcga-a6-5659-11a-01d-1648-02 
##                            1                            1 
## [1] "index 130 does not have enough observations"
## 2e6beb98-208e-4cdf-ab01-73b9db5f9743 dd8601de-3687-4438-ae59-8acd37d07e32 
##                                    1                                    1 
## [1] "index 131 does not have enough observations"
## a c 
## 1 1 
## [1] "index 132 does not have enough observations"
## tcga-a6-2684-11a-01d tcga-a6-5659-11a-01d 
##                    1                    1 
## [1] "index 133 does not have enough observations"
## 1474e2bf-5cc7-47f3-93db-6ccb704f482d 83c3b52f-985c-459b-b1d2-165c38b153ca 
##                                    1                                    1 
## [1] "index 134 does not have enough observations"
## tcga-a6-2684-11a-01 tcga-a6-5659-11a-01 
##                   1                   1 
## [1] "index 135 does not have enough observations"
## 81181d2e-7cfa-44c6-b3f3-d08993bc1bd3 d91940bc-f960-467a-9a72-a4b0783e428f 
##                                    1                                    1 
## [1] "index 136 does not have enough observations"
## tcga-a6-2684-11a-01-ts1 tcga-a6-5659-11a-01-ts1 
##                       1                       1 
## [1] "index 137 does not have enough observations"
## 373c2bcf-43e7-4862-ad25-27319410fa23 d1a844c4-de66-4210-9201-8b74b14fea32 
##                                    1                                    1 
## [1] "index 138 does not have enough observations"
## tcga-a6-2684-11a-01-ts1.373c2bcf-43e7-4862-ad25-27319410fa23.svs 
##                                                                1 
## tcga-a6-5659-11a-01-ts1.d1a844c4-de66-4210-9201-8b74b14fea32.svs 
##                                                                1 
## [1] "index 139 does not have enough observations"
## tcga-a6-2684-01a-01r-a278-07 tcga-a6-5659-01a-01r-a278-07 
##                            1                            1 
## [1] "index 140 does not have enough observations"
## d517fd04-2868-4d22-a0d6-cfe81e885601 fa89193c-5cdd-4092-931e-867270bef743 
##                                    1                                    1 
## [1] "index 141 does not have enough observations"
## c g 
## 1 1 
## [1] "index 142 does not have enough observations"
## tcga-a6-2684-01a-01r-a46x-36 tcga-a6-5659-01a-01r-a46x-36 
##                            1                            1 
## [1] "index 143 does not have enough observations"
## 22fdac9c-0f50-426e-b89f-c5e7fdf7ab17 a40deaff-59e3-445e-b5ea-87608b5b449d 
##                                    1                                    1 
## [1] "index 144 does not have enough observations"
## c g 
## 1 1 
## [1] "index 145 does not have enough observations"
##  h  r 
##  6 14 
##    allprep rna extraction mirvana (allprep dna) rna 
##                        14                         6 
##       rna total rna 
##         6        14 
##  r  t 
##  6 14 
## allprep rna extraction              total rna 
##                      6                     14 
## 0896 0897 
##    3    2 
## picogreen   uv spec 
##         8        12 
## tcga-a6-2684-01a-01d-a274-01 tcga-aa-3518-01a-02d-1618-23 
##                            1                            1 
## [1] "index 153 does not have enough observations"
## 114e87d6-5b9c-41d7-924b-ea8a85b78798 43a50f21-31ce-462d-94ee-a2b069fcdb00 
##                                    1                                    1 
## [1] "index 154 does not have enough observations"
## 1618 a274 
##    1    1 
## [1] "index 155 does not have enough observations"
## b f 
## 1 1 
## [1] "index 156 does not have enough observations"
## tcga-a6-2684-01a-01d-a27a-05 tcga-aa-3518-01a-02d-1953-10 
##                            1                            1 
## [1] "index 157 does not have enough observations"
## 5edb7fd3-9802-4c1e-9956-b5fcfb900f28 8c9ff2b3-a02f-4ec1-b8c8-863e17447a23 
##                                    1                                    1 
## [1] "index 158 does not have enough observations"
## 1953 a27a 
##    1    1 
## [1] "index 159 does not have enough observations"
##  no yes 
##   8  12 
## bottom    top 
##      7     13 
## bottom    top 
##     13      7 
##   CIMP.H Cluster3 
##        2        3 
## Invasive MSI/CIMP 
##        2        3 
## M0 M1 
##  7  2 
## FEMALE   MALE 
##      7      2 
##          Colon Adenocarcinoma Colon Mucinous Adenocarcinoma 
##                             8                             1 
## [1] "index 167 does not have enough observations"
##  NO YES 
##   3   5 
## 8140/3 8480/3 
##      8      1 
## [1] "index 169 does not have enough observations"
##  NO YES 
##   6   3 
## TUMOR FREE WITH TUMOR 
##          3          6 
##  NO YES 
##   6   3 
##  NO YES 
##   7   2 
## DECEASED   LIVING 
##        1        8 
## [1] "index 174 does not have enough observations"

binary_factor_norm_2 <- binary_factor_norm[,-remove_index_2]

wilcox_test_res_2 <- as.data.frame(matrix(nrow = ncol(binary_factor_norm_2),
                                ncol = ncol(sampleScore_sub_2)))

rownames(wilcox_test_res_2) <- colnames(binary_factor_norm_2)
colnames(wilcox_test_res_2) <- colnames(sampleScore_sub_2)

wtest_coad_wvalue_norm <- wilcox_test_res_2
wtest_coad_pvalue_norm <- wilcox_test_res_2

for (i in seq_len(ncol(sampleScore_sub_2))) {
  for (j in seq_len(ncol(binary_factor_norm_2))) {
    ## wilcoxon test
    wilcox_test <- wilcox.test(sampleScore_sub_2[, i] ~ binary_factor_norm_2[,j], alternative="two.sided")
    
      ## W value
      wval <- wilcox_test$statistic
      wtest_coad_wvalue_norm[j, i] <- wval
    
    ## p-value
      pval <- wilcox_test$p.value
      wtest_coad_pvalue_norm[j, i] <- pval
    
  }
}

batch_char_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(wtest_coad_wvalue_norm))
wtest_coad_wvalue_norm_2 <- wtest_coad_wvalue_norm[-batch_char_ind,]
wtest_coad_pvalue_norm_2 <- wtest_coad_pvalue_norm[-batch_char_ind,]

heatmap(as.matrix(wtest_coad_wvalue_2), main = 'COAD Wilcoxon Test Cancer Binomial')

heatmap(as.matrix(wtest_coad_wvalue_norm_2), main = 'COAD Wilcoxon Test Normal Binomial')

#Only view w-values with significant p-value (<0.01)
# Cancer
wtest_coad_sig_wvalue <- wilcox_test_res

 for (i in seq_len(ncol(sampleScore_sub))) {
   for (j in seq_len(ncol(binary_factor_2))) {
     if (wilcox.test(sampleScore_sub[, i] ~ binary_factor_2[,j], alternative="two.sided")$p.val < 0.01) {
       wtest_coad_sig_wvalue[j, i] <- wilcox.test(sampleScore_sub[, i] ~ binary_factor_2[,j],
                                                  alternative="two.sided")$statistic
     }
   }
 }

na_ind <- apply(wtest_coad_sig_wvalue, 1, function(x) all(is.na(x)))
wtest_coad_sig_wvalue <- wtest_coad_sig_wvalue[!na_ind, ]
batch_char_ind_2 <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(wtest_coad_sig_wvalue))
wtest_coad_sig_wvalue <- wtest_coad_sig_wvalue[-batch_char_ind_2, ]

# Normal
wtest_coad_sig_wvalue_2 <- wilcox_test_res_2

 for (i in seq_len(ncol(sampleScore_sub_2))) {
   for (j in seq_len(ncol(binary_factor_norm_2))) {
     if (wilcox.test(sampleScore_sub_2[, i] ~ binary_factor_norm_2[,j], alternative="two.sided")$p.val < 0.1) {
       wtest_coad_sig_wvalue_2[j, i] <- wilcox.test(sampleScore_sub_2[, i] ~ binary_factor_norm_2[,j],
                                                  alternative="two.sided")$statistic
     }
   }
 }

na_ind <- apply(wtest_coad_sig_wvalue_2, 1, function(x) all(is.na(x)))
wtest_coad_sig_wvalue_2 <- wtest_coad_sig_wvalue_2[!na_ind, ]
batch_char_ind_2 <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(wtest_coad_sig_wvalue_2))
wtest_coad_sig_wvalue_2 <- wtest_coad_sig_wvalue_2[-batch_char_ind_2, ]

# Cancer
options(ztable.type='html')
ztable(wtest_coad_sig_wvalue) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='Cancer Sample W-test with p-values < 0.01')

Cancer Sample W-test with p-values < 0.01
	RAV188	RAV1575	RAV324	RAV832	RAV61	RAV1008	RAV868	RAV64	RAV220	RAV834	RAV833	RAV438	RAV1166	RAV192
patient.braf_gene_analysis_performed		1094.00	362.00	NA		1086.00	349.00	333.00	1121.00			1142.00	352.00	347.00
patient.braf_gene_analysis_result				NA						36.00
patient.follow_ups.follow_up.2.new_tumor_events.new_tumor_event_after_initial_treatment				NA			58.00		269.00				69.00	48.00
patient.kras_gene_analysis_performed	1603.00	1698.00	718.00	NA	645.00	1623.00	728.00	544.00	1707.00				524.00	591.00
patient.lymphatic_invasion				NA							2078.00
patient.microsatellite_instability				NA						7.00
patient.tissue_prospective_collection_indicator		1682.00	2896.00	NA		1622.00	2861.00				2860.00
patient.tissue_retrospective_collection_indicator		2861.00	1647.00	NA		2921.00	1682.00				1683.00
patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status				NA						2354.00			820.00
patient.biospecimen_cqcf.path_confirm_report_attached	832.00		218.00	NA		829.00	211.00	223.00	827.00		204.00	811.00	201.00

#Normal
# NONE

## Function to Display Categorical Attributes' Score Plots
 
# Cancer
graph_categorical <- function(a, b, c, d = 'Legend') {
  sampleScore1 <- paste0('RAV', a)
  sampleScore2 <- paste0('RAV', b)

  plot_data <- sampleScore_sub[, c(sampleScore1, sampleScore2)]
  colnames(plot_data)[1] <- "sampleScore1"
  colnames(plot_data)[2] <- "sampleScore2"

  plot_data <- data.frame(plot_data, binary_factor_2)
  plot_data <- plot_data[which(!is.na(factor(plot_data[[c]]))),]
  # print(plot_data)

  colors <- gg_color_hue(length(unique(plot_data[[c]])))
  colors.toplot <- c(colors)

  pA <- ggplot(plot_data,
             aes(x = sampleScore1, y = sampleScore2, color = plot_data[[c]]
                 )) +
        geom_point() +
        #ggtitle('Score Plot for Gender') +
        scale_color_manual(values = colors.toplot, name = d) +
        theme_bw() +
        theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
        geom_hline(yintercept = 0, linetype = 'dashed') +
        geom_vline(xintercept = 0, linetype = 'dashed') +
        xlab(sampleScore1) + ylab(sampleScore2)

print(pA)
}

graph_categorical(834, 1166, 'patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status', 'Microsatellite Status')

graph_categorical(834, 188, 'patient.braf_gene_analysis_result', 'BRAF Gene Analysis Result')

## Warning: Use of `plot_data[[c]]` is discouraged.
## ℹ Use `.data[[c]]` instead.

# Normal
graph_categorical_2 <- function(a, b, c, d = 'Legend') {
  sampleScore1 <- paste0('RAV', a)
  sampleScore2 <- paste0('RAV', b)

  plot_data <- sampleScore_sub_2[, c(sampleScore1, sampleScore2)]
  colnames(plot_data)[1] <- "sampleScore1"
  colnames(plot_data)[2] <- "sampleScore2"

  plot_data <- data.frame(plot_data, binary_factor_norm_2)
  plot_data <- plot_data[which(!is.na(factor(plot_data[[c]]))),]
  # print(plot_data)

  colors <- gg_color_hue(length(unique(plot_data[[c]])))
  colors.toplot <- c(colors)

  pA <- ggplot(plot_data,
             aes(x = sampleScore1, y = sampleScore2, color = plot_data[[c]]
                 )) +
        geom_point() +
        #ggtitle('Score Plot for Gender') +
        scale_color_manual(values = colors.toplot, name = d) +
        theme_bw() +
        theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
        geom_hline(yintercept = 0, linetype = 'dashed') +
        geom_vline(xintercept = 0, linetype = 'dashed') +
        xlab(sampleScore1) + ylab(sampleScore2)

print(pA)
}

graph_categorical_2(833, 324, 'patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status', 'Microsatellite Status')

## Warning: Use of `plot_data[[c]]` is discouraged.
## ℹ Use `.data[[c]]` instead.

EDA_COAD_MSI

Britney Pheng

2024-09-20

Initial Setup

Load packages

Load RAVmodel

Select COAD RNA metadata

heatmapTable: COAD

Sparsity Plot

Separate out Factor Variables by number of levels (1, 2, 3+) not including NA

Calculate Wilcoxon Test for binomial Character Variables