Initial Setup
Load packages
suppressPackageStartupMessages({
# BiocManager
library(GenomicSuperSignature)
library(curatedTCGAData)
library(MultiAssayExperiment)
library(TCGAutils)
library(ComplexHeatmap)
# CRAN
library(tidyverse) # includes dplyr, ggplot2, magrittr, tidyr
library(magick)
library(wordcloud)
library(ztable)
library(metafolio)
})
Load RAVmodel
RAVmodel <- getModel('C2', load=TRUE)
## [1] "downloading"
Select COAD RNA metadata
coad <- curatedTCGAData(diseaseCode = 'COAD',
assays = 'RNA*',
version = '2.0.1',
dry.run = FALSE)
coad_rna <- getWithColData(coad,
'COAD_RNASeq2Gene-20160128')
assay(coad_rna) <- log2(assay(coad_rna) + 1)
table(coad$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)
##
## indeterminate msi-h msi-l mss
## 2 83 82 288
## Parse out cancer vs normal samples
sampleTables(coad)
## $`COAD_RNASeq2Gene-20160128`
##
## 01 02 06 11
## 283 1 1 41
##
## $`COAD_RNASeq2GeneNorm_illuminaga-20160128`
##
## 01
## 191
##
## $`COAD_RNASeq2GeneNorm_illuminahiseq-20160128`
##
## 01 02 06 11
## 283 1 1 41
##
## $`COAD_RNASeqGene-20160128`
##
## 01
## 10
coad2 <- TCGAsplitAssays(coad, c("01", "11"))
coad_rna_cancer <- getWithColData(coad2,
'01_COAD_RNASeq2Gene-20160128')
assay(coad_rna_cancer) <- log2(assay(coad_rna_cancer) + 1)
coad_cancer <- colData(coad_rna_cancer)
coad_rna_normal <- getWithColData(coad2,
'11_COAD_RNASeq2Gene-20160128')
assay(coad_rna_normal) <- log2(assay(coad_rna_normal) + 1)
coad_normal <- colData(coad_rna_normal)
table(coad_cancer$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)
##
## indeterminate msi-h msi-l mss
## 2 52 48 181
table(coad_normal$patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)
##
## msi-h msi-l mss
## 9 7 25
normal_tb <- coad_normal %>%
as.data.frame() %>%
dplyr::select(patientID, patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)
cancer_tb <- coad_cancer %>%
as.data.frame() %>%
dplyr::select(patientID, patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status)
combined_tb <- inner_join(normal_tb, cancer_tb, by="patientID") # All paired cancer and normal samples have the same MSI status
heatmapTable: COAD
validate_coad_rna <- validate(coad_rna, RAVmodel)
validate_coad_normal <- validate(coad_rna_normal, RAVmodel)
validate_coad_cancer <- validate(coad_rna_cancer, RAVmodel)
heatmapTable(validate_coad_rna, RAVmodel, num.out = 10)
heatmapTable(validate_coad_normal, RAVmodel, num.out = 10, column_title="COAD Normal Samples")
## RAV1187 can be filtered based on GSEA_C2
heatmapTable(validate_coad_cancer, RAVmodel, num.out=10, column_title="COAD Cancer Samples")
## RAV61 can be filtered based on GSEA_C2



Subset
Filter attributes
# Cancer
sparsity_summary <- table(colSums(is.na(coad_cancer)))
sparsity_summary
##
## 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 17 18 19 20 23
## 80 75 20 73 2 17 2 2 1 24 3 1 25 2 1 1 1 3 1 4
## 25 30 32 33 35 36 37 38 40 41 43 44 45 48 50 54 56 57 58 59
## 1 1 1 2 16 2 2 2 13 1 1 9 1 1 1 1 13 2 1 3
## 63 64 65 66 67 68 69 72 80 81 95 96 97 98 100 105 106 112 139 148
## 8 3 5 2 3 5 2 2 3 1 5 2 1 1 1 1 1 1 1 1
## 149 154 176 177 178 179 183 184 188 189 193 198 199 204 205 206 208 209 210 211
## 1 2 5 8 2 1 1 2 1 1 3 1 1 2 9 2 1 2 5 9
## 212 213 214 215 217 218 220 223 224 225 226 227 228 230 231 232 233 234 235 236
## 2 1 2 1 3 1 1 5 1 4 2 1 1 2 15 1 3 1 13 4
## 237 238 239 240 241 242 244 246 247 248 249 250 251 252 253 254 255 256 257 258
## 1 1 5 14 6 3 2 7 3 2 3 1 1 13 2 55 5 61 18 30
## 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
## 53 4 11 6 3 10 10 16 1 13 2 41 5 43 34 44 17 8 54 103
## 279 280 281 282 283
## 280 287 115 287 351
Sparsity Plot
# Cancer
plot(stack(sparsity_summary)$ind,
stack(sparsity_summary)$values)

# Select columns with >10% completeness
# Cancer
keep_attribute_ind <- which(colSums(!is.na(coad_cancer)) > round(nrow(coad_cancer)/10))
meta_sub1 <- coad_cancer[keep_attribute_ind]
meta_sub1 <- subset(coad_cancer, select= -patientID)
# Randomly select for 140 rows
set.seed(1)
random_sample_ind <- sample(1:nrow(meta_sub1), 140)
meta_sub2 <- meta_sub1[random_sample_ind,]
# Check for data types in listData
unique(sapply(coad_cancer@listData, type))
## [1] "character" "integer" "double"
charcTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
numTb <- meta_sub2[, sapply(meta_sub2, class) %in% c('numeric', 'integer')]
# View numeric variables that have <=4 unique values to determine if they should be converted to character variables
addToFactors <- c()
for (i in 1:length(numTb)) {
if (length(table(numTb[i])) <= 4) {
addToFactors <- c(addToFactors, i)
}
}
#numTb[addToFactors] #None at this time
#charcTb <- c(charcTb, numTb[addToFactors])
#numTb <- numTb[-addToFactors]
# Calculate validation scores
sampleScore <- calculateScore(coad_rna_cancer, RAVmodel)
validated_ind <- validatedSignatures(validate_coad_cancer, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE) #Using Pearson Coefficient
## RAV61 can be filtered based on GSEA_C2
## RAV438 can be filtered based on GSEA_C2
## Subset sampleScore to join with MCPcounter
sampleScore_sub <- sampleScore[random_sample_ind, validated_ind] %>% as.data.frame()
Separate out Factor Variables by number of levels (1, 2, 3+) not
including NA
# Convert to factor data type
factorTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
#factorTb <- charcTb
factorTb[sapply(factorTb, is.character)] <- lapply(factorTb[sapply(factorTb, is.character)], factor, exclude = NULL)
#any(is.na(levels(factorTb[,2])))
single_factor_ind <- c()
binary_factor_ind <- c()
multi_factor_ind <- c()
# Testing factor grouping
for (i in 1:length(factorTb)) {
if (nlevels(factorTb[,i]) == 1 |
(nlevels(factorTb[,i]) == 2 & any(is.na(levels(factorTb[,i]))))
) {
single_factor_ind <- c(single_factor_ind, i)
} else if (nlevels(factorTb[,i]) == 3 & any(is.na(levels(factorTb[,i]))) |
(nlevels(factorTb[,i]) == 2 & !any(is.na(levels(factorTb[,i]))))
) {
binary_factor_ind <- c(binary_factor_ind, i)
} else {
multi_factor_ind <- c(multi_factor_ind, i)
}
}
new_factorTb <- factorTb[,multi_factor_ind]
binary_factor <- factorTb[,binary_factor_ind]
single_factor <- factorTb[,single_factor_ind]
Calculate Wilcoxon Test for Binomial Factor Variables
#Remove factors without enough y-observations
remove_index <- c()
for (i in seq_len(ncol(binary_factor))) {
x <- summary(binary_factor[,i])[1:2]
if (all(x > 1)) {
#print(x)
} else {
remove_index <- c(remove_index, i)
#print(paste("index", i, "does not have enough observations"))
}
}
binary_factor_2 <- binary_factor[,-remove_index]
wilcox_test_res <- as.data.frame(matrix(nrow = ncol(binary_factor_2),
ncol = ncol(sampleScore_sub)))
rownames(wilcox_test_res) <- colnames(binary_factor_2)
colnames(wilcox_test_res) <- colnames(sampleScore_sub)
wtest_coad_wvalue <- wilcox_test_res
wtest_coad_pvalue <- wilcox_test_res
for (i in seq_len(ncol(sampleScore_sub))) {
for (j in seq_len(ncol(binary_factor_2))) {
## wilcoxon test
wilcox_test <- wilcox.test(sampleScore_sub[, i] ~ binary_factor_2[,j], alternative="two.sided")
## W value
wval <- wilcox_test$statistic
wtest_coad_wvalue[j, i] <- wval
## p-value
pval <- wilcox_test$p.value
wtest_coad_pvalue[j, i] <- pval
}
}
batch_char_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
rownames(wtest_coad_wvalue))
wtest_coad_wvalue_2 <- wtest_coad_wvalue[-batch_char_ind,]
wtest_coad_pvalue_2 <- wtest_coad_pvalue[-batch_char_ind,]
heatmap(as.matrix(wtest_coad_wvalue_2), main = 'COAD Wilcoxon Test Cancer Binomial')

#Only view w-values with significant p-value (<0.01)
wtest_coad_sig_wvalue <- wilcox_test_res
for (i in seq_len(ncol(sampleScore_sub))) {
for (j in seq_len(ncol(binary_factor_2))) {
if (wilcox.test(sampleScore_sub[, i] ~ binary_factor_2[,j], alternative="two.sided")$p.val < 0.01) {
wtest_coad_sig_wvalue[j, i] <- wilcox.test(sampleScore_sub[, i] ~ binary_factor_2[,j],
alternative="two.sided")$statistic
}
}
}
na_ind <- apply(wtest_coad_sig_wvalue, 1, function(x) all(is.na(x)))
wtest_coad_sig_wvalue <- wtest_coad_sig_wvalue[!na_ind, ]
batch_char_ind_2 <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
rownames(wtest_coad_sig_wvalue))
wtest_coad_sig_wvalue <- wtest_coad_sig_wvalue[-batch_char_ind_2, ]
options(ztable.type='html')
ztable(wtest_coad_sig_wvalue) %>%
makeHeatmap(palette = 'Blues') %>%
print(caption='Cancer Sample W-test with p-values < 0.01')
Cancer Sample W-test with p-values < 0.01
Â
|
RAV188 |
RAV1575 |
RAV324 |
RAV832 |
RAV61 |
RAV1008 |
RAV868 |
RAV64 |
RAV220 |
RAV834 |
RAV833 |
RAV438 |
RAV190 |
RAV1166 |
RAV192 |
patient.braf_gene_analysis_performed
|
|
1094.00
|
362.00
|
NA
|
|
1086.00
|
349.00
|
333.00
|
1121.00
|
|
|
1142.00
|
|
352.00
|
347.00
|
patient.braf_gene_analysis_result
|
|
|
|
NA
|
|
|
|
|
|
36.00
|
|
|
|
|
|
patient.follow_ups.follow_up.2.new_tumor_events.new_tumor_event_after_initial_treatment
|
|
|
|
NA
|
|
|
58.00
|
|
269.00
|
|
|
|
|
69.00
|
48.00
|
patient.kras_gene_analysis_performed
|
1603.00
|
1698.00
|
718.00
|
NA
|
645.00
|
1623.00
|
728.00
|
544.00
|
1707.00
|
|
|
|
|
524.00
|
591.00
|
patient.lymphatic_invasion
|
|
|
|
NA
|
|
|
|
|
|
|
2078.00
|
|
|
|
|
patient.microsatellite_instability
|
|
|
|
NA
|
|
|
|
|
|
7.00
|
|
|
|
|
|
patient.tissue_prospective_collection_indicator
|
|
1682.00
|
2896.00
|
NA
|
|
1622.00
|
2861.00
|
|
|
|
2860.00
|
|
|
|
|
patient.tissue_retrospective_collection_indicator
|
|
2861.00
|
1647.00
|
NA
|
|
2921.00
|
1682.00
|
|
|
|
1683.00
|
|
|
|
|
patient.biospecimen_cqcf.path_confirm_report_attached
|
832.00
|
|
218.00
|
NA
|
|
829.00
|
211.00
|
223.00
|
827.00
|
|
204.00
|
811.00
|
|
201.00
|
|
|
Calculate Kruskal-Wallis Test for Multinomial Factor Variables
kruskal_wallis_res <- as.data.frame(matrix(nrow = ncol(new_factorTb),
ncol = ncol(sampleScore_sub)))
rownames(kruskal_wallis_res) <- colnames(new_factorTb)
colnames(kruskal_wallis_res) <- colnames(sampleScore_sub)
kwtest_coad_wvalue <- kruskal_wallis_res
kwtest_coad_pvalue <- kruskal_wallis_res
for (i in seq_len(ncol(sampleScore_sub))) {
for (j in seq_len(ncol(new_factorTb))) {
## Kruskal-Wallis Test
kruskal_test <- kruskal.test(sampleScore_sub[, i] ~ new_factorTb[,j])
## Kruskal-Wallis Chi-squared value
kw_val <- kruskal_test$statistic
kwtest_coad_wvalue[j, i] <- kw_val
## p-value
pval <- kruskal_test$p.value
kwtest_coad_pvalue[j, i] <- pval
}
}
batch_char_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
rownames(kwtest_coad_wvalue))
kwtest_coad_wvalue <- kwtest_coad_wvalue[-batch_char_ind,]
kwtest_coad_pvalue <- kwtest_coad_pvalue[-batch_char_ind,]
heatmap(as.matrix(kwtest_coad_wvalue), main = 'COAD Kruskal-Wallis Test Cancer Multinomial')

#Only view kw-values with significant p-value (<0.01)
kwtest_coad_sig_wvalue <- kruskal_wallis_res
for (i in seq_len(ncol(sampleScore_sub))) {
for (j in seq_len(ncol(new_factorTb))) {
if (kruskal.test(sampleScore_sub[, i] ~ new_factorTb[,j])$p.value < 0.01) {
kwtest_coad_sig_wvalue[j, i] <- kruskal.test(sampleScore_sub[, i] ~ new_factorTb[,j])$statistic
}
}
}
batch_char_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
rownames(kwtest_coad_sig_wvalue))
kwtest_coad_sig_wvalue <- kwtest_coad_sig_wvalue[-batch_char_ind, ]
na_ind <- apply(kwtest_coad_sig_wvalue, 1, function(x) all(is.na(x)))
kwtest_coad_sig_wvalue <- kwtest_coad_sig_wvalue[!na_ind, ]
options(ztable.type='html')
ztable(kwtest_coad_sig_wvalue) %>%
makeHeatmap(palette = 'Blues') %>%
print(caption='Cancer Sample Kruskal-Wallis Test with p-values < 0.01')
Cancer Sample Kruskal-Wallis Test with p-values < 0.01
Â
|
RAV188 |
RAV1575 |
RAV324 |
RAV832 |
RAV61 |
RAV1008 |
RAV868 |
RAV64 |
RAV220 |
RAV834 |
RAV833 |
RAV438 |
RAV190 |
RAV1166 |
RAV192 |
patient.clinical_cqcf.frozen_specimen_anatomic_site
|
|
|
|
NA
|
|
|
|
|
|
29.64
|
|
|
NA
|
|
|
patient.icd_10
|
|
|
|
NA
|
|
23.12
|
|
|
20.77
|
|
|
|
NA
|
|
|
patient.icd_o_3_site
|
|
|
|
NA
|
|
23.12
|
|
|
20.77
|
|
|
|
NA
|
|
|
patient.tissue_source_site
|
39.26
|
38.02
|
46.85
|
NA
|
|
42.07
|
52.66
|
43.61
|
47.00
|
|
40.72
|
44.97
|
NA
|
46.60
|
48.51
|
patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status
|
|
|
|
NA
|
|
|
|
|
|
46.83
|
|
|
NA
|
|
|
patient.biospecimen_cqcf.frozen_specimen_anatomic_site
|
|
|
|
NA
|
|
|
|
|
|
21.43
|
|
|
NA
|
|
|
patient.biospecimen_cqcf.site_of_disease_description
|
|
|
|
NA
|
|
|
|
|
|
21.43
|
|
|
NA
|
|
|
|
Function to Display Categorical Attributes’ Score Plots
graph_categorical <- function(RAV1, RAV2, factor.df, sampleScore.df, phenotype, graph_title = "Score Plot", legend_title = "Legend") {
# Sanity Check 1: Check if RAVs are available; if RAVs don't exist in sampleScore.df, the function will error: "undefined columns selected"
# Sanity Check 2: phenotype is in factor.df
if(nlevels(factor.df[[phenotype]]) == 0) {
print("Phenotype is not in provided factor dataframe")
}
sampleScore1 <- paste0("RAV", RAV1)
sampleScore2 <- paste0("RAV", RAV2)
new_df <- sampleScore.df[, c(sampleScore1, sampleScore2)]
colnames(new_df)[1] <- "sampleScore1"
colnames(new_df)[2] <- "sampleScore2"
new_df <- data.frame(new_df, factor.df)
plot_data <- new_df[which(!is.na(factor(new_df[[phenotype]]))),] #Filter out rows of the phenotype that are N/A
#print(plot_data[[phenotype]])
colors <- gg_color_hue(length(unique(plot_data[[phenotype]]))) # Count number of factor levels excluding NA
colors.toplot <- c(colors)
pA <- ggplot(plot_data,
aes(x = sampleScore1, y = sampleScore2, color = plot_data[[phenotype]])) +
geom_point() +
labs(title = graph_title) +
scale_color_manual(values = colors.toplot, name = legend_title) +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_vline(xintercept = 0, linetype = 'dashed') +
xlab(sampleScore1) + ylab(sampleScore2)
print(pA)
}
graph_categorical(834, 1166, new_factorTb, sampleScore_sub,
"patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status",
"Score Plot for Microsatellite Status", "Microsatellite Status")

graph_categorical(834, 188, binary_factor_2, sampleScore_sub, 'patient.braf_gene_analysis_result', 'BRAF Gene Analysis Result')
## Warning: Use of `plot_data[[phenotype]]` is discouraged.
## ℹ Use `.data[[phenotype]]` instead.
