suppressPackageStartupMessages({
library(GenomicSuperSignature)
library(curatedTCGAData)
library(MultiAssayExperiment)
library(TCGAutils)
library(tidyr)
library(dplyr)
library(ggplot2)
library(magick)
library(wordcloud)
library(EnrichmentBrowser)
library(ztable)
library(magrittr)
})
load("~/Documents/GitHub/GSS/data/TCGA_validationDatasets.rda")
datasets <- TCGA_validationDatasets[1:7]
RAVmodel <- getModel("C2", load=TRUE)
## [1] "downloading"
hnsc <- curatedTCGAData(diseaseCode = "HNSC",
assays = "RNA*",
version = "2.0.1",
dry.run = FALSE)
hnsc_rna <- getWithColData(hnsc,
"HNSC_RNASeq2Gene-20160128",
mode = "append")
hnsc_meta <- colData(hnsc_rna)
val_hnsc <- validate(datasets$HNSC, RAVmodel)
heatmapTable(val_hnsc, RAVmodel)
sparsity_summary <- table(colSums(is.na(hnsc_meta)))
sparsity_summary
##
## 0 1 2 3 4 5 7 8 10 11 12 13 14 16 23 26 41 50 54 63
## 235 7 5 23 6 20 2 1 1 1 1 2 1 4 2 3 2 1 1 1
## 72 73 76 83 86 91 100 115 133 144 147 148 150 151 164 173 175 177 186 191
## 3 1 1 1 3 5 1 1 2 1 1 1 1 1 1 1 9 1 1 1
## 192 206 212 216 223 234 241 245 246 250 252 253 256 259 261 263 266 267 269 273
## 1 1 3 3 3 1 1 1 5 3 1 1 1 2 1 1 1 1 1 1
## 289 303 308 310 311 312 313 324 325 326 329 330 333 335 341 359 362 363 367 368
## 2 1 1 2 15 15 1 10 1 2 2 1 1 1 1 1 1 1 3 1
## 369 371 373 379 380 381 382 384 386 387 389 390 391 397 402 407 410 412 414 416
## 2 1 1 1 6 2 1 1 1 1 1 1 2 1 1 5 1 1 1 1
## 420 421 425 426 427 431 433 435 437 440 446 451 452 454 456 457 460 461 462 463
## 2 4 1 1 3 1 2 1 3 1 1 1 1 60 2 3 1 1 5 24
## 466 467 469 472 473 474 475 476 477 479 480 481 482 483 485 488 489 490 491 493
## 1 3 1 1 1 3 2 1 1 17 1 1 2 1 8 41 1 4 2 1
## 495 502 505 507 508 509 510 512 515 516 517 518 519 520 521 522 523 524 525 526
## 1 1 1 1 1 17 2 2 23 1 2 4 2 2 2 19 8 2 3 8
## 527 528 529 530 531 532 533 534 537 538 539 541 542 543 544 545 546 548 549 550
## 2 1 2 10 1 5 3 3 1 1 3 3 8 3 4 5 2 14 6 4
## 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566
## 10 8 10 9 7 14 46 10 19 20 44 79 45 100 140 6
nrow(hnsc_meta)
## [1] 566
nrow(hnsc_meta)/1.5
## [1] 377.3333
nrow(hnsc_meta)/10
## [1] 56.6
# Only select for columns with more than 10% completeness
keep_attr_ind <- which(colSums(!is.na(hnsc_meta)) > round(nrow(hnsc_meta)/7.5))
meta_sub1 <- hnsc_meta[keep_attr_ind]
meta_sub1 <- subset(meta_sub1, select = -patientID)
# Randomly select for 100 rows
set.seed(1)
random_sample_ind <- sample(1:nrow(meta_sub1), 100)
meta_sub2 <- meta_sub1[random_sample_ind,]
# Check for data types in listData
unique(sapply(hnsc_meta@listData, type))
## [1] "character" "integer" "double"
charcTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
numTb <- meta_sub2[, sapply(meta_sub2, class) %in% c('numeric', 'integer')]
# Calculate validation scores
sampleScore <- calculateScore(hnsc_rna, RAVmodel)
# Select for RAVs with a minimum silhouette width of 0.4
val_all <- validate(hnsc_rna, RAVmodel)
validated_ind <- validatedSignatures(val_all, num.out = 30, RAVmodel,
swCutoff = 0.4, indexOnly = TRUE)
validated_ind
## [1] 4483 95 4489 2670 1301 1959 1302 988 987 3033 1002 1125 4689 2673 1960
## [16] 996 4486 998 999 1001 4113
# Subset sampleScore to join with MCPcounter
sampleScore_sub <- sampleScore[random_sample_ind, validated_ind] %>% as.data.frame()
# R squared value function
calculateRsq <- function (x, y) stats::cor(x, y, use = "na.or.complete") ^ 2
# Calculate r-squared for numeric attributes
rsq_numAttr_tb <- as.data.frame(matrix(nrow = ncol(numTb),
ncol = ncol(sampleScore_sub)))
colnames(rsq_numAttr_tb) <- colnames(sampleScore_sub)
rownames(rsq_numAttr_tb) <- colnames(numTb)
for (i in seq_len(ncol(numTb))) {
for (j in seq_len(ncol(sampleScore_sub))) {
rsq <- calculateRsq(numTb[,i], sampleScore_sub[,j])
rsq_numAttr_tb[i, j] <- rsq
}
}
dim(rsq_numAttr_tb) # 330 features x 21 RAVs
## [1] 326 21
# rsq_numAttr_tb[1:10, 1:20]
max_rav <- apply(rsq_numAttr_tb, 1, max)
max_attr <- which(max_rav > 0.4 & max_rav < 1) # Decreased the lower limit to 0.4
max_rav[max_attr]
## patient.drugs.drug.2.day_of_form_completion
## 0.6934929
## patient.drugs.drug.2.month_of_form_completion
## 0.7550794
## patient.drugs.drug.2.year_of_form_completion
## 0.4328534
## patient.follow_ups.follow_up.2.year_of_form_completion
## 0.4159321
## patient.radiations.radiation.2.day_of_form_completion
## 0.4097006
## patient.samples.sample.portions.portion.analytes.analyte.a260_a280_ratio
## 0.4227139
target_rsq <- rsq_numAttr_tb[max_attr,]
options(ztable.type="html")
z = ztable(target_rsq)
z %>% makeHeatmap()
 | RAV4483 | RAV95 | RAV4489 | RAV2670 | RAV1301 | RAV1959 | RAV1302 | RAV988 | RAV987 | RAV3033 | RAV1002 | RAV1125 | RAV4689 | RAV2673 | RAV1960 | RAV996 | RAV4486 | RAV998 | RAV999 | RAV1001 | RAV4113 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
patient.drugs.drug.2.day_of_form_completion | 0.11 | 0.00 | 0.00 | 0.01 | 0.01 | 0.08 | 0.10 | 0.04 | 0.02 | 0.01 | 0.00 | 0.02 | 0.07 | 0.00 | 0.69 | 0.15 | 0.03 | 0.01 | 0.01 | 0.13 | 0.01 |
# Convert to factor data type
factorTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
factorTb[sapply(factorTb, is.character)] <- lapply(factorTb[sapply(factorTb, is.character)], as.factor)
new_ind <- c()
# Select for factors with at least two possible values
for (i in 1:length(factorTb)) {
if (nlevels(factorTb[, i]) > 1 & nlevels(factorTb[, i]) < 25) {
new_ind <- c(new_ind, i)
}
}
new_factorTb <- factorTb[,new_ind]
# Calculate p-value for factor attributes
aov_factorAttr_tb <- as.data.frame(matrix(nrow = ncol(new_factorTb),
ncol = ncol(sampleScore_sub)))
rownames(aov_factorAttr_tb) <- colnames(new_factorTb)
colnames(aov_factorAttr_tb) <- colnames(sampleScore_sub)
for (i in seq_len(ncol(sampleScore_sub))) {
for (j in seq_len(ncol(new_factorTb))) {
if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1])) {
aov_factorAttr_tb[j, i] <- summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1]
} else {
next
}
}
}
# Calculate f-statistics for factor attributes
aov_factorAttr_fstat <- as.data.frame(matrix(nrow = ncol(new_factorTb),
ncol = ncol(sampleScore_sub)))
rownames(aov_factorAttr_fstat) <- colnames(new_factorTb)
colnames(aov_factorAttr_fstat) <- colnames(sampleScore_sub)
for (i in seq_len(ncol(sampleScore_sub))) {
for (j in seq_len(ncol(new_factorTb))) {
if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1])) {
aov_factorAttr_fstat[j, i] <- summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1]
} else {
next
}
}
}
summary(aov(sampleScore_sub[, 1] ~ new_factorTb[, 2]))
## Df Sum Sq Mean Sq F value Pr(>F)
## new_factorTb[, 2] 6 1.71 0.2849 0.337 0.916
## Residuals 88 74.43 0.8458
## 5 observations deleted due to missingness
summary(aov(sampleScore_sub[, 1] ~ new_factorTb[, 3]))
## Df Sum Sq Mean Sq F value Pr(>F)
## new_factorTb[, 3] 7 17.27 2.4667 3.63 0.00178 **
## Residuals 86 58.44 0.6796
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 6 observations deleted due to missingness
new_factorTb[1:5, 1:5]
## DataFrame with 5 rows and 5 columns
## pathologic_stage pathology_T_stage pathology_N_stage
## <factor> <factor> <factor>
## TCGA-CN-A6V3 stage iva t3 n2b
## TCGA-P3-A6T2 stage iva t3 n2b
## TCGA-HD-A6HZ stage iii t2 n1
## TCGA-CV-7103 stage iva t2 n2b
## TCGA-CV-6951 stage iva t4a n2c
## pathology_M_stage gender
## <factor> <factor>
## TCGA-CN-A6V3 m0 male
## TCGA-P3-A6T2 mx male
## TCGA-HD-A6HZ mx female
## TCGA-CV-7103 NA male
## TCGA-CV-6951 NA male
# Select for p-values < 0.05
min_rav <- apply(aov_factorAttr_tb, 1, min)
max_rav <- apply(aov_factorAttr_tb, 1, max)
min_attr <- which(min_rav < 0.05 & max_rav < 0.5)
head(min_rav[min_attr])
## pathology_M_stage
## 1.588612e-03
## patient.stage_event.tnm_categories.pathologic_categories.pathologic_m
## 1.588612e-03
## patient.samples.sample.2.portions.portion.analytes.analyte.2.dna.pcr_amplification_successful
## 2.466234e-09
## patient.samples.sample.2.portions.portion.analytes.analyte.dna.pcr_amplification_successful
## 4.682376e-09
## patient.samples.sample.2.portions.portion.analytes.analyte.protocols.protocol.experimental_protocol_type
## 3.381745e-05
## patient.samples.sample.portions.portion.analytes.analyte.3.dna.pcr_amplification_successful
## 2.690453e-10
target_aov <- aov_factorAttr_tb[min_attr,]
options(ztable.type="html")
ztable(target_aov) %>%
makeHeatmap(palette = 'Blues') %>%
print(caption="ANOVA p-values")
 | RAV4483 | RAV95 | RAV4489 | RAV2670 | RAV1301 | RAV1959 | RAV1302 | RAV988 | RAV987 | RAV3033 | RAV1002 | RAV1125 | RAV4689 | RAV2673 | RAV1960 | RAV996 | RAV4486 | RAV998 | RAV999 | RAV1001 | RAV4113 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pathology_M_stage | 0.38 | 0.31 | 0.09 | 0.01 | 0.01 | 0.21 | 0.03 | 0.00 | 0.05 | 0.02 | 0.49 | 0.11 | 0.02 | 0.15 | 0.41 | 0.06 | 0.03 | 0.06 | 0.29 | 0.43 | 0.08 |
# Calculate f-statistics for p-values < 0.01
aov_factorAttr_fstat_2 <- as.data.frame(matrix(nrow = ncol(new_factorTb),
ncol = ncol(sampleScore_sub)))
rownames(aov_factorAttr_fstat_2) <- colnames(new_factorTb)
colnames(aov_factorAttr_fstat_2) <- colnames(sampleScore_sub)
for (i in seq_len(ncol(sampleScore_sub))) {
for (j in seq_len(ncol(new_factorTb))) {
if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1])) {
if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1]) &
(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1] < 0.01)) {
aov_factorAttr_fstat_2[j, i] <- summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1]
} else {
next
}
} else {
next
}
}
}
fstat_ind <- c()
for (i in seq_len(ncol(new_factorTb))) {
if (sum(is.na(aov_factorAttr_fstat_2[i,])) < 15) {
fstat_ind <- c(fstat_ind, i)
} else {
next
}
}
target_aov_2 <- aov_factorAttr_fstat_2[fstat_ind,]
options(ztable.type="html")
ztable(target_aov_2) %>%
makeHeatmap(palette = 'Blues') %>%
print(caption="ANOVA f-stats")
 | RAV4483 | RAV95 | RAV4489 | RAV2670 | RAV1301 | RAV1959 | RAV1302 | RAV988 | RAV987 | RAV3033 | RAV1002 | RAV1125 | RAV4689 | RAV2673 | RAV1960 | RAV996 | RAV4486 | RAV998 | RAV999 | RAV1001 | RAV4113 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
patient.samples.sample.2.portions.portion.analytes.analyte.2.dna.pcr_amplification_successful | 15.44 | 11.29 | 36.59 | 39.67 | 11.09 | 27.42 | 31.88 | 14.29 | 43.50 | 8.21 | 13.49 | 14.34 | 12.53 | 23.78 | 11.41 | ||||||
ind <- 988
findStudiesInCluster(RAVmodel, ind, studyTitle = TRUE)
## studyName PC Variance explained (%)
## 1 SRP024274 2 11.57
## 2 SRP082973 4 1.90
## title
## 1 Smoking Dysregulates the Human Airway Basal Cell Transcriptome at COPD-linked Risk Locus 19q13.2
## 2 Mandatory Role of HMGA1 in Human Airway Epithelial Normal Differentiation and Post-injury Regeneration
subsetEnrichedPathways(RAVmodel, ind, include_nes = TRUE) %>% as.data.frame
## RAV988.Description RAV988.NES
## Up_1 ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER 4.109677
## Up_2 DUTERTRE_ESTRADIOL_RESPONSE_24HR_UP 4.087688
## Up_3 SOTIRIOU_BREAST_CANCER_GRADE_1_VS_3_UP 4.083082
## Up_4 KOBAYASHI_EGFR_SIGNALING_24HR_DN 4.007970
## Up_5 FLORIO_NEOCORTEX_BASAL_RADIAL_GLIA_DN 4.006780
## Up_6 GOBERT_OLIGODENDROCYTE_DIFFERENTIATION_UP 3.918913
## Up_7 LEE_EARLY_T_LYMPHOCYTE_UP 3.783045
## Up_8 ZHOU_CELL_CYCLE_GENES_IN_IR_RESPONSE_24HR 3.689651
## Up_9 FISCHER_G2_M_CELL_CYCLE 3.683076
## Up_10 GRAHAM_CML_DIVIDING_VS_NORMAL_QUIESCENT_UP 3.665484
drawWordcloud(RAVmodel, ind)
## Warning in wordcloud::wordcloud(words = all$word, freq = all$freq, scale =
## scale, : Chromosomes, Human, Pair 19 could not be fit on page. It will not be
## plotted.
## Warning in wordcloud::wordcloud(words = all$word, freq = all$freq, scale =
## scale, : Chronic Obstructive Pulmonary Disease could not be fit on page. It
## will not be plotted.
ind <- 1301
findStudiesInCluster(RAVmodel, ind, studyTitle = TRUE)
## studyName PC Variance explained (%)
## 1 SRP042228 1 20.14
## 2 SRP048801 1 31.86
## 3 SRP096757 1 24.58
## title
## 1 Core Ileal Transcriptome in Pediatric Crohn Disease
## 2 Ileal immune maturation in Pediatric Crohn''s Disease
## 3 Profiling of Ileal Transcriptome in Pediatric Crohn Disease
subsetEnrichedPathways(RAVmodel, ind, include_nes = TRUE) %>% as.data.frame
## RAV1301.Description RAV1301.NES
## Up_1 BILANGES_SERUM_AND_RAPAMYCIN_SENSITIVE_GENES -4.319476
## Up_2 <NA> NA
## Up_3 <NA> NA
## Up_4 <NA> NA
## Up_5 <NA> NA
## Up_6 <NA> NA
## Up_7 <NA> NA
## Up_8 <NA> NA
## Up_9 <NA> NA
## Up_10 <NA> NA
drawWordcloud(RAVmodel, ind)
## Warning in wordcloud::wordcloud(words = all$word, freq = all$freq, scale =
## scale, : Pediatric Crohn's disease could not be fit on page. It will not be
## plotted.