Set up

Load Packages

suppressPackageStartupMessages({
  library(GenomicSuperSignature)
  library(curatedTCGAData)
  library(MultiAssayExperiment)
  library(TCGAutils)
  library(tidyr)
  library(dplyr)
  library(ggplot2)
  library(magick)
  library(wordcloud)
  library(EnrichmentBrowser)
  library(ztable)
  library(magrittr)
})

Load TCGA dataset

load("~/Documents/GitHub/GSS/data/TCGA_validationDatasets.rda")
datasets <- TCGA_validationDatasets[1:7]

Load RAVmodel

RAVmodel <- getModel("C2", load=TRUE)
## [1] "downloading"

Select HNSC RNA metadata

hnsc <- curatedTCGAData(diseaseCode = "HNSC", 
                        assays = "RNA*", 
                        version = "2.0.1", 
                        dry.run = FALSE)

hnsc_rna <- getWithColData(hnsc, 
                           "HNSC_RNASeq2Gene-20160128", 
                           mode = "append")

hnsc_meta <- colData(hnsc_rna)

heatmapTable: HNSC

val_hnsc <- validate(datasets$HNSC, RAVmodel)
heatmapTable(val_hnsc, RAVmodel)

Subset

Filter attributes

sparsity_summary <- table(colSums(is.na(hnsc_meta)))
sparsity_summary
## 
##   0   1   2   3   4   5   7   8  10  11  12  13  14  16  23  26  41  50  54  63 
## 235   7   5  23   6  20   2   1   1   1   1   2   1   4   2   3   2   1   1   1 
##  72  73  76  83  86  91 100 115 133 144 147 148 150 151 164 173 175 177 186 191 
##   3   1   1   1   3   5   1   1   2   1   1   1   1   1   1   1   9   1   1   1 
## 192 206 212 216 223 234 241 245 246 250 252 253 256 259 261 263 266 267 269 273 
##   1   1   3   3   3   1   1   1   5   3   1   1   1   2   1   1   1   1   1   1 
## 289 303 308 310 311 312 313 324 325 326 329 330 333 335 341 359 362 363 367 368 
##   2   1   1   2  15  15   1  10   1   2   2   1   1   1   1   1   1   1   3   1 
## 369 371 373 379 380 381 382 384 386 387 389 390 391 397 402 407 410 412 414 416 
##   2   1   1   1   6   2   1   1   1   1   1   1   2   1   1   5   1   1   1   1 
## 420 421 425 426 427 431 433 435 437 440 446 451 452 454 456 457 460 461 462 463 
##   2   4   1   1   3   1   2   1   3   1   1   1   1  60   2   3   1   1   5  24 
## 466 467 469 472 473 474 475 476 477 479 480 481 482 483 485 488 489 490 491 493 
##   1   3   1   1   1   3   2   1   1  17   1   1   2   1   8  41   1   4   2   1 
## 495 502 505 507 508 509 510 512 515 516 517 518 519 520 521 522 523 524 525 526 
##   1   1   1   1   1  17   2   2  23   1   2   4   2   2   2  19   8   2   3   8 
## 527 528 529 530 531 532 533 534 537 538 539 541 542 543 544 545 546 548 549 550 
##   2   1   2  10   1   5   3   3   1   1   3   3   8   3   4   5   2  14   6   4 
## 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 
##  10   8  10   9   7  14  46  10  19  20  44  79  45 100 140   6

Sparsity Plot

nrow(hnsc_meta)
## [1] 566
nrow(hnsc_meta)/1.5
## [1] 377.3333
nrow(hnsc_meta)/10
## [1] 56.6
# Only select for columns with more than 10% completeness
keep_attr_ind <- which(colSums(!is.na(hnsc_meta)) > round(nrow(hnsc_meta)/7.5))
meta_sub1 <- hnsc_meta[keep_attr_ind]
meta_sub1 <- subset(meta_sub1, select = -patientID)
# Randomly select for 100 rows
set.seed(1)
random_sample_ind <- sample(1:nrow(meta_sub1), 100)
meta_sub2 <- meta_sub1[random_sample_ind,]
# Check for data types in listData
unique(sapply(hnsc_meta@listData, type))
## [1] "character" "integer"   "double"
charcTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
numTb <- meta_sub2[, sapply(meta_sub2, class) %in% c('numeric', 'integer')]
# Calculate validation scores
sampleScore <- calculateScore(hnsc_rna, RAVmodel)
# Select for RAVs with a minimum silhouette width of 0.4
val_all <- validate(hnsc_rna, RAVmodel)
validated_ind <- validatedSignatures(val_all, num.out = 30, RAVmodel,
                                     swCutoff = 0.4, indexOnly = TRUE)
validated_ind
##  [1] 4483   95 4489 2670 1301 1959 1302  988  987 3033 1002 1125 4689 2673 1960
## [16]  996 4486  998  999 1001 4113
# Subset sampleScore to join with MCPcounter
sampleScore_sub <- sampleScore[random_sample_ind, validated_ind] %>% as.data.frame() 

Calculate r-squared value

Numeric variables

# R squared value function
calculateRsq <- function (x, y) stats::cor(x, y, use = "na.or.complete") ^ 2
# Calculate r-squared for numeric attributes
rsq_numAttr_tb <- as.data.frame(matrix(nrow = ncol(numTb), 
                                       ncol = ncol(sampleScore_sub)))

colnames(rsq_numAttr_tb) <- colnames(sampleScore_sub)
rownames(rsq_numAttr_tb) <- colnames(numTb)

for (i in seq_len(ncol(numTb))) {
    for (j in seq_len(ncol(sampleScore_sub))) {
        rsq <- calculateRsq(numTb[,i], sampleScore_sub[,j])
        rsq_numAttr_tb[i, j] <- rsq
    }
}
dim(rsq_numAttr_tb) # 330 features x 21 RAVs
## [1] 326  21
# rsq_numAttr_tb[1:10, 1:20]
max_rav <- apply(rsq_numAttr_tb, 1, max)
max_attr <- which(max_rav > 0.4 & max_rav < 1) # Decreased the lower limit to 0.4
max_rav[max_attr]
##                              patient.drugs.drug.2.day_of_form_completion 
##                                                                0.6934929 
##                            patient.drugs.drug.2.month_of_form_completion 
##                                                                0.7550794 
##                             patient.drugs.drug.2.year_of_form_completion 
##                                                                0.4328534 
##                   patient.follow_ups.follow_up.2.year_of_form_completion 
##                                                                0.4159321 
##                    patient.radiations.radiation.2.day_of_form_completion 
##                                                                0.4097006 
## patient.samples.sample.portions.portion.analytes.analyte.a260_a280_ratio 
##                                                                0.4227139

Features with an r squared value > 0.4

target_rsq <- rsq_numAttr_tb[max_attr,]

heatmapTable

options(ztable.type="html")
z = ztable(target_rsq)
z %>% makeHeatmap()
  RAV4483 RAV95 RAV4489 RAV2670 RAV1301 RAV1959 RAV1302 RAV988 RAV987 RAV3033 RAV1002 RAV1125 RAV4689 RAV2673 RAV1960 RAV996 RAV4486 RAV998 RAV999 RAV1001 RAV4113
patient.drugs.drug.2.day_of_form_completion 0.11 0.00 0.00 0.01 0.01 0.08 0.10 0.04 0.02 0.01 0.00 0.02 0.07 0.00 0.69 0.15 0.03 0.01 0.01 0.13 0.01
patient.drugs.drug.2.month_of_form_completion 0.05 0.15 0.06 0.24 0.27 0.21 0.32 0.19 0.39 0.18 0.06 0.06 0.18 0.09 0.12 0.03 0.00 0.76 0.03 0.42 0.37
patient.drugs.drug.2.year_of_form_completion 0.31 0.12 0.20 0.02 0.02 0.12 0.01 0.03 0.00 0.09 0.06 0.21 0.25 0.06 0.39 0.01 0.43 0.04 0.01 0.17 0.03
patient.follow_ups.follow_up.2.year_of_form_completion 0.20 0.30 0.05 0.19 0.21 0.06 0.12 0.21 0.09 0.42 0.09 0.06 0.08 0.13 0.01 0.08 0.05 0.09 0.00 0.06 0.05
patient.radiations.radiation.2.day_of_form_completion 0.21 0.17 0.21 0.00 0.00 0.08 0.01 0.04 0.00 0.10 0.07 0.41 0.41 0.30 0.29 0.00 0.11 0.02 0.03 0.12 0.03
patient.samples.sample.portions.portion.analytes.analyte.a260_a280_ratio 0.40 0.42 0.04 0.17 0.18 0.02 0.04 0.13 0.09 0.32 0.37 0.04 0.16 0.26 0.03 0.02 0.01 0.15 0.19 0.00 0.03

Calculate F-Statistic (ANOVA)

Character variables

# Convert to factor data type
factorTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
factorTb[sapply(factorTb, is.character)] <- lapply(factorTb[sapply(factorTb, is.character)], as.factor)

new_ind <- c()

# Select for factors with at least two possible values
for (i in 1:length(factorTb)) {
  if (nlevels(factorTb[, i]) > 1 & nlevels(factorTb[, i]) < 25) {
    new_ind <- c(new_ind, i)
  }
}

new_factorTb <- factorTb[,new_ind]
# Calculate p-value for factor attributes
aov_factorAttr_tb <- as.data.frame(matrix(nrow = ncol(new_factorTb),
                                          ncol = ncol(sampleScore_sub)))

rownames(aov_factorAttr_tb) <- colnames(new_factorTb)
colnames(aov_factorAttr_tb) <- colnames(sampleScore_sub)

for (i in seq_len(ncol(sampleScore_sub))) {
  for (j in seq_len(ncol(new_factorTb))) {
    if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1])) {
      aov_factorAttr_tb[j, i] <- summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1]
    } else {
      next
    }
  }
}
# Calculate f-statistics for factor attributes
aov_factorAttr_fstat <- as.data.frame(matrix(nrow = ncol(new_factorTb),
                                          ncol = ncol(sampleScore_sub)))

rownames(aov_factorAttr_fstat) <- colnames(new_factorTb)
colnames(aov_factorAttr_fstat) <- colnames(sampleScore_sub)

for (i in seq_len(ncol(sampleScore_sub))) {
  for (j in seq_len(ncol(new_factorTb))) {
    if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1])) {
      aov_factorAttr_fstat[j, i] <- summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1]
    } else {
      next
    }
  }
}

summary(aov(sampleScore_sub[, 1] ~ new_factorTb[, 2]))
##                   Df Sum Sq Mean Sq F value Pr(>F)
## new_factorTb[, 2]  6   1.71  0.2849   0.337  0.916
## Residuals         88  74.43  0.8458               
## 5 observations deleted due to missingness
summary(aov(sampleScore_sub[, 1] ~ new_factorTb[, 3]))
##                   Df Sum Sq Mean Sq F value  Pr(>F)   
## new_factorTb[, 3]  7  17.27  2.4667    3.63 0.00178 **
## Residuals         86  58.44  0.6796                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 6 observations deleted due to missingness
new_factorTb[1:5, 1:5]
## DataFrame with 5 rows and 5 columns
##              pathologic_stage pathology_T_stage pathology_N_stage
##                      <factor>          <factor>          <factor>
## TCGA-CN-A6V3        stage iva               t3                n2b
## TCGA-P3-A6T2        stage iva               t3                n2b
## TCGA-HD-A6HZ        stage iii               t2                n1 
## TCGA-CV-7103        stage iva               t2                n2b
## TCGA-CV-6951        stage iva               t4a               n2c
##              pathology_M_stage   gender
##                       <factor> <factor>
## TCGA-CN-A6V3                m0   male  
## TCGA-P3-A6T2                mx   male  
## TCGA-HD-A6HZ                mx   female
## TCGA-CV-7103                NA   male  
## TCGA-CV-6951                NA   male
# Select for p-values < 0.05
min_rav <- apply(aov_factorAttr_tb, 1, min)
max_rav <- apply(aov_factorAttr_tb, 1, max)
min_attr <- which(min_rav < 0.05 & max_rav < 0.5)
head(min_rav[min_attr])
##                                                                                        pathology_M_stage 
##                                                                                             1.588612e-03 
##                                    patient.stage_event.tnm_categories.pathologic_categories.pathologic_m 
##                                                                                             1.588612e-03 
##            patient.samples.sample.2.portions.portion.analytes.analyte.2.dna.pcr_amplification_successful 
##                                                                                             2.466234e-09 
##              patient.samples.sample.2.portions.portion.analytes.analyte.dna.pcr_amplification_successful 
##                                                                                             4.682376e-09 
## patient.samples.sample.2.portions.portion.analytes.analyte.protocols.protocol.experimental_protocol_type 
##                                                                                             3.381745e-05 
##              patient.samples.sample.portions.portion.analytes.analyte.3.dna.pcr_amplification_successful 
##                                                                                             2.690453e-10

Features with a p-value < 0.05

target_aov <- aov_factorAttr_tb[min_attr,]

heatmapTable

options(ztable.type="html")
ztable(target_aov) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption="ANOVA p-values")
ANOVA p-values
  RAV4483 RAV95 RAV4489 RAV2670 RAV1301 RAV1959 RAV1302 RAV988 RAV987 RAV3033 RAV1002 RAV1125 RAV4689 RAV2673 RAV1960 RAV996 RAV4486 RAV998 RAV999 RAV1001 RAV4113
pathology_M_stage 0.38 0.31 0.09 0.01 0.01 0.21 0.03 0.00 0.05 0.02 0.49 0.11 0.02 0.15 0.41 0.06 0.03 0.06 0.29 0.43 0.08
patient.stage_event.tnm_categories.pathologic_categories.pathologic_m 0.38 0.31 0.09 0.01 0.01 0.21 0.03 0.00 0.05 0.02 0.49 0.11 0.02 0.15 0.41 0.06 0.03 0.06 0.29 0.43 0.08
patient.samples.sample.2.portions.portion.analytes.analyte.2.dna.pcr_amplification_successful 0.00 0.00 0.05 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.24 0.01 0.00 0.00 0.01 0.01 0.00 0.00 0.03 0.02 0.00
patient.samples.sample.2.portions.portion.analytes.analyte.dna.pcr_amplification_successful 0.00 0.00 0.10 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.19 0.01 0.00 0.00 0.05 0.08 0.00 0.00 0.12 0.02 0.00
patient.samples.sample.2.portions.portion.analytes.analyte.protocols.protocol.experimental_protocol_type 0.00 0.02 0.19 0.00 0.00 0.21 0.01 0.00 0.08 0.00 0.27 0.02 0.03 0.00 0.19 0.20 0.04 0.00 0.33 0.39 0.01
patient.samples.sample.portions.portion.analytes.analyte.3.dna.pcr_amplification_successful 0.00 0.00 0.07 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.19 0.01 0.00 0.00 0.01 0.02 0.00 0.00 0.04 0.01 0.00
patient.samples.sample.portions.portion.analytes.analyte.dna.pcr_amplification_successful 0.00 0.00 0.10 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.19 0.01 0.00 0.00 0.05 0.08 0.00 0.00 0.12 0.02 0.00
patient.samples.sample.portions.shipment_portion.plate_id 0.00 0.02 0.26 0.00 0.00 0.21 0.00 0.00 0.11 0.00 0.32 0.21 0.33 0.02 0.17 0.12 0.12 0.00 0.07 0.27 0.01
# Calculate f-statistics for p-values < 0.01
aov_factorAttr_fstat_2 <- as.data.frame(matrix(nrow = ncol(new_factorTb),
                                          ncol = ncol(sampleScore_sub)))

rownames(aov_factorAttr_fstat_2) <- colnames(new_factorTb)
colnames(aov_factorAttr_fstat_2) <- colnames(sampleScore_sub)

for (i in seq_len(ncol(sampleScore_sub))) {
  for (j in seq_len(ncol(new_factorTb))) {
    if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1])) {
      if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1]) & 
          (summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1] < 0.01)) {
            aov_factorAttr_fstat_2[j, i] <- summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1]
      } else {
        next
      }
    } else {
      next
    }
  }
}
fstat_ind <- c()

for (i in seq_len(ncol(new_factorTb))) {
  if (sum(is.na(aov_factorAttr_fstat_2[i,])) < 15) {
    fstat_ind <- c(fstat_ind, i)
  } else {
    next
  }
}

target_aov_2 <- aov_factorAttr_fstat_2[fstat_ind,]

heatmapTable

options(ztable.type="html")
ztable(target_aov_2) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption="ANOVA f-stats")
ANOVA f-stats
  RAV4483 RAV95 RAV4489 RAV2670 RAV1301 RAV1959 RAV1302 RAV988 RAV987 RAV3033 RAV1002 RAV1125 RAV4689 RAV2673 RAV1960 RAV996 RAV4486 RAV998 RAV999 RAV1001 RAV4113
patient.samples.sample.2.portions.portion.analytes.analyte.2.dna.pcr_amplification_successful 15.44 11.29 36.59 39.67 11.09 27.42 31.88 14.29 43.50 8.21 13.49 14.34 12.53 23.78 11.41
patient.samples.sample.2.portions.portion.analytes.analyte.dna.pcr_amplification_successful 18.39 12.81 29.56 32.88 8.83 21.42 25.60 12.02 41.33 7.55 15.20 14.21 9.15 21.92 11.89
patient.samples.sample.2.portions.portion.analytes.analyte.protocols.protocol.experimental_protocol_type 6.33 6.94 7.80 5.37 9.36 11.47 6.47 7.50 5.06
patient.samples.sample.portions.portion.analytes.analyte.2.aliquots.aliquot.2.plate_id 3.17 3.52 2.38 4.07 3.79 2.27 2.63
patient.samples.sample.portions.portion.analytes.analyte.2.aliquots.aliquot.plate_id 3.17 3.52 2.38 4.07 3.79 2.27 2.63
patient.samples.sample.portions.portion.analytes.analyte.3.dna.pcr_amplification_successful 17.37 13.55 40.77 44.62 12.87 30.25 33.82 16.47 49.52 7.56 15.94 15.38 7.50 11.36 27.03 7.52 12.36
patient.samples.sample.portions.portion.analytes.analyte.aliquots.aliquot.2.plate_id 3.17 3.52 2.38 4.07 3.79 2.27 2.63
patient.samples.sample.portions.portion.analytes.analyte.aliquots.aliquot.3.plate_id 3.17 3.52 2.38 4.07 3.79 2.27 2.63
patient.samples.sample.portions.portion.analytes.analyte.aliquots.aliquot.plate_id 3.17 3.52 2.38 4.07 3.79 2.27 2.63
patient.samples.sample.portions.portion.analytes.analyte.dna.pcr_amplification_successful 18.39 12.81 29.56 32.88 8.83 21.42 25.60 12.02 41.33 7.55 15.20 14.21 9.15 21.92 11.89
patient.samples.sample.portions.shipment_portion.plate_id 4.83 4.56 5.32 4.45 5.16 4.97 7.29

RAV Exploration

ind <- 988
findStudiesInCluster(RAVmodel, ind, studyTitle = TRUE)
##   studyName PC Variance explained (%)
## 1 SRP024274  2                  11.57
## 2 SRP082973  4                   1.90
##                                                                                                    title
## 1       Smoking Dysregulates the Human Airway Basal Cell Transcriptome at COPD-linked Risk Locus 19q13.2
## 2 Mandatory Role of HMGA1 in Human Airway Epithelial Normal Differentiation and Post-injury Regeneration
subsetEnrichedPathways(RAVmodel, ind, include_nes = TRUE) %>% as.data.frame
##                                RAV988.Description RAV988.NES
## Up_1  ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER   4.109677
## Up_2          DUTERTRE_ESTRADIOL_RESPONSE_24HR_UP   4.087688
## Up_3       SOTIRIOU_BREAST_CANCER_GRADE_1_VS_3_UP   4.083082
## Up_4             KOBAYASHI_EGFR_SIGNALING_24HR_DN   4.007970
## Up_5        FLORIO_NEOCORTEX_BASAL_RADIAL_GLIA_DN   4.006780
## Up_6    GOBERT_OLIGODENDROCYTE_DIFFERENTIATION_UP   3.918913
## Up_7                    LEE_EARLY_T_LYMPHOCYTE_UP   3.783045
## Up_8    ZHOU_CELL_CYCLE_GENES_IN_IR_RESPONSE_24HR   3.689651
## Up_9                      FISCHER_G2_M_CELL_CYCLE   3.683076
## Up_10  GRAHAM_CML_DIVIDING_VS_NORMAL_QUIESCENT_UP   3.665484
drawWordcloud(RAVmodel, ind)
## Warning in wordcloud::wordcloud(words = all$word, freq = all$freq, scale =
## scale, : Chromosomes, Human, Pair 19 could not be fit on page. It will not be
## plotted.
## Warning in wordcloud::wordcloud(words = all$word, freq = all$freq, scale =
## scale, : Chronic Obstructive Pulmonary Disease could not be fit on page. It
## will not be plotted.

ind <- 1301
findStudiesInCluster(RAVmodel, ind, studyTitle = TRUE)
##   studyName PC Variance explained (%)
## 1 SRP042228  1                  20.14
## 2 SRP048801  1                  31.86
## 3 SRP096757  1                  24.58
##                                                         title
## 1         Core Ileal Transcriptome in Pediatric Crohn Disease
## 2       Ileal immune maturation in Pediatric Crohn''s Disease
## 3 Profiling of Ileal Transcriptome in Pediatric Crohn Disease
subsetEnrichedPathways(RAVmodel, ind, include_nes = TRUE) %>% as.data.frame
##                                RAV1301.Description RAV1301.NES
## Up_1  BILANGES_SERUM_AND_RAPAMYCIN_SENSITIVE_GENES   -4.319476
## Up_2                                          <NA>          NA
## Up_3                                          <NA>          NA
## Up_4                                          <NA>          NA
## Up_5                                          <NA>          NA
## Up_6                                          <NA>          NA
## Up_7                                          <NA>          NA
## Up_8                                          <NA>          NA
## Up_9                                          <NA>          NA
## Up_10                                         <NA>          NA
drawWordcloud(RAVmodel, ind)
## Warning in wordcloud::wordcloud(words = all$word, freq = all$freq, scale =
## scale, : Pediatric Crohn's disease could not be fit on page. It will not be
## plotted.