Set up

Load Packages

suppressPackageStartupMessages({
  library(GenomicSuperSignature)
  library(curatedTCGAData)
  library(MultiAssayExperiment)
  library(TCGAutils)
  library(tidyr)
  library(dplyr)
  library(ggplot2)
  library(magick)
  library(wordcloud)
  library(EnrichmentBrowser)
  library(ztable)
  library(magrittr)
})

Load TCGA dataset

load("~/Documents/GitHub/GSS/data/TCGA_validationDatasets.rda")
datasets <- TCGA_validationDatasets[1:7]

Load RAVmodel

RAVmodel <- getModel("C2", load=TRUE)

## [1] "downloading"

Select HNSC RNA metadata

hnsc <- curatedTCGAData(diseaseCode = "HNSC", 
                        assays = "RNA*", 
                        version = "2.0.1", 
                        dry.run = FALSE)

hnsc_rna <- getWithColData(hnsc, 
                           "HNSC_RNASeq2Gene-20160128", 
                           mode = "append")

hnsc_meta <- colData(hnsc_rna)

heatmapTable: HNSC

val_hnsc <- validate(datasets$HNSC, RAVmodel)
heatmapTable(val_hnsc, RAVmodel)

Subset

Filter attributes

sparsity_summary <- table(colSums(is.na(hnsc_meta)))
sparsity_summary

## 
##   0   1   2   3   4   5   7   8  10  11  12  13  14  16  23  26  41  50  54  63 
## 235   7   5  23   6  20   2   1   1   1   1   2   1   4   2   3   2   1   1   1 
##  72  73  76  83  86  91 100 115 133 144 147 148 150 151 164 173 175 177 186 191 
##   3   1   1   1   3   5   1   1   2   1   1   1   1   1   1   1   9   1   1   1 
## 192 206 212 216 223 234 241 245 246 250 252 253 256 259 261 263 266 267 269 273 
##   1   1   3   3   3   1   1   1   5   3   1   1   1   2   1   1   1   1   1   1 
## 289 303 308 310 311 312 313 324 325 326 329 330 333 335 341 359 362 363 367 368 
##   2   1   1   2  15  15   1  10   1   2   2   1   1   1   1   1   1   1   3   1 
## 369 371 373 379 380 381 382 384 386 387 389 390 391 397 402 407 410 412 414 416 
##   2   1   1   1   6   2   1   1   1   1   1   1   2   1   1   5   1   1   1   1 
## 420 421 425 426 427 431 433 435 437 440 446 451 452 454 456 457 460 461 462 463 
##   2   4   1   1   3   1   2   1   3   1   1   1   1  60   2   3   1   1   5  24 
## 466 467 469 472 473 474 475 476 477 479 480 481 482 483 485 488 489 490 491 493 
##   1   3   1   1   1   3   2   1   1  17   1   1   2   1   8  41   1   4   2   1 
## 495 502 505 507 508 509 510 512 515 516 517 518 519 520 521 522 523 524 525 526 
##   1   1   1   1   1  17   2   2  23   1   2   4   2   2   2  19   8   2   3   8 
## 527 528 529 530 531 532 533 534 537 538 539 541 542 543 544 545 546 548 549 550 
##   2   1   2  10   1   5   3   3   1   1   3   3   8   3   4   5   2  14   6   4 
## 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 
##  10   8  10   9   7  14  46  10  19  20  44  79  45 100 140   6

Sparsity Plot

nrow(hnsc_meta)

## [1] 566

nrow(hnsc_meta)/1.5

## [1] 377.3333

nrow(hnsc_meta)/10

## [1] 56.6

# Only select for columns with more than 10% completeness
keep_attr_ind <- which(colSums(!is.na(hnsc_meta)) > round(nrow(hnsc_meta)/7.5))
meta_sub1 <- hnsc_meta[keep_attr_ind]
meta_sub1 <- subset(meta_sub1, select = -patientID)

# Randomly select for 100 rows
set.seed(1)
random_sample_ind <- sample(1:nrow(meta_sub1), 100)
meta_sub2 <- meta_sub1[random_sample_ind,]

# Check for data types in listData
unique(sapply(hnsc_meta@listData, type))

## [1] "character" "integer"   "double"

charcTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
numTb <- meta_sub2[, sapply(meta_sub2, class) %in% c('numeric', 'integer')]

# Calculate validation scores
sampleScore <- calculateScore(hnsc_rna, RAVmodel)

# Select for RAVs with a minimum silhouette width of 0.4
val_all <- validate(hnsc_rna, RAVmodel)
validated_ind <- validatedSignatures(val_all, num.out = 30, RAVmodel,
                                     swCutoff = 0.4, indexOnly = TRUE)
validated_ind

##  [1] 4483   95 4489 2670 1301 1959 1302  988  987 3033 1002 1125 4689 2673 1960
## [16]  996 4486  998  999 1001 4113

# Subset sampleScore to join with MCPcounter
sampleScore_sub <- sampleScore[random_sample_ind, validated_ind] %>% as.data.frame()

Calculate r-squared value

Numeric variables

# R squared value function
calculateRsq <- function (x, y) stats::cor(x, y, use = "na.or.complete") ^ 2

# Calculate r-squared for numeric attributes
rsq_numAttr_tb <- as.data.frame(matrix(nrow = ncol(numTb), 
                                       ncol = ncol(sampleScore_sub)))

colnames(rsq_numAttr_tb) <- colnames(sampleScore_sub)
rownames(rsq_numAttr_tb) <- colnames(numTb)

for (i in seq_len(ncol(numTb))) {
    for (j in seq_len(ncol(sampleScore_sub))) {
        rsq <- calculateRsq(numTb[,i], sampleScore_sub[,j])
        rsq_numAttr_tb[i, j] <- rsq
    }
}

dim(rsq_numAttr_tb) # 330 features x 21 RAVs

## [1] 326  21

# rsq_numAttr_tb[1:10, 1:20]
max_rav <- apply(rsq_numAttr_tb, 1, max)
max_attr <- which(max_rav > 0.4 & max_rav < 1) # Decreased the lower limit to 0.4
max_rav[max_attr]

##                              patient.drugs.drug.2.day_of_form_completion 
##                                                                0.6934929 
##                            patient.drugs.drug.2.month_of_form_completion 
##                                                                0.7550794 
##                             patient.drugs.drug.2.year_of_form_completion 
##                                                                0.4328534 
##                   patient.follow_ups.follow_up.2.year_of_form_completion 
##                                                                0.4159321 
##                    patient.radiations.radiation.2.day_of_form_completion 
##                                                                0.4097006 
## patient.samples.sample.portions.portion.analytes.analyte.a260_a280_ratio 
##                                                                0.4227139

Features with an r squared value > 0.4

target_rsq <- rsq_numAttr_tb[max_attr,]

heatmapTable

options(ztable.type="html")
z = ztable(target_rsq)
z %>% makeHeatmap()


	RAV4483	RAV95	RAV4489	RAV2670	RAV1301	RAV1959	RAV1302	RAV988	RAV987	RAV3033	RAV1002	RAV1125	RAV4689	RAV2673	RAV1960	RAV996	RAV4486	RAV998	RAV999	RAV1001	RAV4113
patient.drugs.drug.2.day_of_form_completion	0.11	0.00	0.00	0.01	0.01	0.08	0.10	0.04	0.02	0.01	0.00	0.02	0.07	0.00	0.69	0.15	0.03	0.01	0.01	0.13	0.01
patient.drugs.drug.2.month_of_form_completion	0.05	0.15	0.06	0.24	0.27	0.21	0.32	0.19	0.39	0.18	0.06	0.06	0.18	0.09	0.12	0.03	0.00	0.76	0.03	0.42	0.37
patient.drugs.drug.2.year_of_form_completion	0.31	0.12	0.20	0.02	0.02	0.12	0.01	0.03	0.00	0.09	0.06	0.21	0.25	0.06	0.39	0.01	0.43	0.04	0.01	0.17	0.03
patient.follow_ups.follow_up.2.year_of_form_completion	0.20	0.30	0.05	0.19	0.21	0.06	0.12	0.21	0.09	0.42	0.09	0.06	0.08	0.13	0.01	0.08	0.05	0.09	0.00	0.06	0.05
patient.radiations.radiation.2.day_of_form_completion	0.21	0.17	0.21	0.00	0.00	0.08	0.01	0.04	0.00	0.10	0.07	0.41	0.41	0.30	0.29	0.00	0.11	0.02	0.03	0.12	0.03
patient.samples.sample.portions.portion.analytes.analyte.a260_a280_ratio	0.40	0.42	0.04	0.17	0.18	0.02	0.04	0.13	0.09	0.32	0.37	0.04	0.16	0.26	0.03	0.02	0.01	0.15	0.19	0.00	0.03

Calculate F-Statistic (ANOVA)

Character variables

# Convert to factor data type
factorTb <- meta_sub2[, sapply(meta_sub2, class) == 'character']
factorTb[sapply(factorTb, is.character)] <- lapply(factorTb[sapply(factorTb, is.character)], as.factor)

new_ind <- c()

# Select for factors with at least two possible values
for (i in 1:length(factorTb)) {
  if (nlevels(factorTb[, i]) > 1 & nlevels(factorTb[, i]) < 25) {
    new_ind <- c(new_ind, i)
  }
}

new_factorTb <- factorTb[,new_ind]

# Calculate p-value for factor attributes
aov_factorAttr_tb <- as.data.frame(matrix(nrow = ncol(new_factorTb),
                                          ncol = ncol(sampleScore_sub)))

rownames(aov_factorAttr_tb) <- colnames(new_factorTb)
colnames(aov_factorAttr_tb) <- colnames(sampleScore_sub)

for (i in seq_len(ncol(sampleScore_sub))) {
  for (j in seq_len(ncol(new_factorTb))) {
    if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1])) {
      aov_factorAttr_tb[j, i] <- summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1]
    } else {
      next
    }
  }
}

# Calculate f-statistics for factor attributes
aov_factorAttr_fstat <- as.data.frame(matrix(nrow = ncol(new_factorTb),
                                          ncol = ncol(sampleScore_sub)))

rownames(aov_factorAttr_fstat) <- colnames(new_factorTb)
colnames(aov_factorAttr_fstat) <- colnames(sampleScore_sub)

for (i in seq_len(ncol(sampleScore_sub))) {
  for (j in seq_len(ncol(new_factorTb))) {
    if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1])) {
      aov_factorAttr_fstat[j, i] <- summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1]
    } else {
      next
    }
  }
}

summary(aov(sampleScore_sub[, 1] ~ new_factorTb[, 2]))

##                   Df Sum Sq Mean Sq F value Pr(>F)
## new_factorTb[, 2]  6   1.71  0.2849   0.337  0.916
## Residuals         88  74.43  0.8458               
## 5 observations deleted due to missingness

summary(aov(sampleScore_sub[, 1] ~ new_factorTb[, 3]))

##                   Df Sum Sq Mean Sq F value  Pr(>F)   
## new_factorTb[, 3]  7  17.27  2.4667    3.63 0.00178 **
## Residuals         86  58.44  0.6796                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 6 observations deleted due to missingness

new_factorTb[1:5, 1:5]

## DataFrame with 5 rows and 5 columns
##              pathologic_stage pathology_T_stage pathology_N_stage
##                      <factor>          <factor>          <factor>
## TCGA-CN-A6V3        stage iva               t3                n2b
## TCGA-P3-A6T2        stage iva               t3                n2b
## TCGA-HD-A6HZ        stage iii               t2                n1 
## TCGA-CV-7103        stage iva               t2                n2b
## TCGA-CV-6951        stage iva               t4a               n2c
##              pathology_M_stage   gender
##                       <factor> <factor>
## TCGA-CN-A6V3                m0   male  
## TCGA-P3-A6T2                mx   male  
## TCGA-HD-A6HZ                mx   female
## TCGA-CV-7103                NA   male  
## TCGA-CV-6951                NA   male

# Select for p-values < 0.05
min_rav <- apply(aov_factorAttr_tb, 1, min)
max_rav <- apply(aov_factorAttr_tb, 1, max)
min_attr <- which(min_rav < 0.05 & max_rav < 0.5)
head(min_rav[min_attr])

##                                                                                        pathology_M_stage 
##                                                                                             1.588612e-03 
##                                    patient.stage_event.tnm_categories.pathologic_categories.pathologic_m 
##                                                                                             1.588612e-03 
##            patient.samples.sample.2.portions.portion.analytes.analyte.2.dna.pcr_amplification_successful 
##                                                                                             2.466234e-09 
##              patient.samples.sample.2.portions.portion.analytes.analyte.dna.pcr_amplification_successful 
##                                                                                             4.682376e-09 
## patient.samples.sample.2.portions.portion.analytes.analyte.protocols.protocol.experimental_protocol_type 
##                                                                                             3.381745e-05 
##              patient.samples.sample.portions.portion.analytes.analyte.3.dna.pcr_amplification_successful 
##                                                                                             2.690453e-10

Features with a p-value < 0.05

target_aov <- aov_factorAttr_tb[min_attr,]

heatmapTable

options(ztable.type="html")
ztable(target_aov) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption="ANOVA p-values")

ANOVA p-values
	RAV4483	RAV95	RAV4489	RAV2670	RAV1301	RAV1959	RAV1302	RAV988	RAV987	RAV3033	RAV1002	RAV1125	RAV4689	RAV2673	RAV1960	RAV996	RAV4486	RAV998	RAV999	RAV1001	RAV4113
pathology_M_stage	0.38	0.31	0.09	0.01	0.01	0.21	0.03	0.00	0.05	0.02	0.49	0.11	0.02	0.15	0.41	0.06	0.03	0.06	0.29	0.43	0.08
patient.stage_event.tnm_categories.pathologic_categories.pathologic_m	0.38	0.31	0.09	0.01	0.01	0.21	0.03	0.00	0.05	0.02	0.49	0.11	0.02	0.15	0.41	0.06	0.03	0.06	0.29	0.43	0.08
patient.samples.sample.2.portions.portion.analytes.analyte.2.dna.pcr_amplification_successful	0.00	0.00	0.05	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.24	0.01	0.00	0.00	0.01	0.01	0.00	0.00	0.03	0.02	0.00
patient.samples.sample.2.portions.portion.analytes.analyte.dna.pcr_amplification_successful	0.00	0.00	0.10	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.19	0.01	0.00	0.00	0.05	0.08	0.00	0.00	0.12	0.02	0.00
patient.samples.sample.2.portions.portion.analytes.analyte.protocols.protocol.experimental_protocol_type	0.00	0.02	0.19	0.00	0.00	0.21	0.01	0.00	0.08	0.00	0.27	0.02	0.03	0.00	0.19	0.20	0.04	0.00	0.33	0.39	0.01
patient.samples.sample.portions.portion.analytes.analyte.3.dna.pcr_amplification_successful	0.00	0.00	0.07	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.19	0.01	0.00	0.00	0.01	0.02	0.00	0.00	0.04	0.01	0.00
patient.samples.sample.portions.portion.analytes.analyte.dna.pcr_amplification_successful	0.00	0.00	0.10	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.19	0.01	0.00	0.00	0.05	0.08	0.00	0.00	0.12	0.02	0.00
patient.samples.sample.portions.shipment_portion.plate_id	0.00	0.02	0.26	0.00	0.00	0.21	0.00	0.00	0.11	0.00	0.32	0.21	0.33	0.02	0.17	0.12	0.12	0.00	0.07	0.27	0.01

# Calculate f-statistics for p-values < 0.01
aov_factorAttr_fstat_2 <- as.data.frame(matrix(nrow = ncol(new_factorTb),
                                          ncol = ncol(sampleScore_sub)))

rownames(aov_factorAttr_fstat_2) <- colnames(new_factorTb)
colnames(aov_factorAttr_fstat_2) <- colnames(sampleScore_sub)

for (i in seq_len(ncol(sampleScore_sub))) {
  for (j in seq_len(ncol(new_factorTb))) {
    if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1])) {
      if (!is.null(summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1]) & 
          (summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`Pr(>F)`[1] < 0.01)) {
            aov_factorAttr_fstat_2[j, i] <- summary(aov(sampleScore_sub[, i] ~ new_factorTb[, j]))[[1]]$`F value`[1]
      } else {
        next
      }
    } else {
      next
    }
  }
}

fstat_ind <- c()

for (i in seq_len(ncol(new_factorTb))) {
  if (sum(is.na(aov_factorAttr_fstat_2[i,])) < 15) {
    fstat_ind <- c(fstat_ind, i)
  } else {
    next
  }
}

target_aov_2 <- aov_factorAttr_fstat_2[fstat_ind,]

heatmapTable

options(ztable.type="html")
ztable(target_aov_2) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption="ANOVA f-stats")

ANOVA f-stats
	RAV4483	RAV95	RAV2670	RAV1301	RAV1959	RAV1302	RAV988	RAV987	RAV3033	RAV1125	RAV4689	RAV2673	RAV1960	RAV4486	RAV998	RAV1001	RAV4113
patient.samples.sample.2.portions.portion.analytes.analyte.2.dna.pcr_amplification_successful	15.44	11.29	36.59	39.67	11.09	27.42	31.88	14.29	43.50	8.21	13.49	14.34		12.53	23.78		11.41
patient.samples.sample.2.portions.portion.analytes.analyte.dna.pcr_amplification_successful	18.39	12.81	29.56	32.88	8.83	21.42	25.60	12.02	41.33	7.55	15.20	14.21		9.15	21.92		11.89
patient.samples.sample.2.portions.portion.analytes.analyte.protocols.protocol.experimental_protocol_type	6.33		6.94	7.80		5.37	9.36		11.47			6.47			7.50		5.06
patient.samples.sample.portions.portion.analytes.analyte.2.aliquots.aliquot.2.plate_id			3.17	3.52		2.38	4.07		3.79			2.27			2.63
patient.samples.sample.portions.portion.analytes.analyte.2.aliquots.aliquot.plate_id			3.17	3.52		2.38	4.07		3.79			2.27			2.63
patient.samples.sample.portions.portion.analytes.analyte.3.dna.pcr_amplification_successful	17.37	13.55	40.77	44.62	12.87	30.25	33.82	16.47	49.52	7.56	15.94	15.38	7.50	11.36	27.03	7.52	12.36
patient.samples.sample.portions.portion.analytes.analyte.aliquots.aliquot.2.plate_id			3.17	3.52		2.38	4.07		3.79			2.27			2.63
patient.samples.sample.portions.portion.analytes.analyte.aliquots.aliquot.3.plate_id			3.17	3.52		2.38	4.07		3.79			2.27			2.63
patient.samples.sample.portions.portion.analytes.analyte.aliquots.aliquot.plate_id			3.17	3.52		2.38	4.07		3.79			2.27			2.63
patient.samples.sample.portions.portion.analytes.analyte.dna.pcr_amplification_successful	18.39	12.81	29.56	32.88	8.83	21.42	25.60	12.02	41.33	7.55	15.20	14.21		9.15	21.92		11.89
patient.samples.sample.portions.shipment_portion.plate_id	4.83		4.56	5.32		4.45	5.16		4.97						7.29

RAV Exploration

ind <- 988
findStudiesInCluster(RAVmodel, ind, studyTitle = TRUE)

##   studyName PC Variance explained (%)
## 1 SRP024274  2                  11.57
## 2 SRP082973  4                   1.90
##                                                                                                    title
## 1       Smoking Dysregulates the Human Airway Basal Cell Transcriptome at COPD-linked Risk Locus 19q13.2
## 2 Mandatory Role of HMGA1 in Human Airway Epithelial Normal Differentiation and Post-injury Regeneration

subsetEnrichedPathways(RAVmodel, ind, include_nes = TRUE) %>% as.data.frame

##                                RAV988.Description RAV988.NES
## Up_1  ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER   4.109677
## Up_2          DUTERTRE_ESTRADIOL_RESPONSE_24HR_UP   4.087688
## Up_3       SOTIRIOU_BREAST_CANCER_GRADE_1_VS_3_UP   4.083082
## Up_4             KOBAYASHI_EGFR_SIGNALING_24HR_DN   4.007970
## Up_5        FLORIO_NEOCORTEX_BASAL_RADIAL_GLIA_DN   4.006780
## Up_6    GOBERT_OLIGODENDROCYTE_DIFFERENTIATION_UP   3.918913
## Up_7                    LEE_EARLY_T_LYMPHOCYTE_UP   3.783045
## Up_8    ZHOU_CELL_CYCLE_GENES_IN_IR_RESPONSE_24HR   3.689651
## Up_9                      FISCHER_G2_M_CELL_CYCLE   3.683076
## Up_10  GRAHAM_CML_DIVIDING_VS_NORMAL_QUIESCENT_UP   3.665484

drawWordcloud(RAVmodel, ind)

## Warning in wordcloud::wordcloud(words = all$word, freq = all$freq, scale =
## scale, : Chromosomes, Human, Pair 19 could not be fit on page. It will not be
## plotted.

## Warning in wordcloud::wordcloud(words = all$word, freq = all$freq, scale =
## scale, : Chronic Obstructive Pulmonary Disease could not be fit on page. It
## will not be plotted.

ind <- 1301
findStudiesInCluster(RAVmodel, ind, studyTitle = TRUE)

##   studyName PC Variance explained (%)
## 1 SRP042228  1                  20.14
## 2 SRP048801  1                  31.86
## 3 SRP096757  1                  24.58
##                                                         title
## 1         Core Ileal Transcriptome in Pediatric Crohn Disease
## 2       Ileal immune maturation in Pediatric Crohn''s Disease
## 3 Profiling of Ileal Transcriptome in Pediatric Crohn Disease

subsetEnrichedPathways(RAVmodel, ind, include_nes = TRUE) %>% as.data.frame

##                                RAV1301.Description RAV1301.NES
## Up_1  BILANGES_SERUM_AND_RAPAMYCIN_SENSITIVE_GENES   -4.319476
## Up_2                                          <NA>          NA
## Up_3                                          <NA>          NA
## Up_4                                          <NA>          NA
## Up_5                                          <NA>          NA
## Up_6                                          <NA>          NA
## Up_7                                          <NA>          NA
## Up_8                                          <NA>          NA
## Up_9                                          <NA>          NA
## Up_10                                         <NA>          NA

drawWordcloud(RAVmodel, ind)

## Warning in wordcloud::wordcloud(words = all$word, freq = all$freq, scale =
## scale, : Pediatric Crohn's disease could not be fit on page. It will not be
## plotted.

EDA HNSC

Sehyun Oh, Britney Pheng

March 12, 2024

Set up

Load Packages

Load TCGA dataset

Load RAVmodel

Select HNSC RNA metadata

heatmapTable: HNSC

Subset

Filter attributes

Sparsity Plot

Calculate r-squared value

Numeric variables

Features with an r squared value > 0.4

heatmapTable

Calculate F-Statistic (ANOVA)

Character variables

Features with a p-value < 0.05

heatmapTable

heatmapTable

RAV Exploration