Deliverable 1: Dashboard of RAVs and Attributes for TCGA Colorectal (COAD) and Lung (LUAD) Cancers

Author

Britney Pheng

Preparation

Packages

knitr::opts_chunk$set(echo = TRUE)

suppressPackageStartupMessages({
  # BiocManager
  library(GenomicSuperSignature)
  library(curatedTCGAData)
  library(MultiAssayExperiment)
  library(TCGAutils)
  library(ComplexHeatmap)
  
  # CRAN
  library(tidyverse) # includes dplyr, ggplot2, magrittr, tidyr
  library(magick)
  library(wordcloud)
  library(ztable)
  library(metafolio)
  library(randomForest)
  library(caret)
})

Load all Replicated Axes of Variation (RAVs) from GenomicSuperSignature package

RAVmodel <- getModel('C2', load=TRUE)

Load TCGA Data for Colorectal and Lung Cancers from curatedTCGAData package

data('diseaseCodes', package = "TCGAutils")

data <- curatedTCGAData(
    diseaseCode = c("COAD", "LUAD"), 
    assays = "RNASeq2Gene", version = "2.0.1",
    dry.run = FALSE
)

data <- TCGAsplitAssays(data, c("01", "11"))

Create Variables for RNA Sequencing Data and Clinical Attributes

coad_cancer <- getWithColData(data,
                           '01_COAD_RNASeq2Gene-20160128')
assay(coad_cancer) <- log2(assay(coad_cancer) + 1)
coad_meta <- colData(coad_cancer)

luad_cancer <- getWithColData(data,
                           '01_LUAD_RNASeq2Gene-20160128')
assay(luad_cancer) <- log2(assay(luad_cancer) + 1)
luad_meta <- colData(luad_cancer)

Function to Display Categorical Attributes’ Score Plots

graph_categorical <- function(RAV1, RAV2, factor.df, sampleScore.df, phenotype, graph_title = "Score Plot", legend_title = "Legend") {
  
  # Sanity Check 1: Check if RAVs are available; if RAVs don't exist in sampleScore.df, the function will error: "undefined columns selected"
  # Sanity Check 2: phenotype is in factor.df
  if(nlevels(factor.df[[phenotype]]) == 0) {
    print("Phenotype is not in provided factor dataframe")
  }
  
  sampleScore1 <- paste0("RAV", RAV1)
  sampleScore2 <- paste0("RAV", RAV2)

  new_df <- sampleScore.df[, c(sampleScore1, sampleScore2)]
  colnames(new_df)[1] <- "sampleScore1"
  colnames(new_df)[2] <- "sampleScore2"

  new_df <- data.frame(new_df, factor.df)
  plot_data <- new_df[which(!is.na(factor(new_df[[phenotype]]))),] #Filter out rows of the phenotype that are N/A
  #print(plot_data[[phenotype]])

  colors <- gg_color_hue(length(unique(plot_data[[phenotype]]))) # Count number of factor levels excluding NA
  colors.toplot <- c(colors)

  pA <- ggplot(plot_data,
             aes(x = sampleScore1, y = sampleScore2, color = plot_data[[phenotype]])) +
        geom_point() +
        labs(title = graph_title) +
        scale_color_manual(values = colors.toplot, name = legend_title) +
        theme_bw() +
        theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
        geom_hline(yintercept = 0, linetype = 'dashed') +
        geom_vline(xintercept = 0, linetype = 'dashed') +
        xlab(sampleScore1) + ylab(sampleScore2)

print(pA)
}

Exploratory Data Analysis (EDA)

Top Validated RAVs Heatmap for COAD Cancer Samples

validated_coad_cancer <- validate(coad_cancer, RAVmodel)
heatmapTable(validated_coad_cancer, RAVmodel, num.out=15, column_title = "COAD Cancer")

Data Cleaning

Select only clinical attributes with <10% missing data

# Select columns with >10% completeness
keep_attribute_ind <- which(colSums(!is.na(coad_meta)) > round(nrow(coad_meta)/10))

coad_dat <- coad_meta[keep_attribute_ind] %>% subset(select=-patientID)

Calculate validation sample scores for each patient cancer sample and RAV combination for the top 15 validated RAVs

# Calculate validation scores
coad_all_sampleScores <- calculateScore(coad_cancer, RAVmodel)

validated_ind <- validatedSignatures(validated_coad_cancer, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE) #Using Pearson Coefficient

## Subset sampleScore to join with MCPcounter
coad_sampleScore <- coad_all_sampleScores[, validated_ind] %>% as.data.frame()

Parse variables out to numeric and categorical variables

# Check for data types
unique(sapply(coad_cancer@colData, type))
[1] "character" "integer"   "double"   
coad_charcTb <- coad_dat[, sapply(coad_dat, class) == 'character']
coad_numTb <- coad_dat[, sapply(coad_dat, class) %in% c('numeric', 'integer')]
# Convert to factor data type
coad_factorTb <- coad_dat[, sapply(coad_dat, class) == 'character']

#factorTb <- charcTb
coad_factorTb[sapply(coad_factorTb, is.character)] <- lapply(coad_factorTb[sapply(coad_factorTb, is.character)], factor, exclude = NULL)
single_factor_ind <- c()
binary_factor_ind <- c()
multi_factor_ind <- c()

# Testing factor grouping
for (i in 1:length(coad_factorTb)) {
  if (nlevels(coad_factorTb[,i]) == 1 | 
      (nlevels(coad_factorTb[,i]) == 2 & any(is.na(levels(coad_factorTb[,i]))))
      ) {
    single_factor_ind <- c(single_factor_ind, i)
    
  } else if (nlevels(coad_factorTb[,i]) == 3 & any(is.na(levels(coad_factorTb[,i]))) |
             (nlevels(coad_factorTb[,i]) == 2 & !any(is.na(levels(coad_factorTb[,i]))))
          ) {
    binary_factor_ind <- c(binary_factor_ind, i)
    
  } else {
    multi_factor_ind <- c(multi_factor_ind, i)
  }
}

coad_polytomous_factor <- coad_factorTb[,multi_factor_ind]
coad_binary_factor <- coad_factorTb[,binary_factor_ind]
coad_single_factor <- coad_factorTb[,single_factor_ind]

Wilcoxon Test for Binary Variables

wilcox_df <- as.data.frame(matrix(nrow = ncol(coad_binary_factor),
                                ncol = ncol(coad_sampleScore)))

rownames(wilcox_df) <- colnames(coad_binary_factor)
colnames(wilcox_df) <- colnames(coad_sampleScore)

coad_wilcox_stat <- wilcox_df
coad_wilcox_pvalue <- wilcox_df
coad_wilcox_sig_stat <- wilcox_df

for (i in seq_len(ncol(coad_sampleScore))) {
  for (j in seq_len(ncol(coad_binary_factor))) {
    ## Wilcoxon test
    wilcox_test <- wilcox.test(coad_sampleScore[, i] ~ coad_binary_factor[,j], alternative="two.sided")
    
    ## Statistic
    stat <- wilcox_test$statistic
    coad_wilcox_stat[j, i] <- stat
    
    ## p-value
    pval <- wilcox_test$p.value
    coad_wilcox_pvalue[j, i] <- pval
      
    ## Wilcoxon statistic with significant p-value (<0.01)
    if (wilcox_test$p.value < 0.01) {
      coad_wilcox_sig_stat[j,i] <- wilcox_test$statistic
    }
  }
}

batch_binary_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(coad_wilcox_stat))
coad_wilcox_stat <- coad_wilcox_stat[-batch_binary_ind,]
coad_wilcox_pvalue <- coad_wilcox_pvalue[-batch_binary_ind,]
coad_wilcox_sig_stat <- coad_wilcox_sig_stat[-batch_binary_ind,]
coad_wilcox_sig_stat <- coad_wilcox_sig_stat[-which(rowSums(is.na(coad_wilcox_sig_stat)) >= 13),]

Binary Variable Visualizations

heatmap(as.matrix(coad_wilcox_stat), main = 'COAD Cancer Binary Variables: Wilcoxon Test Statistics')

options(ztable.type='html')
ztable(coad_wilcox_sig_stat ) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='COAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)')
COAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)
  RAV188 RAV1575 RAV324 RAV832 RAV61 RAV1008 RAV868 RAV64 RAV220 RAV834 RAV833 RAV438 RAV190 RAV1166 RAV192
patient.clinical_cqcf.histological_type 3512.00 3239.00 6327.00
patient.histological_type 3183.00 5713.00 3270.00 3299.00 3155.00 6197.00
patient.kras_gene_analysis_performed 6030.00 6354.00 3401.00 3227.00 5982.00 3184.00 2896.00 6306.00 2446.00 2661.00
patient.tissue_prospective_collection_indicator 5898.00 5839.00 12470.00 11679.00 5478.00 12038.00 12060.00 5897.00 11111.00 12409.00 6825.00 11838.00 11872.00 11940.00
patient.tissue_retrospective_collection_indicator 12060.00 12119.00 5488.00 6279.00 12480.00 5920.00 5898.00 12061.00 6847.00 5549.00 11133.00 6120.00 6086.00 6018.00
patient.biospecimen_cqcf.histological_type 3512.00 3239.00 6327.00
patient.biospecimen_cqcf.path_confirm_report_attached 3984.00 3993.00 1431.00 1543.00 4102.00 1422.00 1453.00 4134.00 1405.00 1550.00 1370.00 1744.00
histological_type.x 3183.00 5713.00 3270.00 3299.00 3155.00 6197.00
patient.braf_gene_analysis_performed 4115.00 4203.00 1437.00 1895.00 4131.00 1170.00 1391.00 4318.00 1687.00 4481.00 1228.00 1074.00
patient.lymphatic_invasion 5068.00 8222.00 8182.00 5311.00 8360.00 8221.00

Categorized by the COAD cancer binary variable “lymphatic invasion”, the majority of sample scores for RAVs 832 and 833 are over 0.

graph_categorical(833, 832, coad_binary_factor, coad_sampleScore,
                  "patient.lymphatic_invasion", 
                  "Score Plot for Lymphatic Invasion", "Lymphatic Invasion")

RAV 833 has key phrases connected to possible “lymphatic invasion” data. See “T-Lymphocytes”, “B Cells”, “Immune System” in wordcloud.

drawWordcloud(RAVmodel, 833)

Kruskal-Wallis Test for Polytomous Variables

kw_df <- as.data.frame(matrix(nrow = ncol(coad_polytomous_factor),
                                ncol = ncol(coad_sampleScore)))

rownames(kw_df) <- colnames(coad_polytomous_factor)
colnames(kw_df) <- colnames(coad_sampleScore)

coad_kw_stat <- kw_df
coad_kw_pvalue <- kw_df
coad_kw_sig_stat <- kw_df

for (i in seq_len(ncol(coad_sampleScore))) {
  for (j in seq_len(ncol(coad_polytomous_factor))) {
    ## Kruskal-Wallis Test
    kruskal_test <- kruskal.test(coad_sampleScore[, i] ~ coad_polytomous_factor[,j])
    
    ## Kruskal-Wallis Statistic
    kw_val <-  kruskal_test$statistic
    coad_kw_stat[j, i] <- kw_val

    ## p-value
    pval <-  kruskal_test$p.value
    coad_kw_pvalue[j, i] <- pval
    
    ## Kruskal-Wallis Statistic with significant p-value (<0.05)
    if (kruskal_test$p.value < 0.05) {
      coad_kw_sig_stat[j,i] <- kruskal_test$statistic
    }
  }
}

batch_polytomous_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(coad_kw_stat))
coad_kw_stat <- coad_kw_stat[-batch_polytomous_ind,]
coad_kw_pvalue <- coad_kw_pvalue[-batch_polytomous_ind,]
coad_kw_sig_stat <- coad_kw_sig_stat[-batch_polytomous_ind,]
coad_kw_sig_stat <- coad_kw_sig_stat[-which(rowSums(is.na(coad_kw_sig_stat)) >= 13),]

Polytomous Variable Visualizations

heatmap(as.matrix(coad_kw_stat), main = "COAD Cancer Polytomous Variables: Kruskal-Wallis Test Statistics")

options(ztable.type='html')
ztable(coad_kw_sig_stat) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='COAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)')
COAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)
  RAV188 RAV1575 RAV324 RAV832 RAV61 RAV1008 RAV868 RAV64 RAV220 RAV834 RAV833 RAV438 RAV190 RAV1166 RAV192
pathology_N_stage 15.17 15.19 14.30
patient.icd_10 18.12 19.91 27.72
patient.icd_o_3_site 18.12 19.91 27.72
patient.stage_event.tnm_categories.pathologic_categories.pathologic_n 15.17 15.19 14.30
patient.tissue_source_site 72.94 63.95 87.18 37.05 51.60 71.24 97.30 68.38 73.45 82.22 84.38 50.09 77.32 95.18
patient.biospecimen_cqcf.vessel_used 10.91 8.80 12.96 7.93 14.32 13.65 10.87
patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status 100.13 13.80 12.26

The score plot shows factor level clustering for the COAD cancer attribute, “microsatellite instability test results”. There is a distinction between the MSI-H (microsatellite instability - high) versus the MSS (microsatellite stable) and MSI-L (microsatellite instability - low) sample scores.

graph_categorical(834, 190, coad_factorTb, coad_sampleScore,
                  "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status", 
                  "Score Plot for Microsatellite Status", "Microsatellite Status")

RAVs 834 and 190 have statistically significant Kruskal-Wallis values for the “microsatellite instability test results” attribute.

drawWordcloud(RAVmodel, 834)

drawWordcloud(RAVmodel, 190)

Confirmation of RAV Significance for Variable of Interest

See significant p-values for RAVs 834, 190, and 1166

for (i in seq_len(ncol(coad_sampleScore))) {
  print(colnames(coad_sampleScore)[i])
    xy <- pairwise.wilcox.test(coad_sampleScore[, i], coad_factorTb[,"patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"], p.adjust.method = "bonferroni")
    print(xy)
}
[1] "RAV188"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.16          -     -    
msi-l 0.20          1.00  -    
mss   0.20          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV1575"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.20          -     -    
msi-l 0.48          1.00  -    
mss   0.25          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV324"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 1             -     -    
msi-l 1             1     -    
mss   1             1     1    

P value adjustment method: bonferroni 
[1] "RAV832"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.89          -     -    
msi-l 1.00          1.00  -    
mss   1.00          0.99  1.00 

P value adjustment method: bonferroni 
[1] "RAV61"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.28          -     -    
msi-l 0.16          1.00  -    
mss   0.24          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV1008"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.82          -     -    
msi-l 0.71          1.00  -    
mss   0.39          0.25  1.00 

P value adjustment method: bonferroni 
[1] "RAV868"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 1             -     -    
msi-l 1             1     -    
mss   1             1     1    

P value adjustment method: bonferroni 
[1] "RAV64"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.25          -     -    
msi-l 0.63          1.00  -    
mss   0.32          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV220"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.25          -     -    
msi-l 0.63          1.00  -    
mss   0.27          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV834"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h   msi-l
msi-h 0.28          -       -    
msi-l 1.00          1.3e-12 -    
mss   1.00          < 2e-16 1.00 

P value adjustment method: bonferroni 
[1] "RAV833"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 1.00          -     -    
msi-l 1.00          1.00  -    
mss   0.94          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV438"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 1             -     -    
msi-l 1             1     -    
mss   1             1     1    

P value adjustment method: bonferroni 
[1] "RAV190"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h  msi-l 
msi-h 1.0000        -      -     
msi-l 0.7053        0.1356 -     
mss   0.5525        0.0047 1.0000

P value adjustment method: bonferroni 
[1] "RAV1166"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.223         -     -    
msi-l 1.000         0.111 -    
mss   1.000         0.009 1.000

P value adjustment method: bonferroni 
[1] "RAV192"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.382         -     -    
msi-l 1.000         0.408 -    
mss   1.000         0.098 1.000

P value adjustment method: bonferroni 

Top Validated RAVs Heatmap for LUAD Cancer Samples

validated_luad_cancer <- validate(luad_cancer, RAVmodel)
heatmapTable(validated_luad_cancer, RAVmodel, num.out=15, column_title = "LUAD Cancer")

Data Cleaning

Select only clinical attributes with <10% missing data

# Select columns with >10% completeness
keep_attribute_ind <- which(colSums(!is.na(luad_meta)) > round(nrow(luad_meta)/10))

luad_dat <- luad_meta[keep_attribute_ind] %>% subset(select=-patientID)

Calculate validation sample scores for each patient cancer sample and RAV combination for the top 15 validated RAVs

# Calculate validation scores
luad_all_sampleScores <- calculateScore(luad_cancer, RAVmodel)

validated_ind <- validatedSignatures(validated_luad_cancer, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE) #Using Pearson Coefficient

## Subset sampleScore to join with MCPcounter
luad_sampleScore <- luad_all_sampleScores[, validated_ind] %>% as.data.frame()

Parse variables out to numeric and categorical variables

# Check for data types
unique(sapply(luad_cancer@colData, type))
[1] "character" "integer"   "double"   
luad_charcTb <- luad_dat[, sapply(luad_dat, class) == 'character']
luad_numTb <- luad_dat[, sapply(luad_dat, class) %in% c('numeric', 'integer')]
# Convert to factor data type
luad_factorTb <- luad_dat[, sapply(luad_dat, class) == 'character']

#factorTb <- charcTb
luad_factorTb[sapply(luad_factorTb, is.character)] <- lapply(luad_factorTb[sapply(luad_factorTb, is.character)], factor, exclude = NULL)
single_factor_ind <- c()
binary_factor_ind <- c()
multi_factor_ind <- c()

# Testing factor grouping
for (i in 1:length(luad_factorTb)) {
  if (nlevels(luad_factorTb[,i]) == 1 | 
      (nlevels(luad_factorTb[,i]) == 2 & any(is.na(levels(luad_factorTb[,i]))))
      ) {
    single_factor_ind <- c(single_factor_ind, i)
    
  } else if (nlevels(luad_factorTb[,i]) == 3 & any(is.na(levels(luad_factorTb[,i]))) |
             (nlevels(luad_factorTb[,i]) == 2 & !any(is.na(levels(luad_factorTb[,i]))))
          ) {
    binary_factor_ind <- c(binary_factor_ind, i)
    
  } else {
    multi_factor_ind <- c(multi_factor_ind, i)
  }
}

luad_polytomous_factor <- luad_factorTb[,multi_factor_ind]
luad_binary_factor <- luad_factorTb[,binary_factor_ind]
luad_single_factor <- luad_factorTb[,single_factor_ind]

Wilcoxon Test for Binary Variables

wilcox_df <- as.data.frame(matrix(nrow = ncol(luad_binary_factor),
                                ncol = ncol(luad_sampleScore)))

rownames(wilcox_df) <- colnames(luad_binary_factor)
colnames(wilcox_df) <- colnames(luad_sampleScore)

luad_wilcox_stat <- wilcox_df
luad_wilcox_pvalue <- wilcox_df
luad_wilcox_sig_stat <- wilcox_df

for (i in seq_len(ncol(luad_sampleScore))) {
  for (j in seq_len(ncol(luad_binary_factor))) {
    ## Wilcoxon test
    wilcox_test <- wilcox.test(luad_sampleScore[, i] ~ luad_binary_factor[,j], alternative="two.sided")
    
    ## Statistic
    stat <- wilcox_test$statistic
    luad_wilcox_stat[j, i] <- stat
    
    ## p-value
    pval <- wilcox_test$p.value
    luad_wilcox_pvalue[j, i] <- pval
      
    ## Wilcoxon statistic with significant p-value (<0.01)
    if (wilcox_test$p.value < 0.05) {
      luad_wilcox_sig_stat[j,i] <- wilcox_test$statistic
    }
  }
}

batch_binary_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(luad_wilcox_stat))
luad_wilcox_stat <- luad_wilcox_stat[-batch_binary_ind,]
luad_wilcox_pvalue <- luad_wilcox_pvalue[-batch_binary_ind,]
luad_wilcox_sig_stat <- luad_wilcox_sig_stat[-batch_binary_ind,]
luad_wilcox_sig_stat <- luad_wilcox_sig_stat[-which(rowSums(is.na(luad_wilcox_sig_stat)) >= 10),]

Binary Variable Visualizations

heatmap(as.matrix(luad_wilcox_stat), main = 'LUAD Cancer Binary Variables: Wilcoxon Test Statistics')

options(ztable.type='html')
ztable(luad_wilcox_sig_stat) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='LUAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)')
LUAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)
  RAV814 RAV316 RAV684 RAV1065 RAV2538 RAV630 RAV1154 RAV1303 RAV1668 RAV220 RAV435
ethnicity 729.00 1989.00 2122.00 2128.00 632.00
patient.clinical_cqcf.history_of_neoadjuvant_treatment 1300.00 131.00 1405.00 161.00 1286.00 1278.00
patient.drugs.drug.route_of_administrations.route_of_administration 9.00 183.00 18.00
patient.drugs.drug.therapy_ongoing 580.00 1432.00 1411.00 691.00
patient.ethnicity 729.00 1989.00 2122.00 2128.00 632.00
patient.follow_ups.follow_up.followup_case_report_form_submission_reason 2422.00 2469.00
patient.follow_ups.follow_up.person_neoplasm_cancer_status 21269.00 19946.00 14601.00 20276.00
patient.follow_ups.follow_up.vital_status 25467.00 15906.00 25789.00 16984.00 23294.00 23123.00 24986.00
patient.gender 39477.00 27767.00 37861.00 27707.00 26418.00 39207.00 36675.00 37964.00
patient.history_of_neoadjuvant_treatment 1297.00 130.00 1402.00 161.00 1285.00 1276.00
patient.kras_gene_analysis_performed 9167.00 6243.00 6412.00
patient.new_tumor_events.new_tumor_event_after_initial_treatment 3074.00 1600.00 2899.00 2911.00
patient.person_neoplasm_cancer_status 20905.00 14528.00 20468.00
patient.tissue_prospective_collection_indicator 23866.00 39546.00 24774.00 38226.00 26509.00 36231.00 28479.00 25282.00
patient.tissue_retrospective_collection_indicator 40250.00 24570.00 39342.00 25890.00 37607.00 27885.00 35637.00 38834.00
patient.vital_status 30333.00 20152.00 27715.00 20890.00 18758.00 29370.00 29548.00
patient.bcr_canonical_check.bcr_patient_canonical_status 34332.00 24903.00 36345.00 27365.00 34846.00 34838.00 34964.00 24309.00
patient.biospecimen_cqcf.history_of_neoadjuvant_treatment 1300.00 131.00 1405.00 161.00 1286.00 1278.00
patient.biospecimen_cqcf.path_confirm_diagnosis_matching 1599.00 3578.00 3575.00
patient.biospecimen_cqcf.path_confirm_report_attached 7724.00 7389.00 7373.00 3169.00 2626.00 3275.00 9201.00
patient.samples.sample.2.is_ffpe 5020.00 4751.00 1707.00
patient.samples.sample.2.oct_embedded 889.00 881.00 386.00 206.00 930.00
gender 39477.00 27767.00 37861.00 27707.00 26418.00 39207.00 36675.00 37964.00
patient.follow_ups.follow_up.additional_radiation_therapy 1941.00 2969.00
patient.follow_ups.follow_up.new_tumor_event_after_initial_treatment 22530.00 16307.00 22410.00 17071.00 22572.00
patient.follow_ups.follow_up.targeted_molecular_therapy 23354.00 17589.00 17572.00 16938.00 22963.00 23774.00
patient.targeted_molecular_therapy 1636.00 1555.00
Survival 4176.00 6729.00 6738.00 7152.00 3847.00 4389.00
Transversion.High.Low 7705.00 7299.00 5053.00
Oncogene.Negative.or.Positive.Groups 4258.00 7364.00 7402.00
FCNA_CCND1 3599.00 3534.00 1626.00 1843.00

Categorized by the LUAD cancer binary variable “transversion status”, the sample scores for RAVs 1065 and 1668 seem to have an inverse linear relationship.

graph_categorical(1668, 1065, luad_binary_factor, luad_sampleScore,
                  "Transversion.High.Low",
                  "Score Plot for Transversion ", "Transversion Status")

RAV 1065 has key phrases connected to lung cancer clinical attributes.

drawWordcloud(RAVmodel, 1065)

Kruskal-Wallis Test for Polytomous Variables

kw_df <- as.data.frame(matrix(nrow = ncol(luad_polytomous_factor),
                                ncol = ncol(luad_sampleScore)))

rownames(kw_df) <- colnames(luad_polytomous_factor)
colnames(kw_df) <- colnames(luad_sampleScore)

luad_kw_stat <- kw_df
luad_kw_pvalue <- kw_df
luad_kw_sig_stat <- kw_df

for (i in seq_len(ncol(luad_sampleScore))) {
  for (j in seq_len(ncol(luad_polytomous_factor))) {
    ## Kruskal-Wallis Test
    kruskal_test <- kruskal.test(luad_sampleScore[, i] ~ luad_polytomous_factor[,j])
    
    ## Kruskal-Wallis Statistic
    kw_val <-  kruskal_test$statistic
    luad_kw_stat[j, i] <- kw_val

    ## p-value
    pval <-  kruskal_test$p.value
    luad_kw_pvalue[j, i] <- pval
    
    ## Kruskal-Wallis Statistic with significant p-value (<0.05)
    if (kruskal_test$p.value < 0.05) {
    luad_kw_sig_stat[j,i] <- kruskal_test$statistic
    }
  }
}

batch_polytomous_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(luad_kw_stat))
luad_kw_stat <- luad_kw_stat[-batch_polytomous_ind,]
luad_kw_pvalue <- luad_kw_pvalue[-batch_polytomous_ind,]
luad_kw_sig_stat <- luad_kw_sig_stat[-batch_polytomous_ind,]
luad_kw_sig_stat <- luad_kw_sig_stat[-which(rowSums(is.na(luad_kw_sig_stat)) >= 10),]

Polytomous Variable Visualizations

heatmap(as.matrix(luad_kw_stat), main = "LUAD Cancer Polytomous Variables: Kruskal-Wallis Test Statistics")

options(ztable.type='html')
ztable(luad_kw_sig_stat) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='LUAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)')
LUAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)
  RAV814 RAV316 RAV684 RAV1065 RAV2538 RAV630 RAV1154 RAV1303 RAV1668 RAV220 RAV435
pathology_T_stage 25.75 33.17 28.47 19.64 29.55 21.04 18.16 30.15 16.92
pathology_N_stage 11.71 14.28
pathology_M_stage 22.12 12.68
patient.clinical_cqcf.consent_or_death_status 9.59 12.62 10.59 10.78 23.58 7.56 13.11
patient.clinical_cqcf.country 22.11 24.73 18.68 28.39 16.75 16.90
patient.clinical_cqcf.histological_type 41.13 26.37 39.84 41.43 20.74 40.80 27.24
patient.clinical_cqcf.history_of_prior_malignancy 7.10 7.06 8.05
patient.histological_type 41.17 37.31 38.24 38.28 24.80
patient.icd_10 12.65 13.12 17.78 14.86 17.87
patient.icd_o_3_histology 40.35 25.12 38.87 39.30 40.96 24.51
patient.icd_o_3_site 14.16 11.63 11.25
patient.prior_dx 25.49 27.21 8.08 10.79
patient.stage_event.pathologic_stage 20.75 26.65 23.37
patient.stage_event.tnm_categories.pathologic_categories.pathologic_m 22.12 12.68
patient.stage_event.tnm_categories.pathologic_categories.pathologic_n 11.71 14.28
patient.stage_event.tnm_categories.pathologic_categories.pathologic_t 25.75 33.17 28.47 19.64 29.55 21.04 18.16 30.15 16.92
patient.tissue_source_site 73.16 114.81 112.21 48.23 59.48 87.89 77.79 94.77 51.29 64.82 80.34
patient.biospecimen_cqcf.consent_or_death_status 9.59 12.62 10.59 10.78 23.58 7.56 13.11
patient.biospecimen_cqcf.country 22.11 24.73 18.68 28.39 16.75 16.90
patient.biospecimen_cqcf.histological_type 41.13 26.37 39.84 41.43 20.74 40.80 27.24
patient.biospecimen_cqcf.vessel_used 19.67 28.68 30.08 13.30 20.38 12.60
patient.samples.sample.2.sample_type 22.11 11.63 11.80 10.79 12.95 10.43 9.89
patient.samples.sample.2.vial_number 10.73 10.18
histological_type 41.17 37.31 38.24 38.28 24.80
patient.tobacco_smoking_history 25.68 55.66 29.74

The score plot shows sample scores for the LUAD cancer attribute, “tobacco smoking history” that were positive for RAV 2538, but negative for RAV 1065 and vice versa, negative for RAV 2538, but positive for RAV 1065.

graph_categorical(1065, 2538, luad_factorTb, luad_sampleScore,
                  "patient.tobacco_smoking_history",
                  "Score Plot for Tobacco Smoking History", "Tobacco Smoking History")

RAV 814 additionally had statistically significant Kruskal-Wallis values for the “tobacco smoking history” attribute.

drawWordcloud(RAVmodel, 814)

Confirmation of RAV Significance for Variable of Interest

See significant p-values for RAVs 814, 1065, 2538, 1668, and 220

for (i in seq_len(ncol(luad_sampleScore))) {
  print(colnames(luad_sampleScore)[i])
    xy <- pairwise.wilcox.test(luad_sampleScore[, i], luad_factorTb[,"patient.tobacco_smoking_history"], p.adjust.method = "bonferroni")
    print(xy)
}
[1] "RAV814"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1.4e-05                                    
current reformed smoker, duration not specified 1.00000                                    
current smoker                                  0.08404                                    
lifelong non-smoker                             0.00074                                    
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.00000                               
current smoker                                  8.0e-10                               
lifelong non-smoker                             1.00000                               
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.00000                                        
lifelong non-smoker                             1.00000                                        
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1.9e-07       

P value adjustment method: bonferroni 
[1] "RAV316"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1                                          
current reformed smoker, duration not specified 1                                          
current smoker                                  1                                          
lifelong non-smoker                             1                                          
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1                                     
current smoker                                  1                                     
lifelong non-smoker                             1                                     
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1                                              
lifelong non-smoker                             1                                              
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1             

P value adjustment method: bonferroni 
[1] "RAV684"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1.00                                       
current reformed smoker, duration not specified 0.28                                       
current smoker                                  1.00                                       
lifelong non-smoker                             1.00                                       
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 0.43                                  
current smoker                                  1.00                                  
lifelong non-smoker                             1.00                                  
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  0.91                                           
lifelong non-smoker                             0.43                                           
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1.00          

P value adjustment method: bonferroni 
[1] "RAV1065"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          2.0e-05                                    
current reformed smoker, duration not specified 1.0000                                     
current smoker                                  0.0027                                     
lifelong non-smoker                             0.0050                                     
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 0.8548                                
current smoker                                  4.8e-11                               
lifelong non-smoker                             1.0000                                
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.0000                                         
lifelong non-smoker                             1.0000                                         
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             7.9e-07       

P value adjustment method: bonferroni 
[1] "RAV2538"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          3.9e-05                                    
current reformed smoker, duration not specified 1.0000                                     
current smoker                                  0.0530                                     
lifelong non-smoker                             0.0029                                     
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.0000                                
current smoker                                  3.4e-09                               
lifelong non-smoker                             1.0000                                
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.0000                                         
lifelong non-smoker                             1.0000                                         
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             2.2e-06       

P value adjustment method: bonferroni 
[1] "RAV630"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1                                          
current reformed smoker, duration not specified 1                                          
current smoker                                  1                                          
lifelong non-smoker                             1                                          
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1                                     
current smoker                                  1                                     
lifelong non-smoker                             1                                     
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1                                              
lifelong non-smoker                             1                                              
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1             

P value adjustment method: bonferroni 
[1] "RAV1154"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1.00                                       
current reformed smoker, duration not specified 1.00                                       
current smoker                                  1.00                                       
lifelong non-smoker                             0.15                                       
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.00                                  
current smoker                                  1.00                                  
lifelong non-smoker                             1.00                                  
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.00                                           
lifelong non-smoker                             1.00                                           
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             0.23          

P value adjustment method: bonferroni 
[1] "RAV1303"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1                                          
current reformed smoker, duration not specified 1                                          
current smoker                                  1                                          
lifelong non-smoker                             1                                          
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1                                     
current smoker                                  1                                     
lifelong non-smoker                             1                                     
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1                                              
lifelong non-smoker                             1                                              
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1             

P value adjustment method: bonferroni 
[1] "RAV1668"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          9.2e-05                                    
current reformed smoker, duration not specified 1.0000                                     
current smoker                                  0.0075                                     
lifelong non-smoker                             0.0374                                     
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.0000                                
current smoker                                  7.3e-10                               
lifelong non-smoker                             1.0000                                
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.0000                                         
lifelong non-smoker                             1.0000                                         
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1.2e-05       

P value adjustment method: bonferroni 
[1] "RAV220"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          0.14475                                    
current reformed smoker, duration not specified 1.00000                                    
current smoker                                  0.30075                                    
lifelong non-smoker                             0.07673                                    
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.00000                               
current smoker                                  0.00042                               
lifelong non-smoker                             1.00000                               
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.00000                                        
lifelong non-smoker                             1.00000                                        
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             0.00069       

P value adjustment method: bonferroni 
[1] "RAV435"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1                                          
current reformed smoker, duration not specified 1                                          
current smoker                                  1                                          
lifelong non-smoker                             1                                          
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1                                     
current smoker                                  1                                     
lifelong non-smoker                             1                                     
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1                                              
lifelong non-smoker                             1                                              
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1             

P value adjustment method: bonferroni