Deliverable 1: Dashboard of RAVs and Attributes for TCGA Colorectal (COAD) and Lung (LUAD) Cancers

Author

Britney Pheng

Preparation

Packages

knitr::opts_chunk$set(echo = TRUE)

suppressPackageStartupMessages({
  # BiocManager
  library(GenomicSuperSignature)
  library(curatedTCGAData)
  library(MultiAssayExperiment)
  library(TCGAutils)
  library(ComplexHeatmap)
  
  # CRAN
  library(tidyverse) # includes dplyr, ggplot2, magrittr, tidyr
  library(magick)
  library(wordcloud)
  library(ztable)
  library(metafolio)
  library(randomForest)
  library(caret)
})

Load all Replicated Axes of Variation (RAVs) from GenomicSuperSignature package

RAVmodel <- getModel('C2', load=TRUE)

Load TCGA Data for Colorectal and Lung Cancers from curatedTCGAData package

data('diseaseCodes', package = "TCGAutils")

data <- curatedTCGAData(
    diseaseCode = c("COAD", "LUAD"), 
    assays = "RNASeq2Gene", version = "2.0.1",
    dry.run = FALSE
)

data <- TCGAsplitAssays(data, c("01", "11"))

Create Variables for RNA Sequencing Data and Clinical Attributes

coad_cancer <- getWithColData(data,
                           '01_COAD_RNASeq2Gene-20160128')
assay(coad_cancer) <- log2(assay(coad_cancer) + 1)
coad_meta <- colData(coad_cancer)

luad_cancer <- getWithColData(data,
                           '01_LUAD_RNASeq2Gene-20160128')
assay(luad_cancer) <- log2(assay(luad_cancer) + 1)
luad_meta <- colData(luad_cancer)

Function to Display Categorical Attributes’ Score Plots

graph_categorical <- function(RAV1, RAV2, factor.df, sampleScore.df, phenotype, graph_title = "Score Plot", legend_title = "Legend") {
  
  # Sanity Check 1: Check if RAVs are available; if RAVs don't exist in sampleScore.df, the function will error: "undefined columns selected"
  # Sanity Check 2: phenotype is in factor.df
  if(nlevels(factor.df[[phenotype]]) == 0) {
    print("Phenotype is not in provided factor dataframe")
  }
  
  sampleScore1 <- paste0("RAV", RAV1)
  sampleScore2 <- paste0("RAV", RAV2)

  new_df <- sampleScore.df[, c(sampleScore1, sampleScore2)]
  colnames(new_df)[1] <- "sampleScore1"
  colnames(new_df)[2] <- "sampleScore2"

  new_df <- data.frame(new_df, factor.df)
  plot_data <- new_df[which(!is.na(factor(new_df[[phenotype]]))),] #Filter out rows of the phenotype that are N/A
  #print(plot_data[[phenotype]])

  colors <- gg_color_hue(length(unique(plot_data[[phenotype]]))) # Count number of factor levels excluding NA
  colors.toplot <- c(colors)

  pA <- ggplot(plot_data,
             aes(x = sampleScore1, y = sampleScore2, color = plot_data[[phenotype]])) +
        geom_point() +
        labs(title = graph_title) +
        scale_color_manual(values = colors.toplot, name = legend_title) +
        theme_bw() +
        theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
        geom_hline(yintercept = 0, linetype = 'dashed') +
        geom_vline(xintercept = 0, linetype = 'dashed') +
        xlab(sampleScore1) + ylab(sampleScore2)

print(pA)
}

Exploratory Data Analysis (EDA)

Top Validated RAVs Heatmap for COAD Cancer Samples

validated_coad_cancer <- validate(coad_cancer, RAVmodel)
heatmapTable(validated_coad_cancer, RAVmodel, num.out=15, column_title = "COAD Cancer")

Data Cleaning

Select only clinical attributes with <10% missing data

# Select columns with >10% completeness
keep_attribute_ind <- which(colSums(!is.na(coad_meta)) > round(nrow(coad_meta)/10))

coad_dat <- coad_meta[keep_attribute_ind] %>% subset(select=-patientID)

Calculate validation sample scores for each patient cancer sample and RAV combination for the top 15 validated RAVs

# Calculate validation scores
coad_all_sampleScores <- calculateScore(coad_cancer, RAVmodel)

validated_ind <- validatedSignatures(validated_coad_cancer, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE) #Using Pearson Coefficient

## Subset sampleScore to join with MCPcounter
coad_sampleScore <- coad_all_sampleScores[, validated_ind] %>% as.data.frame()

Parse variables out to numeric and categorical variables

# Check for data types
unique(sapply(coad_cancer@colData, type))

[1] "character" "integer"   "double"

coad_charcTb <- coad_dat[, sapply(coad_dat, class) == 'character']
coad_numTb <- coad_dat[, sapply(coad_dat, class) %in% c('numeric', 'integer')]

# Convert to factor data type
coad_factorTb <- coad_dat[, sapply(coad_dat, class) == 'character']

#factorTb <- charcTb
coad_factorTb[sapply(coad_factorTb, is.character)] <- lapply(coad_factorTb[sapply(coad_factorTb, is.character)], factor, exclude = NULL)

single_factor_ind <- c()
binary_factor_ind <- c()
multi_factor_ind <- c()

# Testing factor grouping
for (i in 1:length(coad_factorTb)) {
  if (nlevels(coad_factorTb[,i]) == 1 | 
      (nlevels(coad_factorTb[,i]) == 2 & any(is.na(levels(coad_factorTb[,i]))))
      ) {
    single_factor_ind <- c(single_factor_ind, i)
    
  } else if (nlevels(coad_factorTb[,i]) == 3 & any(is.na(levels(coad_factorTb[,i]))) |
             (nlevels(coad_factorTb[,i]) == 2 & !any(is.na(levels(coad_factorTb[,i]))))
          ) {
    binary_factor_ind <- c(binary_factor_ind, i)
    
  } else {
    multi_factor_ind <- c(multi_factor_ind, i)
  }
}

coad_polytomous_factor <- coad_factorTb[,multi_factor_ind]
coad_binary_factor <- coad_factorTb[,binary_factor_ind]
coad_single_factor <- coad_factorTb[,single_factor_ind]

Wilcoxon Test for Binary Variables

wilcox_df <- as.data.frame(matrix(nrow = ncol(coad_binary_factor),
                                ncol = ncol(coad_sampleScore)))

rownames(wilcox_df) <- colnames(coad_binary_factor)
colnames(wilcox_df) <- colnames(coad_sampleScore)

coad_wilcox_stat <- wilcox_df
coad_wilcox_pvalue <- wilcox_df
coad_wilcox_sig_stat <- wilcox_df

for (i in seq_len(ncol(coad_sampleScore))) {
  for (j in seq_len(ncol(coad_binary_factor))) {
    ## Wilcoxon test
    wilcox_test <- wilcox.test(coad_sampleScore[, i] ~ coad_binary_factor[,j], alternative="two.sided")
    
    ## Statistic
    stat <- wilcox_test$statistic
    coad_wilcox_stat[j, i] <- stat
    
    ## p-value
    pval <- wilcox_test$p.value
    coad_wilcox_pvalue[j, i] <- pval
      
    ## Wilcoxon statistic with significant p-value (<0.01)
    if (wilcox_test$p.value < 0.01) {
      coad_wilcox_sig_stat[j,i] <- wilcox_test$statistic
    }
  }
}

batch_binary_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(coad_wilcox_stat))
coad_wilcox_stat <- coad_wilcox_stat[-batch_binary_ind,]
coad_wilcox_pvalue <- coad_wilcox_pvalue[-batch_binary_ind,]
coad_wilcox_sig_stat <- coad_wilcox_sig_stat[-batch_binary_ind,]
coad_wilcox_sig_stat <- coad_wilcox_sig_stat[-which(rowSums(is.na(coad_wilcox_sig_stat)) >= 13),]

Binary Variable Visualizations

heatmap(as.matrix(coad_wilcox_stat), main = 'COAD Cancer Binary Variables: Wilcoxon Test Statistics')

options(ztable.type='html')
ztable(coad_wilcox_sig_stat ) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='COAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)')

COAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)
	RAV188	RAV1575	RAV324	RAV61	RAV1008	RAV868	RAV64	RAV220	RAV834	RAV833	RAV438	RAV190	RAV1166	RAV192
patient.clinical_cqcf.histological_type		3512.00							3239.00				6327.00
patient.histological_type		3183.00		5713.00	3270.00			3299.00	3155.00				6197.00
patient.kras_gene_analysis_performed	6030.00	6354.00	3401.00	3227.00	5982.00	3184.00	2896.00	6306.00					2446.00	2661.00
patient.tissue_prospective_collection_indicator	5898.00	5839.00	12470.00	11679.00	5478.00	12038.00	12060.00	5897.00	11111.00	12409.00	6825.00	11838.00	11872.00	11940.00
patient.tissue_retrospective_collection_indicator	12060.00	12119.00	5488.00	6279.00	12480.00	5920.00	5898.00	12061.00	6847.00	5549.00	11133.00	6120.00	6086.00	6018.00
patient.biospecimen_cqcf.histological_type		3512.00							3239.00				6327.00
patient.biospecimen_cqcf.path_confirm_report_attached	3984.00	3993.00	1431.00	1543.00	4102.00	1422.00	1453.00	4134.00		1405.00		1550.00	1370.00	1744.00
histological_type.x		3183.00		5713.00	3270.00			3299.00	3155.00				6197.00
patient.braf_gene_analysis_performed	4115.00	4203.00	1437.00	1895.00	4131.00	1170.00	1391.00	4318.00		1687.00	4481.00		1228.00	1074.00
patient.lymphatic_invasion	5068.00					8222.00	8182.00	5311.00		8360.00		8221.00

Categorized by the COAD cancer binary variable “lymphatic invasion”, the majority of sample scores for RAVs 832 and 833 are over 0.

graph_categorical(833, 832, coad_binary_factor, coad_sampleScore,
                  "patient.lymphatic_invasion", 
                  "Score Plot for Lymphatic Invasion", "Lymphatic Invasion")

RAV 833 has key phrases connected to possible “lymphatic invasion” data. See “T-Lymphocytes”, “B Cells”, “Immune System” in wordcloud.

drawWordcloud(RAVmodel, 833)

Kruskal-Wallis Test for Polytomous Variables

kw_df <- as.data.frame(matrix(nrow = ncol(coad_polytomous_factor),
                                ncol = ncol(coad_sampleScore)))

rownames(kw_df) <- colnames(coad_polytomous_factor)
colnames(kw_df) <- colnames(coad_sampleScore)

coad_kw_stat <- kw_df
coad_kw_pvalue <- kw_df
coad_kw_sig_stat <- kw_df

for (i in seq_len(ncol(coad_sampleScore))) {
  for (j in seq_len(ncol(coad_polytomous_factor))) {
    ## Kruskal-Wallis Test
    kruskal_test <- kruskal.test(coad_sampleScore[, i] ~ coad_polytomous_factor[,j])
    
    ## Kruskal-Wallis Statistic
    kw_val <-  kruskal_test$statistic
    coad_kw_stat[j, i] <- kw_val

    ## p-value
    pval <-  kruskal_test$p.value
    coad_kw_pvalue[j, i] <- pval
    
    ## Kruskal-Wallis Statistic with significant p-value (<0.05)
    if (kruskal_test$p.value < 0.05) {
      coad_kw_sig_stat[j,i] <- kruskal_test$statistic
    }
  }
}

batch_polytomous_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(coad_kw_stat))
coad_kw_stat <- coad_kw_stat[-batch_polytomous_ind,]
coad_kw_pvalue <- coad_kw_pvalue[-batch_polytomous_ind,]
coad_kw_sig_stat <- coad_kw_sig_stat[-batch_polytomous_ind,]
coad_kw_sig_stat <- coad_kw_sig_stat[-which(rowSums(is.na(coad_kw_sig_stat)) >= 13),]

Polytomous Variable Visualizations

heatmap(as.matrix(coad_kw_stat), main = "COAD Cancer Polytomous Variables: Kruskal-Wallis Test Statistics")

options(ztable.type='html')
ztable(coad_kw_sig_stat) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='COAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)')

COAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)
	RAV188	RAV1575	RAV324	RAV832	RAV61	RAV1008	RAV868	RAV64	RAV220	RAV834	RAV833	RAV438	RAV190	RAV1166	RAV192
pathology_N_stage						15.17	15.19						14.30
patient.icd_10						18.12			19.91	27.72
patient.icd_o_3_site						18.12			19.91	27.72
patient.stage_event.tnm_categories.pathologic_categories.pathologic_n						15.17	15.19						14.30
patient.tissue_source_site	72.94	63.95	87.18	37.05	51.60	71.24	97.30	68.38	73.45		82.22	84.38	50.09	77.32	95.18
patient.biospecimen_cqcf.vessel_used	10.91	8.80				12.96	7.93		14.32			13.65		10.87
patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status										100.13			13.80	12.26

The score plot shows factor level clustering for the COAD cancer attribute, “microsatellite instability test results”. There is a distinction between the MSI-H (microsatellite instability - high) versus the MSS (microsatellite stable) and MSI-L (microsatellite instability - low) sample scores.

graph_categorical(834, 190, coad_factorTb, coad_sampleScore,
                  "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status", 
                  "Score Plot for Microsatellite Status", "Microsatellite Status")

RAVs 834 and 190 have statistically significant Kruskal-Wallis values for the “microsatellite instability test results” attribute.

drawWordcloud(RAVmodel, 834)

drawWordcloud(RAVmodel, 190)

Confirmation of RAV Significance for Variable of Interest

See significant p-values for RAVs 834, 190, and 1166

for (i in seq_len(ncol(coad_sampleScore))) {
  print(colnames(coad_sampleScore)[i])
    xy <- pairwise.wilcox.test(coad_sampleScore[, i], coad_factorTb[,"patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"], p.adjust.method = "bonferroni")
    print(xy)
}

[1] "RAV188"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.16          -     -    
msi-l 0.20          1.00  -    
mss   0.20          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV1575"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.20          -     -    
msi-l 0.48          1.00  -    
mss   0.25          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV324"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 1             -     -    
msi-l 1             1     -    
mss   1             1     1    

P value adjustment method: bonferroni 
[1] "RAV832"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.89          -     -    
msi-l 1.00          1.00  -    
mss   1.00          0.99  1.00 

P value adjustment method: bonferroni 
[1] "RAV61"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.28          -     -    
msi-l 0.16          1.00  -    
mss   0.24          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV1008"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.82          -     -    
msi-l 0.71          1.00  -    
mss   0.39          0.25  1.00 

P value adjustment method: bonferroni 
[1] "RAV868"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 1             -     -    
msi-l 1             1     -    
mss   1             1     1    

P value adjustment method: bonferroni 
[1] "RAV64"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.25          -     -    
msi-l 0.63          1.00  -    
mss   0.32          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV220"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.25          -     -    
msi-l 0.63          1.00  -    
mss   0.27          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV834"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h   msi-l
msi-h 0.28          -       -    
msi-l 1.00          1.3e-12 -    
mss   1.00          < 2e-16 1.00 

P value adjustment method: bonferroni 
[1] "RAV833"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 1.00          -     -    
msi-l 1.00          1.00  -    
mss   0.94          1.00  1.00 

P value adjustment method: bonferroni 
[1] "RAV438"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 1             -     -    
msi-l 1             1     -    
mss   1             1     1    

P value adjustment method: bonferroni 
[1] "RAV190"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h  msi-l 
msi-h 1.0000        -      -     
msi-l 0.7053        0.1356 -     
mss   0.5525        0.0047 1.0000

P value adjustment method: bonferroni 
[1] "RAV1166"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.223         -     -    
msi-l 1.000         0.111 -    
mss   1.000         0.009 1.000

P value adjustment method: bonferroni 
[1] "RAV192"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"] 

      indeterminate msi-h msi-l
msi-h 0.382         -     -    
msi-l 1.000         0.408 -    
mss   1.000         0.098 1.000

P value adjustment method: bonferroni

Top Validated RAVs Heatmap for LUAD Cancer Samples

validated_luad_cancer <- validate(luad_cancer, RAVmodel)
heatmapTable(validated_luad_cancer, RAVmodel, num.out=15, column_title = "LUAD Cancer")

Data Cleaning

Select only clinical attributes with <10% missing data

# Select columns with >10% completeness
keep_attribute_ind <- which(colSums(!is.na(luad_meta)) > round(nrow(luad_meta)/10))

luad_dat <- luad_meta[keep_attribute_ind] %>% subset(select=-patientID)

Calculate validation sample scores for each patient cancer sample and RAV combination for the top 15 validated RAVs

# Calculate validation scores
luad_all_sampleScores <- calculateScore(luad_cancer, RAVmodel)

validated_ind <- validatedSignatures(validated_luad_cancer, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE) #Using Pearson Coefficient

## Subset sampleScore to join with MCPcounter
luad_sampleScore <- luad_all_sampleScores[, validated_ind] %>% as.data.frame()

Parse variables out to numeric and categorical variables

# Check for data types
unique(sapply(luad_cancer@colData, type))

[1] "character" "integer"   "double"

luad_charcTb <- luad_dat[, sapply(luad_dat, class) == 'character']
luad_numTb <- luad_dat[, sapply(luad_dat, class) %in% c('numeric', 'integer')]

# Convert to factor data type
luad_factorTb <- luad_dat[, sapply(luad_dat, class) == 'character']

#factorTb <- charcTb
luad_factorTb[sapply(luad_factorTb, is.character)] <- lapply(luad_factorTb[sapply(luad_factorTb, is.character)], factor, exclude = NULL)

single_factor_ind <- c()
binary_factor_ind <- c()
multi_factor_ind <- c()

# Testing factor grouping
for (i in 1:length(luad_factorTb)) {
  if (nlevels(luad_factorTb[,i]) == 1 | 
      (nlevels(luad_factorTb[,i]) == 2 & any(is.na(levels(luad_factorTb[,i]))))
      ) {
    single_factor_ind <- c(single_factor_ind, i)
    
  } else if (nlevels(luad_factorTb[,i]) == 3 & any(is.na(levels(luad_factorTb[,i]))) |
             (nlevels(luad_factorTb[,i]) == 2 & !any(is.na(levels(luad_factorTb[,i]))))
          ) {
    binary_factor_ind <- c(binary_factor_ind, i)
    
  } else {
    multi_factor_ind <- c(multi_factor_ind, i)
  }
}

luad_polytomous_factor <- luad_factorTb[,multi_factor_ind]
luad_binary_factor <- luad_factorTb[,binary_factor_ind]
luad_single_factor <- luad_factorTb[,single_factor_ind]

Wilcoxon Test for Binary Variables

wilcox_df <- as.data.frame(matrix(nrow = ncol(luad_binary_factor),
                                ncol = ncol(luad_sampleScore)))

rownames(wilcox_df) <- colnames(luad_binary_factor)
colnames(wilcox_df) <- colnames(luad_sampleScore)

luad_wilcox_stat <- wilcox_df
luad_wilcox_pvalue <- wilcox_df
luad_wilcox_sig_stat <- wilcox_df

for (i in seq_len(ncol(luad_sampleScore))) {
  for (j in seq_len(ncol(luad_binary_factor))) {
    ## Wilcoxon test
    wilcox_test <- wilcox.test(luad_sampleScore[, i] ~ luad_binary_factor[,j], alternative="two.sided")
    
    ## Statistic
    stat <- wilcox_test$statistic
    luad_wilcox_stat[j, i] <- stat
    
    ## p-value
    pval <- wilcox_test$p.value
    luad_wilcox_pvalue[j, i] <- pval
      
    ## Wilcoxon statistic with significant p-value (<0.01)
    if (wilcox_test$p.value < 0.05) {
      luad_wilcox_sig_stat[j,i] <- wilcox_test$statistic
    }
  }
}

batch_binary_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(luad_wilcox_stat))
luad_wilcox_stat <- luad_wilcox_stat[-batch_binary_ind,]
luad_wilcox_pvalue <- luad_wilcox_pvalue[-batch_binary_ind,]
luad_wilcox_sig_stat <- luad_wilcox_sig_stat[-batch_binary_ind,]
luad_wilcox_sig_stat <- luad_wilcox_sig_stat[-which(rowSums(is.na(luad_wilcox_sig_stat)) >= 10),]

Binary Variable Visualizations

heatmap(as.matrix(luad_wilcox_stat), main = 'LUAD Cancer Binary Variables: Wilcoxon Test Statistics')

options(ztable.type='html')
ztable(luad_wilcox_sig_stat) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='LUAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)')

LUAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)
	RAV814	RAV316	RAV684	RAV1065	RAV2538	RAV630	RAV1154	RAV1303	RAV1668	RAV220	RAV435
ethnicity	729.00	1989.00		2122.00	2128.00				632.00
patient.clinical_cqcf.history_of_neoadjuvant_treatment	1300.00	131.00	1405.00		161.00	1286.00			1278.00
patient.drugs.drug.route_of_administrations.route_of_administration	9.00						183.00			18.00
patient.drugs.drug.therapy_ongoing	580.00			1432.00			1411.00			691.00
patient.ethnicity	729.00	1989.00		2122.00	2128.00				632.00
patient.follow_ups.follow_up.followup_case_report_form_submission_reason	2422.00									2469.00
patient.follow_ups.follow_up.person_neoplasm_cancer_status	21269.00		19946.00		14601.00					20276.00
patient.follow_ups.follow_up.vital_status	25467.00	15906.00	25789.00		16984.00	23294.00			23123.00	24986.00
patient.gender	39477.00	27767.00	37861.00	27707.00	26418.00	39207.00			36675.00	37964.00
patient.history_of_neoadjuvant_treatment	1297.00	130.00	1402.00		161.00	1285.00			1276.00
patient.kras_gene_analysis_performed		9167.00	6243.00			6412.00
patient.new_tumor_events.new_tumor_event_after_initial_treatment	3074.00						1600.00			2899.00	2911.00
patient.person_neoplasm_cancer_status	20905.00				14528.00					20468.00
patient.tissue_prospective_collection_indicator	23866.00	39546.00	24774.00		38226.00	26509.00		36231.00	28479.00	25282.00
patient.tissue_retrospective_collection_indicator	40250.00	24570.00	39342.00		25890.00	37607.00		27885.00	35637.00	38834.00
patient.vital_status	30333.00	20152.00	27715.00	20890.00	18758.00				29370.00	29548.00
patient.bcr_canonical_check.bcr_patient_canonical_status	34332.00	24903.00	36345.00		27365.00		34846.00	34838.00	34964.00		24309.00
patient.biospecimen_cqcf.history_of_neoadjuvant_treatment	1300.00	131.00	1405.00		161.00	1286.00			1278.00
patient.biospecimen_cqcf.path_confirm_diagnosis_matching		1599.00	3578.00			3575.00
patient.biospecimen_cqcf.path_confirm_report_attached		7724.00		7389.00	7373.00	3169.00	2626.00		3275.00		9201.00
patient.samples.sample.2.is_ffpe							5020.00	4751.00			1707.00
patient.samples.sample.2.oct_embedded	889.00	881.00	386.00					206.00			930.00
gender	39477.00	27767.00	37861.00	27707.00	26418.00	39207.00			36675.00	37964.00
patient.follow_ups.follow_up.additional_radiation_therapy					1941.00				2969.00
patient.follow_ups.follow_up.new_tumor_event_after_initial_treatment	22530.00	16307.00	22410.00		17071.00	22572.00
patient.follow_ups.follow_up.targeted_molecular_therapy	23354.00	17589.00		17572.00	16938.00				22963.00	23774.00
patient.targeted_molecular_therapy		1636.00									1555.00
Survival	4176.00	6729.00		6738.00	7152.00				3847.00	4389.00
Transversion.High.Low				7705.00	7299.00				5053.00
Oncogene.Negative.or.Positive.Groups	4258.00			7364.00	7402.00
FCNA_CCND1				3599.00	3534.00				1626.00	1843.00

Categorized by the LUAD cancer binary variable “transversion status”, the sample scores for RAVs 1065 and 1668 seem to have an inverse linear relationship.

graph_categorical(1668, 1065, luad_binary_factor, luad_sampleScore,
                  "Transversion.High.Low",
                  "Score Plot for Transversion ", "Transversion Status")

RAV 1065 has key phrases connected to lung cancer clinical attributes.

drawWordcloud(RAVmodel, 1065)

Kruskal-Wallis Test for Polytomous Variables

kw_df <- as.data.frame(matrix(nrow = ncol(luad_polytomous_factor),
                                ncol = ncol(luad_sampleScore)))

rownames(kw_df) <- colnames(luad_polytomous_factor)
colnames(kw_df) <- colnames(luad_sampleScore)

luad_kw_stat <- kw_df
luad_kw_pvalue <- kw_df
luad_kw_sig_stat <- kw_df

for (i in seq_len(ncol(luad_sampleScore))) {
  for (j in seq_len(ncol(luad_polytomous_factor))) {
    ## Kruskal-Wallis Test
    kruskal_test <- kruskal.test(luad_sampleScore[, i] ~ luad_polytomous_factor[,j])
    
    ## Kruskal-Wallis Statistic
    kw_val <-  kruskal_test$statistic
    luad_kw_stat[j, i] <- kw_val

    ## p-value
    pval <-  kruskal_test$p.value
    luad_kw_pvalue[j, i] <- pval
    
    ## Kruskal-Wallis Statistic with significant p-value (<0.05)
    if (kruskal_test$p.value < 0.05) {
    luad_kw_sig_stat[j,i] <- kruskal_test$statistic
    }
  }
}

batch_polytomous_ind <- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
                  rownames(luad_kw_stat))
luad_kw_stat <- luad_kw_stat[-batch_polytomous_ind,]
luad_kw_pvalue <- luad_kw_pvalue[-batch_polytomous_ind,]
luad_kw_sig_stat <- luad_kw_sig_stat[-batch_polytomous_ind,]
luad_kw_sig_stat <- luad_kw_sig_stat[-which(rowSums(is.na(luad_kw_sig_stat)) >= 10),]

Polytomous Variable Visualizations

heatmap(as.matrix(luad_kw_stat), main = "LUAD Cancer Polytomous Variables: Kruskal-Wallis Test Statistics")

options(ztable.type='html')
ztable(luad_kw_sig_stat) %>%
  makeHeatmap(palette = 'Blues') %>%
  print(caption='LUAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)')

LUAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)
	RAV814	RAV316	RAV684	RAV1065	RAV2538	RAV630	RAV1154	RAV1303	RAV1668	RAV220	RAV435
pathology_T_stage	25.75	33.17	28.47	19.64	29.55	21.04			18.16	30.15	16.92
pathology_N_stage					11.71					14.28
pathology_M_stage		22.12			12.68
patient.clinical_cqcf.consent_or_death_status	9.59	12.62	10.59				10.78	23.58	7.56		13.11
patient.clinical_cqcf.country		22.11	24.73			18.68	28.39	16.75		16.90
patient.clinical_cqcf.histological_type	41.13	26.37		39.84	41.43	20.74			40.80	27.24
patient.clinical_cqcf.history_of_prior_malignancy	7.10		7.06							8.05
patient.histological_type	41.17			37.31	38.24				38.28	24.80
patient.icd_10	12.65	13.12		17.78	14.86				17.87
patient.icd_o_3_histology	40.35	25.12		38.87	39.30				40.96	24.51
patient.icd_o_3_site		14.16	11.63						11.25
patient.prior_dx		25.49	27.21					8.08		10.79
patient.stage_event.pathologic_stage				20.75	26.65					23.37
patient.stage_event.tnm_categories.pathologic_categories.pathologic_m		22.12			12.68
patient.stage_event.tnm_categories.pathologic_categories.pathologic_n					11.71					14.28
patient.stage_event.tnm_categories.pathologic_categories.pathologic_t	25.75	33.17	28.47	19.64	29.55	21.04			18.16	30.15	16.92
patient.tissue_source_site	73.16	114.81	112.21	48.23	59.48	87.89	77.79	94.77	51.29	64.82	80.34
patient.biospecimen_cqcf.consent_or_death_status	9.59	12.62	10.59				10.78	23.58	7.56		13.11
patient.biospecimen_cqcf.country		22.11	24.73			18.68	28.39	16.75		16.90
patient.biospecimen_cqcf.histological_type	41.13	26.37		39.84	41.43	20.74			40.80	27.24
patient.biospecimen_cqcf.vessel_used	19.67	28.68	30.08		13.30	20.38				12.60
patient.samples.sample.2.sample_type	22.11	11.63	11.80		10.79		12.95	10.43		9.89
patient.samples.sample.2.vial_number							10.73				10.18
histological_type	41.17			37.31	38.24				38.28	24.80
patient.tobacco_smoking_history	25.68			55.66	29.74

The score plot shows sample scores for the LUAD cancer attribute, “tobacco smoking history” that were positive for RAV 2538, but negative for RAV 1065 and vice versa, negative for RAV 2538, but positive for RAV 1065.

graph_categorical(1065, 2538, luad_factorTb, luad_sampleScore,
                  "patient.tobacco_smoking_history",
                  "Score Plot for Tobacco Smoking History", "Tobacco Smoking History")

RAV 814 additionally had statistically significant Kruskal-Wallis values for the “tobacco smoking history” attribute.

drawWordcloud(RAVmodel, 814)

Confirmation of RAV Significance for Variable of Interest

See significant p-values for RAVs 814, 1065, 2538, 1668, and 220

for (i in seq_len(ncol(luad_sampleScore))) {
  print(colnames(luad_sampleScore)[i])
    xy <- pairwise.wilcox.test(luad_sampleScore[, i], luad_factorTb[,"patient.tobacco_smoking_history"], p.adjust.method = "bonferroni")
    print(xy)
}

[1] "RAV814"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1.4e-05                                    
current reformed smoker, duration not specified 1.00000                                    
current smoker                                  0.08404                                    
lifelong non-smoker                             0.00074                                    
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.00000                               
current smoker                                  8.0e-10                               
lifelong non-smoker                             1.00000                               
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.00000                                        
lifelong non-smoker                             1.00000                                        
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1.9e-07       

P value adjustment method: bonferroni 
[1] "RAV316"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1                                          
current reformed smoker, duration not specified 1                                          
current smoker                                  1                                          
lifelong non-smoker                             1                                          
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1                                     
current smoker                                  1                                     
lifelong non-smoker                             1                                     
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1                                              
lifelong non-smoker                             1                                              
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1             

P value adjustment method: bonferroni 
[1] "RAV684"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1.00                                       
current reformed smoker, duration not specified 0.28                                       
current smoker                                  1.00                                       
lifelong non-smoker                             1.00                                       
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 0.43                                  
current smoker                                  1.00                                  
lifelong non-smoker                             1.00                                  
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  0.91                                           
lifelong non-smoker                             0.43                                           
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1.00          

P value adjustment method: bonferroni 
[1] "RAV1065"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          2.0e-05                                    
current reformed smoker, duration not specified 1.0000                                     
current smoker                                  0.0027                                     
lifelong non-smoker                             0.0050                                     
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 0.8548                                
current smoker                                  4.8e-11                               
lifelong non-smoker                             1.0000                                
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.0000                                         
lifelong non-smoker                             1.0000                                         
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             7.9e-07       

P value adjustment method: bonferroni 
[1] "RAV2538"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          3.9e-05                                    
current reformed smoker, duration not specified 1.0000                                     
current smoker                                  0.0530                                     
lifelong non-smoker                             0.0029                                     
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.0000                                
current smoker                                  3.4e-09                               
lifelong non-smoker                             1.0000                                
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.0000                                         
lifelong non-smoker                             1.0000                                         
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             2.2e-06       

P value adjustment method: bonferroni 
[1] "RAV630"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1                                          
current reformed smoker, duration not specified 1                                          
current smoker                                  1                                          
lifelong non-smoker                             1                                          
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1                                     
current smoker                                  1                                     
lifelong non-smoker                             1                                     
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1                                              
lifelong non-smoker                             1                                              
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1             

P value adjustment method: bonferroni 
[1] "RAV1154"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1.00                                       
current reformed smoker, duration not specified 1.00                                       
current smoker                                  1.00                                       
lifelong non-smoker                             0.15                                       
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.00                                  
current smoker                                  1.00                                  
lifelong non-smoker                             1.00                                  
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.00                                           
lifelong non-smoker                             1.00                                           
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             0.23          

P value adjustment method: bonferroni 
[1] "RAV1303"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1                                          
current reformed smoker, duration not specified 1                                          
current smoker                                  1                                          
lifelong non-smoker                             1                                          
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1                                     
current smoker                                  1                                     
lifelong non-smoker                             1                                     
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1                                              
lifelong non-smoker                             1                                              
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1             

P value adjustment method: bonferroni 
[1] "RAV1668"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          9.2e-05                                    
current reformed smoker, duration not specified 1.0000                                     
current smoker                                  0.0075                                     
lifelong non-smoker                             0.0374                                     
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.0000                                
current smoker                                  7.3e-10                               
lifelong non-smoker                             1.0000                                
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.0000                                         
lifelong non-smoker                             1.0000                                         
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1.2e-05       

P value adjustment method: bonferroni 
[1] "RAV220"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          0.14475                                    
current reformed smoker, duration not specified 1.00000                                    
current smoker                                  0.30075                                    
lifelong non-smoker                             0.07673                                    
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1.00000                               
current smoker                                  0.00042                               
lifelong non-smoker                             1.00000                               
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1.00000                                        
lifelong non-smoker                             1.00000                                        
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             0.00069       

P value adjustment method: bonferroni 
[1] "RAV435"

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"] 

                                                current reformed smoker for < or = 15 years
current reformed smoker for > 15 years          1                                          
current reformed smoker, duration not specified 1                                          
current smoker                                  1                                          
lifelong non-smoker                             1                                          
                                                current reformed smoker for > 15 years
current reformed smoker for > 15 years          -                                     
current reformed smoker, duration not specified 1                                     
current smoker                                  1                                     
lifelong non-smoker                             1                                     
                                                current reformed smoker, duration not specified
current reformed smoker for > 15 years          -                                              
current reformed smoker, duration not specified -                                              
current smoker                                  1                                              
lifelong non-smoker                             1                                              
                                                current smoker
current reformed smoker for > 15 years          -             
current reformed smoker, duration not specified -             
current smoker                                  -             
lifelong non-smoker                             1             

P value adjustment method: bonferroni