::opts_chunk$set(echo = TRUE)
knitr
suppressPackageStartupMessages({
# BiocManager
library(GenomicSuperSignature)
library(curatedTCGAData)
library(MultiAssayExperiment)
library(TCGAutils)
library(ComplexHeatmap)
# CRAN
library(tidyverse) # includes dplyr, ggplot2, magrittr, tidyr
library(magick)
library(wordcloud)
library(ztable)
library(metafolio)
library(randomForest)
library(caret)
})
Deliverable 1: Dashboard of RAVs and Attributes for TCGA Colorectal (COAD) and Lung (LUAD) Cancers
Preparation
Packages
Load all Replicated Axes of Variation (RAVs) from GenomicSuperSignature package
<- getModel('C2', load=TRUE) RAVmodel
Load TCGA Data for Colorectal and Lung Cancers from curatedTCGAData package
data('diseaseCodes', package = "TCGAutils")
<- curatedTCGAData(
data diseaseCode = c("COAD", "LUAD"),
assays = "RNASeq2Gene", version = "2.0.1",
dry.run = FALSE
)
<- TCGAsplitAssays(data, c("01", "11")) data
Create Variables for RNA Sequencing Data and Clinical Attributes
<- getWithColData(data,
coad_cancer '01_COAD_RNASeq2Gene-20160128')
assay(coad_cancer) <- log2(assay(coad_cancer) + 1)
<- colData(coad_cancer)
coad_meta
<- getWithColData(data,
luad_cancer '01_LUAD_RNASeq2Gene-20160128')
assay(luad_cancer) <- log2(assay(luad_cancer) + 1)
<- colData(luad_cancer) luad_meta
Function to Display Categorical Attributes’ Score Plots
<- function(RAV1, RAV2, factor.df, sampleScore.df, phenotype, graph_title = "Score Plot", legend_title = "Legend") {
graph_categorical
# Sanity Check 1: Check if RAVs are available; if RAVs don't exist in sampleScore.df, the function will error: "undefined columns selected"
# Sanity Check 2: phenotype is in factor.df
if(nlevels(factor.df[[phenotype]]) == 0) {
print("Phenotype is not in provided factor dataframe")
}
<- paste0("RAV", RAV1)
sampleScore1 <- paste0("RAV", RAV2)
sampleScore2
<- sampleScore.df[, c(sampleScore1, sampleScore2)]
new_df colnames(new_df)[1] <- "sampleScore1"
colnames(new_df)[2] <- "sampleScore2"
<- data.frame(new_df, factor.df)
new_df <- new_df[which(!is.na(factor(new_df[[phenotype]]))),] #Filter out rows of the phenotype that are N/A
plot_data #print(plot_data[[phenotype]])
<- gg_color_hue(length(unique(plot_data[[phenotype]]))) # Count number of factor levels excluding NA
colors <- c(colors)
colors.toplot
<- ggplot(plot_data,
pA aes(x = sampleScore1, y = sampleScore2, color = plot_data[[phenotype]])) +
geom_point() +
labs(title = graph_title) +
scale_color_manual(values = colors.toplot, name = legend_title) +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_vline(xintercept = 0, linetype = 'dashed') +
xlab(sampleScore1) + ylab(sampleScore2)
print(pA)
}
Exploratory Data Analysis (EDA)
Top Validated RAVs Heatmap for COAD Cancer Samples
<- validate(coad_cancer, RAVmodel)
validated_coad_cancer heatmapTable(validated_coad_cancer, RAVmodel, num.out=15, column_title = "COAD Cancer")
Data Cleaning
Select only clinical attributes with <10% missing data
# Select columns with >10% completeness
<- which(colSums(!is.na(coad_meta)) > round(nrow(coad_meta)/10))
keep_attribute_ind
<- coad_meta[keep_attribute_ind] %>% subset(select=-patientID) coad_dat
Calculate validation sample scores for each patient cancer sample and RAV combination for the top 15 validated RAVs
# Calculate validation scores
<- calculateScore(coad_cancer, RAVmodel)
coad_all_sampleScores
<- validatedSignatures(validated_coad_cancer, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE) #Using Pearson Coefficient
validated_ind
## Subset sampleScore to join with MCPcounter
<- coad_all_sampleScores[, validated_ind] %>% as.data.frame() coad_sampleScore
Parse variables out to numeric and categorical variables
# Check for data types
unique(sapply(coad_cancer@colData, type))
[1] "character" "integer" "double"
<- coad_dat[, sapply(coad_dat, class) == 'character']
coad_charcTb <- coad_dat[, sapply(coad_dat, class) %in% c('numeric', 'integer')] coad_numTb
# Convert to factor data type
<- coad_dat[, sapply(coad_dat, class) == 'character']
coad_factorTb
#factorTb <- charcTb
sapply(coad_factorTb, is.character)] <- lapply(coad_factorTb[sapply(coad_factorTb, is.character)], factor, exclude = NULL) coad_factorTb[
<- c()
single_factor_ind <- c()
binary_factor_ind <- c()
multi_factor_ind
# Testing factor grouping
for (i in 1:length(coad_factorTb)) {
if (nlevels(coad_factorTb[,i]) == 1 |
nlevels(coad_factorTb[,i]) == 2 & any(is.na(levels(coad_factorTb[,i]))))
(
) {<- c(single_factor_ind, i)
single_factor_ind
else if (nlevels(coad_factorTb[,i]) == 3 & any(is.na(levels(coad_factorTb[,i]))) |
} nlevels(coad_factorTb[,i]) == 2 & !any(is.na(levels(coad_factorTb[,i]))))
(
) {<- c(binary_factor_ind, i)
binary_factor_ind
else {
} <- c(multi_factor_ind, i)
multi_factor_ind
}
}
<- coad_factorTb[,multi_factor_ind]
coad_polytomous_factor <- coad_factorTb[,binary_factor_ind]
coad_binary_factor <- coad_factorTb[,single_factor_ind] coad_single_factor
Wilcoxon Test for Binary Variables
<- as.data.frame(matrix(nrow = ncol(coad_binary_factor),
wilcox_df ncol = ncol(coad_sampleScore)))
rownames(wilcox_df) <- colnames(coad_binary_factor)
colnames(wilcox_df) <- colnames(coad_sampleScore)
<- wilcox_df
coad_wilcox_stat <- wilcox_df
coad_wilcox_pvalue <- wilcox_df
coad_wilcox_sig_stat
for (i in seq_len(ncol(coad_sampleScore))) {
for (j in seq_len(ncol(coad_binary_factor))) {
## Wilcoxon test
<- wilcox.test(coad_sampleScore[, i] ~ coad_binary_factor[,j], alternative="two.sided")
wilcox_test
## Statistic
<- wilcox_test$statistic
stat <- stat
coad_wilcox_stat[j, i]
## p-value
<- wilcox_test$p.value
pval <- pval
coad_wilcox_pvalue[j, i]
## Wilcoxon statistic with significant p-value (<0.01)
if (wilcox_test$p.value < 0.01) {
<- wilcox_test$statistic
coad_wilcox_sig_stat[j,i]
}
}
}
<- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
batch_binary_ind rownames(coad_wilcox_stat))
<- coad_wilcox_stat[-batch_binary_ind,]
coad_wilcox_stat <- coad_wilcox_pvalue[-batch_binary_ind,]
coad_wilcox_pvalue <- coad_wilcox_sig_stat[-batch_binary_ind,]
coad_wilcox_sig_stat <- coad_wilcox_sig_stat[-which(rowSums(is.na(coad_wilcox_sig_stat)) >= 13),] coad_wilcox_sig_stat
Binary Variable Visualizations
heatmap(as.matrix(coad_wilcox_stat), main = 'COAD Cancer Binary Variables: Wilcoxon Test Statistics')
options(ztable.type='html')
ztable(coad_wilcox_sig_stat ) %>%
makeHeatmap(palette = 'Blues') %>%
print(caption='COAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)')
RAV188 | RAV1575 | RAV324 | RAV832 | RAV61 | RAV1008 | RAV868 | RAV64 | RAV220 | RAV834 | RAV833 | RAV438 | RAV190 | RAV1166 | RAV192 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
patient.clinical_cqcf.histological_type | 3512.00 | 3239.00 | 6327.00 | ||||||||||||
Categorized by the COAD cancer binary variable “lymphatic invasion”, the majority of sample scores for RAVs 832 and 833 are over 0.
graph_categorical(833, 832, coad_binary_factor, coad_sampleScore,
"patient.lymphatic_invasion",
"Score Plot for Lymphatic Invasion", "Lymphatic Invasion")
RAV 833 has key phrases connected to possible “lymphatic invasion” data. See “T-Lymphocytes”, “B Cells”, “Immune System” in wordcloud.
drawWordcloud(RAVmodel, 833)
Kruskal-Wallis Test for Polytomous Variables
<- as.data.frame(matrix(nrow = ncol(coad_polytomous_factor),
kw_df ncol = ncol(coad_sampleScore)))
rownames(kw_df) <- colnames(coad_polytomous_factor)
colnames(kw_df) <- colnames(coad_sampleScore)
<- kw_df
coad_kw_stat <- kw_df
coad_kw_pvalue <- kw_df
coad_kw_sig_stat
for (i in seq_len(ncol(coad_sampleScore))) {
for (j in seq_len(ncol(coad_polytomous_factor))) {
## Kruskal-Wallis Test
<- kruskal.test(coad_sampleScore[, i] ~ coad_polytomous_factor[,j])
kruskal_test
## Kruskal-Wallis Statistic
<- kruskal_test$statistic
kw_val <- kw_val
coad_kw_stat[j, i]
## p-value
<- kruskal_test$p.value
pval <- pval
coad_kw_pvalue[j, i]
## Kruskal-Wallis Statistic with significant p-value (<0.05)
if (kruskal_test$p.value < 0.05) {
<- kruskal_test$statistic
coad_kw_sig_stat[j,i]
}
}
}
<- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
batch_polytomous_ind rownames(coad_kw_stat))
<- coad_kw_stat[-batch_polytomous_ind,]
coad_kw_stat <- coad_kw_pvalue[-batch_polytomous_ind,]
coad_kw_pvalue <- coad_kw_sig_stat[-batch_polytomous_ind,]
coad_kw_sig_stat <- coad_kw_sig_stat[-which(rowSums(is.na(coad_kw_sig_stat)) >= 13),] coad_kw_sig_stat
Polytomous Variable Visualizations
heatmap(as.matrix(coad_kw_stat), main = "COAD Cancer Polytomous Variables: Kruskal-Wallis Test Statistics")
options(ztable.type='html')
ztable(coad_kw_sig_stat) %>%
makeHeatmap(palette = 'Blues') %>%
print(caption='COAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)')
RAV188 | RAV1575 | RAV324 | RAV832 | RAV61 | RAV1008 | RAV868 | RAV64 | RAV220 | RAV834 | RAV833 | RAV438 | RAV190 | RAV1166 | RAV192 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pathology_N_stage | 15.17 | 15.19 | 14.30 | ||||||||||||
The score plot shows factor level clustering for the COAD cancer attribute, “microsatellite instability test results”. There is a distinction between the MSI-H (microsatellite instability - high) versus the MSS (microsatellite stable) and MSI-L (microsatellite instability - low) sample scores.
graph_categorical(834, 190, coad_factorTb, coad_sampleScore,
"patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status",
"Score Plot for Microsatellite Status", "Microsatellite Status")
RAVs 834 and 190 have statistically significant Kruskal-Wallis values for the “microsatellite instability test results” attribute.
drawWordcloud(RAVmodel, 834)
drawWordcloud(RAVmodel, 190)
Confirmation of RAV Significance for Variable of Interest
See significant p-values for RAVs 834, 190, and 1166
for (i in seq_len(ncol(coad_sampleScore))) {
print(colnames(coad_sampleScore)[i])
<- pairwise.wilcox.test(coad_sampleScore[, i], coad_factorTb[,"patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"], p.adjust.method = "bonferroni")
xy print(xy)
}
[1] "RAV188"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.16 - -
msi-l 0.20 1.00 -
mss 0.20 1.00 1.00
P value adjustment method: bonferroni
[1] "RAV1575"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.20 - -
msi-l 0.48 1.00 -
mss 0.25 1.00 1.00
P value adjustment method: bonferroni
[1] "RAV324"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 1 - -
msi-l 1 1 -
mss 1 1 1
P value adjustment method: bonferroni
[1] "RAV832"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.89 - -
msi-l 1.00 1.00 -
mss 1.00 0.99 1.00
P value adjustment method: bonferroni
[1] "RAV61"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.28 - -
msi-l 0.16 1.00 -
mss 0.24 1.00 1.00
P value adjustment method: bonferroni
[1] "RAV1008"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.82 - -
msi-l 0.71 1.00 -
mss 0.39 0.25 1.00
P value adjustment method: bonferroni
[1] "RAV868"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 1 - -
msi-l 1 1 -
mss 1 1 1
P value adjustment method: bonferroni
[1] "RAV64"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.25 - -
msi-l 0.63 1.00 -
mss 0.32 1.00 1.00
P value adjustment method: bonferroni
[1] "RAV220"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.25 - -
msi-l 0.63 1.00 -
mss 0.27 1.00 1.00
P value adjustment method: bonferroni
[1] "RAV834"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.28 - -
msi-l 1.00 1.3e-12 -
mss 1.00 < 2e-16 1.00
P value adjustment method: bonferroni
[1] "RAV833"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 1.00 - -
msi-l 1.00 1.00 -
mss 0.94 1.00 1.00
P value adjustment method: bonferroni
[1] "RAV438"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 1 - -
msi-l 1 1 -
mss 1 1 1
P value adjustment method: bonferroni
[1] "RAV190"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 1.0000 - -
msi-l 0.7053 0.1356 -
mss 0.5525 0.0047 1.0000
P value adjustment method: bonferroni
[1] "RAV1166"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.223 - -
msi-l 1.000 0.111 -
mss 1.000 0.009 1.000
P value adjustment method: bonferroni
[1] "RAV192"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: coad_sampleScore[, i] and coad_factorTb[, "patient.microsatellite_instability_test_results.microsatellite_instability_test_result.mononucleotide_and_dinucleotide_marker_panel_analysis_status"]
indeterminate msi-h msi-l
msi-h 0.382 - -
msi-l 1.000 0.408 -
mss 1.000 0.098 1.000
P value adjustment method: bonferroni
Top Validated RAVs Heatmap for LUAD Cancer Samples
<- validate(luad_cancer, RAVmodel)
validated_luad_cancer heatmapTable(validated_luad_cancer, RAVmodel, num.out=15, column_title = "LUAD Cancer")
Data Cleaning
Select only clinical attributes with <10% missing data
# Select columns with >10% completeness
<- which(colSums(!is.na(luad_meta)) > round(nrow(luad_meta)/10))
keep_attribute_ind
<- luad_meta[keep_attribute_ind] %>% subset(select=-patientID) luad_dat
Calculate validation sample scores for each patient cancer sample and RAV combination for the top 15 validated RAVs
# Calculate validation scores
<- calculateScore(luad_cancer, RAVmodel)
luad_all_sampleScores
<- validatedSignatures(validated_luad_cancer, RAVmodel, num.out = 15, scoreCutoff = 0.45, indexOnly = TRUE) #Using Pearson Coefficient
validated_ind
## Subset sampleScore to join with MCPcounter
<- luad_all_sampleScores[, validated_ind] %>% as.data.frame() luad_sampleScore
Parse variables out to numeric and categorical variables
# Check for data types
unique(sapply(luad_cancer@colData, type))
[1] "character" "integer" "double"
<- luad_dat[, sapply(luad_dat, class) == 'character']
luad_charcTb <- luad_dat[, sapply(luad_dat, class) %in% c('numeric', 'integer')] luad_numTb
# Convert to factor data type
<- luad_dat[, sapply(luad_dat, class) == 'character']
luad_factorTb
#factorTb <- charcTb
sapply(luad_factorTb, is.character)] <- lapply(luad_factorTb[sapply(luad_factorTb, is.character)], factor, exclude = NULL) luad_factorTb[
<- c()
single_factor_ind <- c()
binary_factor_ind <- c()
multi_factor_ind
# Testing factor grouping
for (i in 1:length(luad_factorTb)) {
if (nlevels(luad_factorTb[,i]) == 1 |
nlevels(luad_factorTb[,i]) == 2 & any(is.na(levels(luad_factorTb[,i]))))
(
) {<- c(single_factor_ind, i)
single_factor_ind
else if (nlevels(luad_factorTb[,i]) == 3 & any(is.na(levels(luad_factorTb[,i]))) |
} nlevels(luad_factorTb[,i]) == 2 & !any(is.na(levels(luad_factorTb[,i]))))
(
) {<- c(binary_factor_ind, i)
binary_factor_ind
else {
} <- c(multi_factor_ind, i)
multi_factor_ind
}
}
<- luad_factorTb[,multi_factor_ind]
luad_polytomous_factor <- luad_factorTb[,binary_factor_ind]
luad_binary_factor <- luad_factorTb[,single_factor_ind] luad_single_factor
Wilcoxon Test for Binary Variables
<- as.data.frame(matrix(nrow = ncol(luad_binary_factor),
wilcox_df ncol = ncol(luad_sampleScore)))
rownames(wilcox_df) <- colnames(luad_binary_factor)
colnames(wilcox_df) <- colnames(luad_sampleScore)
<- wilcox_df
luad_wilcox_stat <- wilcox_df
luad_wilcox_pvalue <- wilcox_df
luad_wilcox_sig_stat
for (i in seq_len(ncol(luad_sampleScore))) {
for (j in seq_len(ncol(luad_binary_factor))) {
## Wilcoxon test
<- wilcox.test(luad_sampleScore[, i] ~ luad_binary_factor[,j], alternative="two.sided")
wilcox_test
## Statistic
<- wilcox_test$statistic
stat <- stat
luad_wilcox_stat[j, i]
## p-value
<- wilcox_test$p.value
pval <- pval
luad_wilcox_pvalue[j, i]
## Wilcoxon statistic with significant p-value (<0.01)
if (wilcox_test$p.value < 0.05) {
<- wilcox_test$statistic
luad_wilcox_sig_stat[j,i]
}
}
}
<- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
batch_binary_ind rownames(luad_wilcox_stat))
<- luad_wilcox_stat[-batch_binary_ind,]
luad_wilcox_stat <- luad_wilcox_pvalue[-batch_binary_ind,]
luad_wilcox_pvalue <- luad_wilcox_sig_stat[-batch_binary_ind,]
luad_wilcox_sig_stat <- luad_wilcox_sig_stat[-which(rowSums(is.na(luad_wilcox_sig_stat)) >= 10),] luad_wilcox_sig_stat
Binary Variable Visualizations
heatmap(as.matrix(luad_wilcox_stat), main = 'LUAD Cancer Binary Variables: Wilcoxon Test Statistics')
options(ztable.type='html')
ztable(luad_wilcox_sig_stat) %>%
makeHeatmap(palette = 'Blues') %>%
print(caption='LUAD Cancer Binary Variables with a Significant Wilcoxon Statistic (p-value < 0.01)')
RAV814 | RAV316 | RAV684 | RAV1065 | RAV2538 | RAV630 | RAV1154 | RAV1303 | RAV1668 | RAV220 | RAV435 | |
---|---|---|---|---|---|---|---|---|---|---|---|
ethnicity | 729.00 | 1989.00 | 2122.00 | 2128.00 | 632.00 | ||||||
Categorized by the LUAD cancer binary variable “transversion status”, the sample scores for RAVs 1065 and 1668 seem to have an inverse linear relationship.
graph_categorical(1668, 1065, luad_binary_factor, luad_sampleScore,
"Transversion.High.Low",
"Score Plot for Transversion ", "Transversion Status")
RAV 1065 has key phrases connected to lung cancer clinical attributes.
drawWordcloud(RAVmodel, 1065)
Kruskal-Wallis Test for Polytomous Variables
<- as.data.frame(matrix(nrow = ncol(luad_polytomous_factor),
kw_df ncol = ncol(luad_sampleScore)))
rownames(kw_df) <- colnames(luad_polytomous_factor)
colnames(kw_df) <- colnames(luad_sampleScore)
<- kw_df
luad_kw_stat <- kw_df
luad_kw_pvalue <- kw_df
luad_kw_sig_stat
for (i in seq_len(ncol(luad_sampleScore))) {
for (j in seq_len(ncol(luad_polytomous_factor))) {
## Kruskal-Wallis Test
<- kruskal.test(luad_sampleScore[, i] ~ luad_polytomous_factor[,j])
kruskal_test
## Kruskal-Wallis Statistic
<- kruskal_test$statistic
kw_val <- kw_val
luad_kw_stat[j, i]
## p-value
<- kruskal_test$p.value
pval <- pval
luad_kw_pvalue[j, i]
## Kruskal-Wallis Statistic with significant p-value (<0.05)
if (kruskal_test$p.value < 0.05) {
<- kruskal_test$statistic
luad_kw_sig_stat[j,i]
}
}
}
<- grep('analyte|analytes|portion|procurement|aliquot|uuid|barcode',
batch_polytomous_ind rownames(luad_kw_stat))
<- luad_kw_stat[-batch_polytomous_ind,]
luad_kw_stat <- luad_kw_pvalue[-batch_polytomous_ind,]
luad_kw_pvalue <- luad_kw_sig_stat[-batch_polytomous_ind,]
luad_kw_sig_stat <- luad_kw_sig_stat[-which(rowSums(is.na(luad_kw_sig_stat)) >= 10),] luad_kw_sig_stat
Polytomous Variable Visualizations
heatmap(as.matrix(luad_kw_stat), main = "LUAD Cancer Polytomous Variables: Kruskal-Wallis Test Statistics")
options(ztable.type='html')
ztable(luad_kw_sig_stat) %>%
makeHeatmap(palette = 'Blues') %>%
print(caption='LUAD Cancer Polytomous Variables with a Significant Kruskal-Wallis Statistic (p-value < 0.05)')
RAV814 | RAV316 | RAV684 | RAV1065 | RAV2538 | RAV630 | RAV1154 | RAV1303 | RAV1668 | RAV220 | RAV435 | |
---|---|---|---|---|---|---|---|---|---|---|---|
pathology_T_stage | 25.75 | 33.17 | 28.47 | 19.64 | 29.55 | 21.04 | 18.16 | 30.15 | 16.92 | ||
The score plot shows sample scores for the LUAD cancer attribute, “tobacco smoking history” that were positive for RAV 2538, but negative for RAV 1065 and vice versa, negative for RAV 2538, but positive for RAV 1065.
graph_categorical(1065, 2538, luad_factorTb, luad_sampleScore,
"patient.tobacco_smoking_history",
"Score Plot for Tobacco Smoking History", "Tobacco Smoking History")
RAV 814 additionally had statistically significant Kruskal-Wallis values for the “tobacco smoking history” attribute.
drawWordcloud(RAVmodel, 814)
Confirmation of RAV Significance for Variable of Interest
See significant p-values for RAVs 814, 1065, 2538, 1668, and 220
for (i in seq_len(ncol(luad_sampleScore))) {
print(colnames(luad_sampleScore)[i])
<- pairwise.wilcox.test(luad_sampleScore[, i], luad_factorTb[,"patient.tobacco_smoking_history"], p.adjust.method = "bonferroni")
xy print(xy)
}
[1] "RAV814"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 1.4e-05
current reformed smoker, duration not specified 1.00000
current smoker 0.08404
lifelong non-smoker 0.00074
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 1.00000
current smoker 8.0e-10
lifelong non-smoker 1.00000
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1.00000
lifelong non-smoker 1.00000
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 1.9e-07
P value adjustment method: bonferroni
[1] "RAV316"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 1
current reformed smoker, duration not specified 1
current smoker 1
lifelong non-smoker 1
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 1
current smoker 1
lifelong non-smoker 1
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1
lifelong non-smoker 1
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 1
P value adjustment method: bonferroni
[1] "RAV684"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 1.00
current reformed smoker, duration not specified 0.28
current smoker 1.00
lifelong non-smoker 1.00
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 0.43
current smoker 1.00
lifelong non-smoker 1.00
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 0.91
lifelong non-smoker 0.43
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 1.00
P value adjustment method: bonferroni
[1] "RAV1065"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 2.0e-05
current reformed smoker, duration not specified 1.0000
current smoker 0.0027
lifelong non-smoker 0.0050
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 0.8548
current smoker 4.8e-11
lifelong non-smoker 1.0000
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1.0000
lifelong non-smoker 1.0000
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 7.9e-07
P value adjustment method: bonferroni
[1] "RAV2538"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 3.9e-05
current reformed smoker, duration not specified 1.0000
current smoker 0.0530
lifelong non-smoker 0.0029
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 1.0000
current smoker 3.4e-09
lifelong non-smoker 1.0000
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1.0000
lifelong non-smoker 1.0000
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 2.2e-06
P value adjustment method: bonferroni
[1] "RAV630"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 1
current reformed smoker, duration not specified 1
current smoker 1
lifelong non-smoker 1
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 1
current smoker 1
lifelong non-smoker 1
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1
lifelong non-smoker 1
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 1
P value adjustment method: bonferroni
[1] "RAV1154"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 1.00
current reformed smoker, duration not specified 1.00
current smoker 1.00
lifelong non-smoker 0.15
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 1.00
current smoker 1.00
lifelong non-smoker 1.00
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1.00
lifelong non-smoker 1.00
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 0.23
P value adjustment method: bonferroni
[1] "RAV1303"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 1
current reformed smoker, duration not specified 1
current smoker 1
lifelong non-smoker 1
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 1
current smoker 1
lifelong non-smoker 1
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1
lifelong non-smoker 1
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 1
P value adjustment method: bonferroni
[1] "RAV1668"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 9.2e-05
current reformed smoker, duration not specified 1.0000
current smoker 0.0075
lifelong non-smoker 0.0374
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 1.0000
current smoker 7.3e-10
lifelong non-smoker 1.0000
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1.0000
lifelong non-smoker 1.0000
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 1.2e-05
P value adjustment method: bonferroni
[1] "RAV220"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 0.14475
current reformed smoker, duration not specified 1.00000
current smoker 0.30075
lifelong non-smoker 0.07673
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 1.00000
current smoker 0.00042
lifelong non-smoker 1.00000
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1.00000
lifelong non-smoker 1.00000
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 0.00069
P value adjustment method: bonferroni
[1] "RAV435"
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: luad_sampleScore[, i] and luad_factorTb[, "patient.tobacco_smoking_history"]
current reformed smoker for < or = 15 years
current reformed smoker for > 15 years 1
current reformed smoker, duration not specified 1
current smoker 1
lifelong non-smoker 1
current reformed smoker for > 15 years
current reformed smoker for > 15 years -
current reformed smoker, duration not specified 1
current smoker 1
lifelong non-smoker 1
current reformed smoker, duration not specified
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker 1
lifelong non-smoker 1
current smoker
current reformed smoker for > 15 years -
current reformed smoker, duration not specified -
current smoker -
lifelong non-smoker 1
P value adjustment method: bonferroni